diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a5250c1ffc74..dd85ef2a5d17 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -34,53 +34,55 @@ jobs:
Build:
strategy:
matrix:
- os: [windows-latest, macOS-latest]
+ os: [windows-2016, macOS-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
- - uses: actions/setup-python@v2
- - name: Lint Python
- if: matrix.os == 'macOS-latest'
- run: |
- pip install flake8
- flake8 . --count --select=E9,F63,F7 --show-source --statistics
- name: Initialize submodules
run: git submodule update --recursive --init
-
- - name: Make Build Directory
- run: cmake -E make_directory build.common
-
- # configuration for Windows
- - name: CMake@Win
- if: matrix.os == 'windows-latest'
- working-directory: build.common
+ - name: Lint Python
+ if: startsWith(matrix.os, 'macOS')
+ run: |
+ python3 -m pip install flake8
+ python3 -m flake8 . --count --select=E9,F63,F7 --show-source --statistics
+ - uses: actions/cache@v1
+ env:
+ CACHE_NUMBER: 0
+ with:
+ path: ~/conda_pkgs_dir
+ key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('conda/build-environment.yaml') }}
+ - uses: conda-incubator/setup-miniconda@v2
+ with:
+ activate-environment: tvm-build
+ channel-priority: strict
+ environment-file: conda/build-environment.yaml
+ auto-activate-base: false
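+      # Required for the actions/cache step above: cached conda packages must remain .tar.bz2 archives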
+ use-only-tar-bz2: true
+ - name: Conda info
+ run: |
+ conda info
+ conda list
+ - name: Conda-Build@Win
+ if: startsWith(matrix.os, 'windows')
+ shell: cmd /C call {0}
run: >-
- cmake
- -DUSE_SORT=ON
- -DUSE_RPC=ON
- -DUSE_GRAPH_RUNTIME=ON
- -DCMAKE_BUILD_TYPE=Release
- -DCMAKE_CONFIGURATION_TYPES="Release"
- ..
-
- # configuration for Mac
- - name: CMake@MacOS
- if: matrix.os == 'macOS-latest'
- working-directory: build.common
+ conda build --output-folder=conda/pkg conda/recipe &&
+ conda install tvm -c ./conda/pkg
+ - name: Conda-Build@MacOS
+ if: startsWith(matrix.os, 'macOS')
+ shell: bash -l {0}
run: >-
- cmake
- "-DUSE_SORT=ON"
- "-DUSE_RPC=ON"
- "-DUSE_GRAPH_RUNTIME=ON"
- "-DUSE_METAL=ON"
- ..
-
- - name: Build@Win
- if: matrix.os == 'windows-latest'
- run: cmake --build build.common --config Release -- /m
-
- - name: Build@MacOS
- if: matrix.os == 'macOS-latest'
- run: cmake --build build.common --config Release -j3
+ conda build --output-folder=conda/pkg conda/recipe &&
+ conda install tvm -c ./conda/pkg
+ - name: Test@Win
+ if: startsWith(matrix.os, 'windows')
+ shell: cmd /C call {0}
+ run: >-
+ python -m pytest -v tests/python/all-platform-minimal-test
+ - name: Test@MacOS
+ if: startsWith(matrix.os, 'macOS')
+ shell: bash -l {0}
+ run: >-
+ python -m pytest -v tests/python/all-platform-minimal-test
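The conda package cache above is keyed on `hashFiles('conda/build-environment.yaml')`, so any change to the environment file yields a new key and a clean cache, while bumping `CACHE_NUMBER` forces a refresh manually. A minimal Python sketch of the same content-keyed scheme (the `cache_key` helper is illustrative only, not part of the workflow):

```python
import hashlib

def cache_key(runner_os: str, cache_number: int, env_file: str) -> str:
    """Mimic the workflow's key: OS name + manual counter + env-file digest."""
    with open(env_file, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    return f"{runner_os}-conda-{cache_number}-{digest}"

# Editing conda/build-environment.yaml changes the digest and invalidates the
# cache; bumping CACHE_NUMBER does the same without touching the file.
print(cache_key("Windows", 0, "conda/build-environment.yaml"))
```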
diff --git a/.gitignore b/.gitignore
index 77c593ca2ab8..cdcf6780a3f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,7 +24,7 @@ var/
*.egg-info/
.installed.cfg
*.egg
-
+.conda/
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
index 87ce9acfae55..12fb486a491b 160000
--- a/3rdparty/vta-hw
+++ b/3rdparty/vta-hw
@@ -1 +1 @@
-Subproject commit 87ce9acfae550d1a487746e9d06c2e250076e54c
+Subproject commit 12fb486a491b75d70ec4c5e0a0cd112ab49a95bc
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e24bbeb5acd8..8fe416e9de93 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,13 +2,13 @@ cmake_minimum_required(VERSION 3.2)
project(tvm C CXX)
# Utility functions
-include(cmake/util/Util.cmake)
-include(cmake/util/FindCUDA.cmake)
-include(cmake/util/FindOpenCL.cmake)
-include(cmake/util/FindVulkan.cmake)
-include(cmake/util/FindLLVM.cmake)
-include(cmake/util/FindROCM.cmake)
-include(cmake/util/FindEthosN.cmake)
+include(cmake/utils/Utils.cmake)
+include(cmake/utils/FindCUDA.cmake)
+include(cmake/utils/FindOpenCL.cmake)
+include(cmake/utils/FindVulkan.cmake)
+include(cmake/utils/FindLLVM.cmake)
+include(cmake/utils/FindROCM.cmake)
+include(cmake/utils/FindEthosN.cmake)
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake)
@@ -79,6 +79,10 @@ tvm_option(USE_COREML "Build with coreml support" OFF)
tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF)
tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF)
tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF)
+tvm_option(USE_TENSORRT_CODEGEN "Build with TensorRT Codegen support" OFF)
+tvm_option(USE_TENSORRT_RUNTIME "Build with TensorRT runtime" OFF)
+tvm_option(USE_RUST_EXT "Build with Rust based compiler extensions, STATIC, DYNAMIC, or OFF" OFF)
+tvm_option(USE_VITIS_AI "Build with VITIS-AI Codegen support" OFF)
# include directories
include_directories(${CMAKE_INCLUDE_PATH})
@@ -100,6 +104,8 @@ if(MSVC)
add_definitions(-D_SCL_SECURE_NO_WARNINGS)
add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE)
add_definitions(-DNOMINMAX)
+ # regeneration does not work well with msbuild custom rules.
+ set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /bigobj")
@@ -361,8 +367,12 @@ include(cmake/modules/contrib/TF_TVMDSOOP.cmake)
include(cmake/modules/contrib/CoreML.cmake)
include(cmake/modules/contrib/ONNX.cmake)
include(cmake/modules/contrib/ArmComputeLib.cmake)
+include(cmake/modules/contrib/TensorRT.cmake)
+include(cmake/modules/contrib/VitisAI.cmake)
+include(cmake/modules/contrib/Verilator.cmake)
include(cmake/modules/Git.cmake)
include(cmake/modules/LibInfo.cmake)
+include(cmake/modules/RustExt.cmake)
include(CheckCXXCompilerFlag)
if(NOT MSVC)
@@ -400,23 +410,23 @@ endif()
if(USE_RELAY_DEBUG)
message(STATUS "Building Relay in debug mode...")
- set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "USE_RELAY_DEBUG")
- set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "DMLC_LOG_DEBUG")
+ target_compile_definitions(tvm_objs PRIVATE "USE_RELAY_DEBUG")
+ target_compile_definitions(tvm_objs PRIVATE "DMLC_LOG_DEBUG")
else()
- set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "NDEBUG")
+ target_compile_definitions(tvm_objs PRIVATE "NDEBUG")
endif(USE_RELAY_DEBUG)
if(USE_FALLBACK_STL_MAP)
message(STATUS "Building with STL Map...")
- set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "USE_FALLBACK_STL_MAP=1")
+ target_compile_definitions(tvm_objs PRIVATE "USE_FALLBACK_STL_MAP=1")
else()
message(STATUS "Building with TVM Map...")
- set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "USE_FALLBACK_STL_MAP=0")
+ target_compile_definitions(tvm_objs PRIVATE "USE_FALLBACK_STL_MAP=0")
endif(USE_FALLBACK_STL_MAP)
if(BUILD_FOR_HEXAGON)
# Wrap pthread_create to allow setting custom stack size.
- set_target_properties(tvm_runtime PROPERTIES LINK_FLAGS
+ set_property(TARGET tvm_runtime APPEND PROPERTY LINK_FLAGS
"-Wl,--wrap=pthread_create")
target_include_directories(tvm_runtime SYSTEM
@@ -483,7 +493,7 @@ if(GTEST_INCLUDE_DIR AND GTEST_LIB)
add_executable(${__execname} ${__srcpath})
list(APPEND TEST_EXECS ${__execname})
target_include_directories(${__execname} SYSTEM PUBLIC ${GTEST_INCLUDE_DIR})
- target_link_libraries(${__execname} ${TVM_TEST_LIBRARY_NAME} ${GTEST_LIB} pthread dl)
+ target_link_libraries(${__execname} PRIVATE ${TVM_TEST_LIBRARY_NAME} ${GTEST_LIB} pthread dl)
set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_ALL 1)
set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1)
endforeach()
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 9b2faf78d8bc..650d1bc40e6d 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -40,6 +40,8 @@ We add tag along with committer name to show areas that they are familiar with.
We do encourage everyone to work anything they are interested in.
- [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm
+- [Matthew Barrett](https://github.com/mbaret): @mbaret - byoc, arm
+- [Matthew Brookhart](https://github.com/mbrookhart): @mbrookhart - relay, frontends
- [Tianqi Chen](https://github.com/tqchen) (PPMC): @tqchen - topi, compiler, relay, docs
- [Liangfu Chen](https://github.com/liangfu): @liangfu - vta, chisel, intel FPGA, c runtime
- [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm
@@ -59,6 +61,7 @@ We do encourage everyone to work anything they are interested in.
- [Jared Roesch](https://github.com/jroesch) (PPMC): @jroesch - relay
- [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends
- [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang
+- [Junru Shao](https://github.com/junrushao1994) @junrushao1994 - relay, compiler
- [Haichen Shen](https://github.com/icemelon9) (PPMC): @icemelon9 - relay, topi
- [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web
- [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch - topi, compiler, runtime
@@ -136,7 +139,7 @@ We do encourage everyone to work anything they are interested in.
- [Lianmin Zheng](https://github.com/merrymercy): @merrymercy
## List of Contributors
-- [Full List of Contributors](https://github.com/apache/incubator-tvm/graphs/contributors)
+- [Full List of Contributors](https://github.com/apache/tvm/graphs/contributors)
- To contributors: please add your name to the list.
- [Qiao Zhang](https://github.com/zhangqiaorjc)
- [Haolong Zhang](https://github.com/haolongzhangm)
diff --git a/Jenkinsfile b/Jenkinsfile
index 207d12c21d6d..feea8c2f9489 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,11 +45,12 @@
// NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
ci_lint = "tlcpack/ci-lint:v0.62"
-ci_gpu = "tlcpack/ci-gpu:v0.64"
-ci_cpu = "tlcpack/ci-cpu:v0.66"
-ci_wasm = "tlcpack/ci-wasm:v0.60"
-ci_i386 = "tlcpack/ci-i386:v0.52"
+ci_gpu = "tlcpack/ci-gpu:v0.72"
+ci_cpu = "tlcpack/ci-cpu:v0.71"
+ci_wasm = "tlcpack/ci-wasm:v0.70"
+ci_i386 = "tlcpack/ci-i386:v0.71"
ci_qemu = "tlcpack/ci-qemu:v0.01"
+ci_arm = "tlcpack/ci-arm:v0.01"
// <--- End of regex-scanned config.
// tvm libraries
@@ -180,11 +181,12 @@ stage('Build') {
make(ci_cpu, 'build', '-j2')
pack_lib('cpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_python_setup.sh"
sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_unittest.sh"
sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh"
sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_fsim.sh"
sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh"
- sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
+ // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
sh "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh"
}
}
@@ -197,6 +199,7 @@ stage('Build') {
sh "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh"
make(ci_wasm, 'build', '-j2')
timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} ${ci_wasm} ./tests/scripts/task_ci_python_setup.sh"
sh "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh"
}
}
@@ -212,6 +215,16 @@ stage('Build') {
}
}
},
+ // 'BUILD : arm': {
+ // node('ARM') {
+ // ws(per_exec_ws("tvm/build-arm")) {
+ // init_git()
+ // sh "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh"
+ // make(ci_arm, 'build', '-j4')
+ // pack_lib('arm', tvm_multilib)
+ // }
+ // }
+ // },
'BUILD: QEMU': {
node('CPU') {
ws(per_exec_ws("tvm/build-qemu")) {
@@ -219,6 +232,7 @@ stage('Build') {
sh "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh"
make(ci_qemu, 'build', '-j2')
timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} ${ci_qemu} ./tests/scripts/task_ci_python_setup.sh"
sh "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh"
}
}
@@ -233,6 +247,7 @@ stage('Unit Test') {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
sh "${docker_run} ${ci_gpu} ./tests/scripts/task_sphinx_precheck.sh"
sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh"
sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh"
@@ -246,6 +261,7 @@ stage('Unit Test') {
init_git()
unpack_lib('i386', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} ${ci_i386} ./tests/scripts/task_ci_python_setup.sh"
sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_unittest.sh"
sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration.sh"
sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_vta_fsim.sh"
@@ -253,12 +269,26 @@ stage('Unit Test') {
}
}
},
+ // 'python3: arm': {
+ // node('ARM') {
+ // ws(per_exec_ws("tvm/ut-python-arm")) {
+ // init_git()
+ // unpack_lib('arm', tvm_multilib)
+ // timeout(time: max_time, unit: 'MINUTES') {
+ // sh "${docker_run} ${ci_arm} ./tests/scripts/task_ci_python_setup.sh"
+ // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_unittest.sh"
+ // // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh"
+ // }
+ // }
+ // }
+ // },
'java: GPU': {
node('GPU') {
ws(per_exec_ws("tvm/ut-java")) {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
sh "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh"
}
}
@@ -273,6 +303,7 @@ stage('Integration Test') {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh"
}
}
@@ -284,6 +315,7 @@ stage('Integration Test') {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
+ sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh"
}
}
@@ -295,6 +327,7 @@ stage('Integration Test') {
init_git()
unpack_lib('cpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
+          sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_python_setup.sh"
sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh"
}
}
@@ -307,12 +340,12 @@ stage('Integration Test') {
// init_git()
// unpack_lib('gpu', tvm_multilib)
// timeout(time: max_time, unit: 'MINUTES') {
+ // sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
// sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh"
// }
// pack_lib('mydocs', 'docs.tgz')
// }
// }
- // }
}
/*
diff --git a/README.md b/README.md
index 6c82b1585c45..b3a3e850adb2 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
-
Open Deep Learning Compiler Stack
+
Open Deep Learning Compiler Stack
==============================================
[Documentation](https://tvm.apache.org/docs) |
[Contributors](CONTRIBUTORS.md) |
@@ -23,7 +23,7 @@
[Release Notes](NEWS.md)
[](https://ci.tlcpack.ai/job/tvm/job/main/)
-[](https://github.com/apache/incubator-tvm/actions?query=workflow%3AWinMacBuild)
+[](https://github.com/apache/tvm/actions?query=workflow%3AWinMacBuild)
Apache TVM (incubating) is a compiler stack for deep learning systems. It is designed to close the gap between the
productivity-focused deep learning frameworks, and the performance- and efficiency-focused hardware backends.
diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h
index bc10bdaa508c..5f3db04274a1 100644
--- a/apps/android_camera/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h
@@ -40,7 +40,7 @@
#include "../src/runtime/c_runtime_api.cc"
#include "../src/runtime/cpu_device_api.cc"
#include "../src/runtime/dso_library.cc"
-#include "../src/runtime/file_util.cc"
+#include "../src/runtime/file_utils.cc"
#include "../src/runtime/graph/graph_runtime.cc"
#include "../src/runtime/library_module.cc"
#include "../src/runtime/module.cc"
diff --git a/apps/android_camera/models/prepare_model.py b/apps/android_camera/models/prepare_model.py
index 19be368c97e9..ab20e028c2ad 100644
--- a/apps/android_camera/models/prepare_model.py
+++ b/apps/android_camera/models/prepare_model.py
@@ -25,7 +25,7 @@
import tvm
import tvm.relay as relay
-from tvm.contrib import util, ndk, graph_runtime as runtime
+from tvm.contrib import utils, ndk, graph_runtime as runtime
from tvm.contrib.download import download_testdata, download
target = "llvm -mtriple=arm64-linux-android"
diff --git a/apps/android_deploy/README.md b/apps/android_deploy/README.md
index d5efba88b901..32e601840f04 100644
--- a/apps/android_deploy/README.md
+++ b/apps/android_deploy/README.md
@@ -34,7 +34,7 @@ Alternatively, you may execute Docker image we provide which contains the requir
### Build APK
-Before you build the Android application, please refer to [TVM4J Installation Guide](https://github.com/apache/incubator-tvm/blob/main/jvm/README.md) and install tvm4j-core to your local maven repository. You can find tvm4j dependency declare in `app/build.gradle`. Modify it if it is necessary.
+Before you build the Android application, please refer to the [TVM4J Installation Guide](https://github.com/apache/tvm/blob/main/jvm/README.md) and install tvm4j-core to your local maven repository. You can find the tvm4j dependency declared in `app/build.gradle`. Modify it if necessary.
```
dependencies {
@@ -124,7 +124,7 @@ If everything goes well, you will find compile tools in `/opt/android-toolchain-
Follow instruction to get compiled version model for android target [here.](https://tvm.apache.org/docs/deploy/android.html)
-Copied these compiled model deploy_lib.so, deploy_graph.json and deploy_param.params to apps/android_deploy/app/src/main/assets/ and modify TVM flavor changes on [java](https://github.com/apache/incubator-tvm/blob/main/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java#L81)
+Copy the compiled model files deploy_lib.so, deploy_graph.json and deploy_param.params to apps/android_deploy/app/src/main/assets/ and modify the TVM flavor settings in [java](https://github.com/apache/tvm/blob/main/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java#L81)
`CPU Verison flavor`
```
diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
index f1a47a674281..362d278c38c4 100644
--- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
@@ -28,7 +28,7 @@
#include "../src/runtime/c_runtime_api.cc"
#include "../src/runtime/cpu_device_api.cc"
#include "../src/runtime/dso_library.cc"
-#include "../src/runtime/file_util.cc"
+#include "../src/runtime/file_utils.cc"
#include "../src/runtime/graph/graph_runtime.cc"
#include "../src/runtime/library_module.cc"
#include "../src/runtime/module.cc"
diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
index 29962d329165..c5e21ecbbc12 100644
--- a/apps/android_rpc/README.md
+++ b/apps/android_rpc/README.md
@@ -28,7 +28,7 @@ You will need JDK, [Android NDK](https://developer.android.com/ndk) and an Andro
We use [Gradle](https://gradle.org) to build. Please follow [the installation instruction](https://gradle.org/install) for your operating system.
-Before you build the Android application, please refer to [TVM4J Installation Guide](https://github.com/apache/incubator-tvm/blob/main/jvm/README.md) and install tvm4j-core to your local maven repository. You can find tvm4j dependency declare in `app/build.gradle`. Modify it if it is necessary.
+Before you build the Android application, please refer to the [TVM4J Installation Guide](https://github.com/apache/tvm/blob/main/jvm/README.md) and install tvm4j-core to your local maven repository. You can find the tvm4j dependency declared in `app/build.gradle`. Modify it if necessary.
```
dependencies {
@@ -146,7 +146,7 @@ android 1 1 0
```
-Then checkout [android\_rpc/tests/android\_rpc\_test.py](https://github.com/apache/incubator-tvm/blob/main/apps/android_rpc/tests/android_rpc_test.py) and run,
+Then check out [android\_rpc/tests/android\_rpc\_test.py](https://github.com/apache/tvm/blob/main/apps/android_rpc/tests/android_rpc_test.py) and run:
```bash
# Specify the RPC tracker
@@ -157,7 +157,7 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++
python android_rpc_test.py
```
-This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify compiled TVM IR shared libraries on OpenCL target set `'test_opencl = True'` and on Vulkan target set `'test_vulkan = True'` in [tests/android_rpc_test.py](https://github.com/apache/incubator-tvm/blob/main/apps/android_rpc/tests/android_rpc_test.py), by default on CPU target will execute.
+This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify the compiled TVM IR shared libraries on the OpenCL target, set `'test_opencl = True'`, and on the Vulkan target, set `'test_vulkan = True'` in [tests/android_rpc_test.py](https://github.com/apache/tvm/blob/main/apps/android_rpc/tests/android_rpc_test.py); by default it executes on the CPU target.
On my test device, it gives following results.
```bash
diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index aea61e757aa7..2005568c608c 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -40,7 +40,7 @@
#include "../src/runtime/c_runtime_api.cc"
#include "../src/runtime/cpu_device_api.cc"
#include "../src/runtime/dso_library.cc"
-#include "../src/runtime/file_util.cc"
+#include "../src/runtime/file_utils.cc"
#include "../src/runtime/graph/graph_runtime.cc"
#include "../src/runtime/graph/graph_runtime_factory.cc"
#include "../src/runtime/library_module.cc"
diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py
index 2827c140ea92..9586bffeca0b 100644
--- a/apps/android_rpc/tests/android_rpc_test.py
+++ b/apps/android_rpc/tests/android_rpc_test.py
@@ -25,7 +25,7 @@
from tvm import te
import os
from tvm import rpc
-from tvm.contrib import util, ndk
+from tvm.contrib import utils, ndk
import numpy as np
# Set to be address of tvm proxy.
@@ -50,7 +50,7 @@ def test_rpc_module():
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
a_np = np.random.uniform(size=1024).astype(A.dtype)
- temp = util.tempdir()
+ temp = utils.tempdir()
# Establish remote connection with target hardware
tracker = rpc.connect_tracker(tracker_host, tracker_port)
diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index 920033f755ea..43d93d9e00fa 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -20,7 +20,7 @@
## Results
-See results on wiki page https://github.com/apache/incubator-tvm/wiki/Benchmark
+See results on wiki page https://github.com/apache/tvm/wiki/Benchmark
## How to Reproduce
@@ -78,7 +78,7 @@ python3 -m tvm.exec.rpc_tracker
`python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker.
* For Android device
- * Build and install tvm RPC apk on your device [Help](https://github.com/apache/incubator-tvm/tree/main/apps/android_rpc).
+ * Build and install tvm RPC apk on your device [Help](https://github.com/apache/tvm/tree/main/apps/android_rpc).
Make sure you can pass the android rpc test. Then you have alreadly known how to register.
3. Verify the device registration
diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
index fb58819d3c5c..e7233370e6d6 100644
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -23,7 +23,7 @@
import tvm
from tvm import te
-from tvm.contrib.util import tempdir
+from tvm.contrib.utils import tempdir
import tvm.contrib.graph_runtime as runtime
from tvm import relay
diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py
index b57f6028ab73..cf78c66141d0 100644
--- a/apps/benchmark/mobile_gpu_imagenet_bench.py
+++ b/apps/benchmark/mobile_gpu_imagenet_bench.py
@@ -23,7 +23,7 @@
import tvm
from tvm import te
-from tvm.contrib.util import tempdir
+from tvm.contrib.utils import tempdir
import tvm.contrib.graph_runtime as runtime
from tvm import relay
diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc
index 8e294a05775d..3224028b60a1 100644
--- a/apps/bundle_deploy/runtime.cc
+++ b/apps/bundle_deploy/runtime.cc
@@ -24,7 +24,7 @@
#include "../../src/runtime/c_runtime_api.cc"
#include "../../src/runtime/cpu_device_api.cc"
-#include "../../src/runtime/file_util.cc"
+#include "../../src/runtime/file_utils.cc"
#include "../../src/runtime/graph/graph_runtime.cc"
#include "../../src/runtime/library_module.cc"
#include "../../src/runtime/module.cc"
diff --git a/apps/cpp_rpc/main.cc b/apps/cpp_rpc/main.cc
index 777fffa7d37c..e381dd2b261b 100644
--- a/apps/cpp_rpc/main.cc
+++ b/apps/cpp_rpc/main.cc
@@ -35,7 +35,7 @@
#include
#include "../../src/support/socket.h"
-#include "../../src/support/util.h"
+#include "../../src/support/utils.h"
#include "rpc_server.h"
#if defined(_WIN32)
@@ -139,7 +139,7 @@ string GetCmdOption(int argc, char* argv[], string option, bool key = false) {
return cmd;
}
// We assume "=" is the end of option.
- CHECK_EQ(*option.rbegin(), '=');
+ ICHECK_EQ(*option.rbegin(), '=');
cmd = arg.substr(arg.find('=') + 1);
return cmd;
}
diff --git a/apps/cpp_rpc/rpc_env.cc b/apps/cpp_rpc/rpc_env.cc
index c64cb2f09f94..5b351725b1f1 100644
--- a/apps/cpp_rpc/rpc_env.cc
+++ b/apps/cpp_rpc/rpc_env.cc
@@ -40,8 +40,7 @@ int mkdir(const char* path, int /* ignored */) { return _mkdir(path); }
#include
#include
-#include "../../src/runtime/file_util.h"
-#include "../../src/support/util.h"
+#include "../../src/support/utils.h"
#include "rpc_env.h"
namespace {
@@ -115,7 +114,15 @@ RPCEnv::RPCEnv() {
std::string file_name = this->GetPath(args[0]);
file_name = BuildSharedLibrary(file_name);
std::string bin;
- LoadBinaryFromFile(file_name, &bin);
+
+ std::ifstream fs(file_name, std::ios::in | std::ios::binary);
+ ICHECK(!fs.fail()) << "Cannot open " << file_name;
+ fs.seekg(0, std::ios::end);
+  size_t size = static_cast<size_t>(fs.tellg());
+ fs.seekg(0, std::ios::beg);
+ bin.resize(size);
+ fs.read(dmlc::BeginPtr(bin), size);
+
TVMByteArray binarr;
binarr.data = bin.data();
binarr.size = bin.length();
diff --git a/apps/cpp_rpc/rpc_server.cc b/apps/cpp_rpc/rpc_server.cc
index 592a6db6d2ef..16939456451b 100644
--- a/apps/cpp_rpc/rpc_server.cc
+++ b/apps/cpp_rpc/rpc_server.cc
@@ -245,7 +245,7 @@ class RPCServer {
support::TCPSocket conn = listen_sock_.Accept(addr);
int code = kRPCMagic;
- CHECK_EQ(conn.RecvAll(&code, sizeof(code)), sizeof(code));
+ ICHECK_EQ(conn.RecvAll(&code, sizeof(code)), sizeof(code));
if (code != kRPCMagic) {
conn.Close();
LOG(FATAL) << "Client connected is not TVM RPC server";
@@ -253,7 +253,7 @@ class RPCServer {
}
int keylen = 0;
- CHECK_EQ(conn.RecvAll(&keylen, sizeof(keylen)), sizeof(keylen));
+ ICHECK_EQ(conn.RecvAll(&keylen, sizeof(keylen)), sizeof(keylen));
const char* CLIENT_HEADER = "client:";
const char* SERVER_HEADER = "server:";
@@ -265,10 +265,10 @@ class RPCServer {
continue;
}
- CHECK_NE(keylen, 0);
+ ICHECK_NE(keylen, 0);
std::string remote_key;
remote_key.resize(keylen);
- CHECK_EQ(conn.RecvAll(&remote_key[0], keylen), keylen);
+ ICHECK_EQ(conn.RecvAll(&remote_key[0], keylen), keylen);
std::stringstream ssin(remote_key);
std::string arg0;
@@ -280,16 +280,16 @@ class RPCServer {
if (arg0 != expect_header) {
code = kRPCMismatch;
- CHECK_EQ(conn.SendAll(&code, sizeof(code)), sizeof(code));
+ ICHECK_EQ(conn.SendAll(&code, sizeof(code)), sizeof(code));
conn.Close();
LOG(WARNING) << "Mismatch key from" << addr->AsString();
continue;
} else {
code = kRPCSuccess;
- CHECK_EQ(conn.SendAll(&code, sizeof(code)), sizeof(code));
+ ICHECK_EQ(conn.SendAll(&code, sizeof(code)), sizeof(code));
keylen = int(server_key.length());
- CHECK_EQ(conn.SendAll(&keylen, sizeof(keylen)), sizeof(keylen));
- CHECK_EQ(conn.SendAll(server_key.c_str(), keylen), keylen);
+ ICHECK_EQ(conn.SendAll(&keylen, sizeof(keylen)), sizeof(keylen));
+ ICHECK_EQ(conn.SendAll(server_key.c_str(), keylen), keylen);
LOG(INFO) << "Connection success " << addr->AsString();
#ifndef __ANDROID__
ssin >> *opts;
@@ -325,7 +325,7 @@ class RPCServer {
size_t pos = opts.rfind(option);
if (pos != std::string::npos) {
const std::string cmd = opts.substr(pos + option.size());
- CHECK(support::IsNumber(cmd)) << "Timeout is not valid";
+ ICHECK(support::IsNumber(cmd)) << "Timeout is not valid";
return std::stoi(cmd);
}
return 0;
diff --git a/apps/cpp_rpc/rpc_tracker_client.h b/apps/cpp_rpc/rpc_tracker_client.h
index cdfb64780ba6..1497ab3251be 100644
--- a/apps/cpp_rpc/rpc_tracker_client.h
+++ b/apps/cpp_rpc/rpc_tracker_client.h
@@ -74,9 +74,9 @@ class TrackerClient {
tracker_sock_ = ConnectWithRetry();
int code = kRPCTrackerMagic;
- CHECK_EQ(tracker_sock_.SendAll(&code, sizeof(code)), sizeof(code));
- CHECK_EQ(tracker_sock_.RecvAll(&code, sizeof(code)), sizeof(code));
- CHECK_EQ(code, kRPCTrackerMagic) << tracker_addr_.c_str() << " is not RPC Tracker";
+ ICHECK_EQ(tracker_sock_.SendAll(&code, sizeof(code)), sizeof(code));
+ ICHECK_EQ(tracker_sock_.RecvAll(&code, sizeof(code)), sizeof(code));
+ ICHECK_EQ(code, kRPCTrackerMagic) << tracker_addr_.c_str() << " is not RPC Tracker";
std::ostringstream ss;
ss << "[" << static_cast<int>(TrackerCode::kUpdateInfo) << ", {\"key\": \"server:" << key_
@@ -85,7 +85,7 @@ class TrackerClient {
// Receive status and validate
std::string remote_status = tracker_sock_.RecvBytes();
- CHECK_EQ(std::stoi(remote_status), static_cast<int>(TrackerCode::kSuccess));
+ ICHECK_EQ(std::stoi(remote_status), static_cast<int>(TrackerCode::kSuccess));
}
}
/*!
@@ -117,7 +117,7 @@ class TrackerClient {
// Receive status and validate
std::string remote_status = tracker_sock_.RecvBytes();
- CHECK_EQ(std::stoi(remote_status), static_cast<int>(TrackerCode::kSuccess));
+ ICHECK_EQ(std::stoi(remote_status), static_cast<int>(TrackerCode::kSuccess));
} else {
*matchkey = key_;
}
@@ -167,7 +167,7 @@ class TrackerClient {
tracker_sock_.SendBytes(ss.str());
std::string remote_status = tracker_sock_.RecvBytes();
- CHECK_EQ(std::stoi(remote_status), static_cast<int>(TrackerCode::kSuccess));
+ ICHECK_EQ(std::stoi(remote_status), static_cast<int>(TrackerCode::kSuccess));
unmatch_period_count = 0;
}
continue;
@@ -199,7 +199,7 @@ class TrackerClient {
auto period = (std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::system_clock::now() - tbegin))
.count();
- CHECK(period < timeout) << "Failed to connect to server" << addr.AsString();
+ ICHECK(period < timeout) << "Failed to connect to server " << addr.AsString();
LOG(WARNING) << "Cannot connect to tracker " << addr.AsString() << " retry in "
<< retry_period << " seconds.";
std::this_thread::sleep_for(std::chrono::seconds(retry_period));
diff --git a/apps/cpp_rpc/win32_process.h b/apps/cpp_rpc/win32_process.h
index 621444e18764..0f784681f209 100644
--- a/apps/cpp_rpc/win32_process.h
+++ b/apps/cpp_rpc/win32_process.h
@@ -23,8 +23,12 @@
*/
#ifndef TVM_APPS_CPP_RPC_WIN32_PROCESS_H_
#define TVM_APPS_CPP_RPC_WIN32_PROCESS_H_
+
#include <chrono>
#include <string>
+
+#include "../../src/support/socket.h"
+
namespace tvm {
namespace runtime {
/*!
@@ -41,4 +45,4 @@ void SpawnRPCChild(SOCKET fd, std::chrono::seconds timeout);
void ChildProcSocketHandler(const std::string& mmap_path);
} // namespace runtime
} // namespace tvm
-#endif // TVM_APPS_CPP_RPC_WIN32_PROCESS_H_
\ No newline at end of file
+#endif // TVM_APPS_CPP_RPC_WIN32_PROCESS_H_
diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc
index 87cb69b4f4ce..be431bab68d1 100644
--- a/apps/extension/src/tvm_ext.cc
+++ b/apps/extension/src/tvm_ext.cc
@@ -75,12 +75,12 @@ class NDSubClass : public tvm::runtime::NDArray {
NDSubClass AddWith(const NDSubClass& other) const {
SubContainer* a = static_cast<SubContainer*>(get_mutable());
SubContainer* b = static_cast<SubContainer*>(other.get_mutable());
- CHECK(a != nullptr && b != nullptr);
+ ICHECK(a != nullptr && b != nullptr);
return NDSubClass(a->additional_info_ + b->additional_info_);
}
int get_additional_info() const {
SubContainer* self = static_cast<SubContainer*>(get_mutable());
- CHECK(self != nullptr);
+ ICHECK(self != nullptr);
return self->additional_info_;
}
using ContainerType = SubContainer;
@@ -146,7 +146,7 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev").set_body([](TVMArgs args, TVMRetValue*
TVM_REGISTER_GLOBAL("tvm_ext.nd_create").set_body([](TVMArgs args, TVMRetValue* rv) {
int additional_info = args[0];
*rv = NDSubClass(additional_info);
- CHECK_EQ(rv->type_code(), kTVMNDArrayHandle);
+ ICHECK_EQ(rv->type_code(), kTVMNDArrayHandle);
});
TVM_REGISTER_GLOBAL("tvm_ext.nd_add_two").set_body([](TVMArgs args, TVMRetValue* rv) {
diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc
index fdb55a51480a..829241d31a6d 100644
--- a/apps/howto_deploy/cpp_deploy.cc
+++ b/apps/howto_deploy/cpp_deploy.cc
@@ -31,7 +31,7 @@
void Verify(tvm::runtime::Module mod, std::string fname) {
// Get the function from the module.
tvm::runtime::PackedFunc f = mod.GetFunction(fname);
- CHECK(f != nullptr);
+ ICHECK(f != nullptr);
// Allocate the DLPack data structures.
//
// Note that we use TVM runtime API to allocate the DLTensor in this example.
@@ -64,7 +64,7 @@ void Verify(tvm::runtime::Module mod, std::string fname) {
f(x, y);
// Print out the output
for (int i = 0; i < shape[0]; ++i) {
- CHECK_EQ(static_cast<float*>(y->data)[i], i + 1.0f);
+ ICHECK_EQ(static_cast<float*>(y->data)[i], i + 1.0f);
}
LOG(INFO) << "Finish verification...";
TVMArrayFree(x);
@@ -112,7 +112,7 @@ void DeployGraphRuntime() {
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 2; ++j) {
- CHECK_EQ(static_cast<float*>(y->data)[i * 2 + j], i * 2 + j + 1);
+ ICHECK_EQ(static_cast<float*>(y->data)[i * 2 + j], i * 2 + j + 1);
}
}
}
diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc
index b43f920b6056..d6dd5876a994 100644
--- a/apps/howto_deploy/tvm_runtime_pack.cc
+++ b/apps/howto_deploy/tvm_runtime_pack.cc
@@ -39,7 +39,7 @@
*/
#include "../../src/runtime/c_runtime_api.cc"
#include "../../src/runtime/cpu_device_api.cc"
-#include "../../src/runtime/file_util.cc"
+#include "../../src/runtime/file_utils.cc"
#include "../../src/runtime/library_module.cc"
#include "../../src/runtime/module.cc"
#include "../../src/runtime/ndarray.cc"
diff --git a/apps/ios_rpc/tests/ios_rpc_mobilenet.py b/apps/ios_rpc/tests/ios_rpc_mobilenet.py
index 132377ac4412..90ac6bfb9218 100644
--- a/apps/ios_rpc/tests/ios_rpc_mobilenet.py
+++ b/apps/ios_rpc/tests/ios_rpc_mobilenet.py
@@ -22,7 +22,7 @@
from tvm.relay import transform
from tvm.relay.op.annotation import compiler_begin, compiler_end
from tvm.relay.quantize.quantize import prerequisite_optimize
-from tvm.contrib import util, xcode, graph_runtime, coreml_runtime
+from tvm.contrib import utils, xcode, graph_runtime, coreml_runtime
from tvm.contrib.target import coreml as _coreml
import os
@@ -98,7 +98,7 @@ def get_model(model_name, data_shape):
def test_mobilenet():
- temp = util.tempdir()
+ temp = utils.tempdir()
image, synset = prepare_input()
model, params = get_model("mobilenetv2_1.0", image.shape)
diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py
index 620fe493771f..a967c2f75e61 100644
--- a/apps/ios_rpc/tests/ios_rpc_test.py
+++ b/apps/ios_rpc/tests/ios_rpc_test.py
@@ -26,7 +26,7 @@
import re
import sys
from tvm import rpc
-from tvm.contrib import util, xcode
+from tvm.contrib import utils, xcode
import numpy as np
# Set to be address of tvm proxy.
@@ -59,7 +59,7 @@ def test_rpc_module():
n = tvm.runtime.convert(1024)
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
- temp = util.tempdir()
+ temp = utils.tempdir()
s = te.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=64)
s[B].bind(xi, te.thread_axis("threadIdx.x"))
diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm
index 9e2899bf6e5e..fbe4850e1b57 100644
--- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm
+++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm
@@ -25,7 +25,7 @@
#include "../../../src/runtime/c_runtime_api.cc"
#include "../../../src/runtime/cpu_device_api.cc"
#include "../../../src/runtime/dso_library.cc"
-#include "../../../src/runtime/file_util.cc"
+#include "../../../src/runtime/file_utils.cc"
#include "../../../src/runtime/library_module.cc"
#include "../../../src/runtime/metadata_module.cc"
#include "../../../src/runtime/module.cc"
@@ -118,7 +118,7 @@ void LaunchSyncServer() {
std::ifstream fs(name, std::ios::in);
std::string url, key;
int port;
- CHECK(fs >> url >> port >> key) << "Invalid RPC config file " << name;
+ ICHECK(fs >> url >> port >> key) << "Invalid RPC config file " << name;
RPCConnect(url, port, "server:" + key, TVMArgs(nullptr, nullptr, 0))->ServerLoop();
}
diff --git a/apps/ios_rpc/tvmrpc/ViewController.mm b/apps/ios_rpc/tvmrpc/ViewController.mm
index 6c618c48096f..910c650aedc1 100644
--- a/apps/ios_rpc/tvmrpc/ViewController.mm
+++ b/apps/ios_rpc/tvmrpc/ViewController.mm
@@ -80,7 +80,7 @@ - (void)onReadAvailable {
} else {
initialized_ = true;
self.statusLabel.text = @"Proxy connected.";
- CHECK(handler_ != nullptr);
+ ICHECK(handler_ != nullptr);
}
}
const int kBufferSize = 4 << 10;
@@ -158,7 +158,7 @@ - (void)open {
[outputStream_ open];
[inputStream_ open];
handler_ = tvm::runtime::CreateServerEventHandler(outputStream_, key_, "%toinit");
- CHECK(handler_ != nullptr);
+ ICHECK(handler_ != nullptr);
self.infoText.text = @"";
self.statusLabel.text = @"Connecting...";
}
diff --git a/apps/microtvm/README.md b/apps/microtvm/README.md
new file mode 100644
index 000000000000..97b844a4c01b
--- /dev/null
+++ b/apps/microtvm/README.md
@@ -0,0 +1,28 @@
+# microTVM Reference Virtual Machines
+
+
+microTVM is the effort to allow TVM to build and execute models on bare-metal microcontrollers.
+These Virtual Machines are used to reproduce results and bugs when using microTVM with real
+physical hardware. Note that they are not used to run Continuous Integration regression tests--
+those are instead run by the QEMU container (they run against an emulator, rather than real
+hardware).
+
+
+See the "microTVM Reference Virtual Machines" tutorial for information on how to use these.
diff --git a/apps/microtvm/reference-vm/.gitignore b/apps/microtvm/reference-vm/.gitignore
new file mode 100644
index 000000000000..d918f5e13cc5
--- /dev/null
+++ b/apps/microtvm/reference-vm/.gitignore
@@ -0,0 +1 @@
+/release-test
\ No newline at end of file
diff --git a/apps/microtvm/reference-vm/README.md b/apps/microtvm/reference-vm/README.md
new file mode 100644
index 000000000000..7ef7900c3e05
--- /dev/null
+++ b/apps/microtvm/reference-vm/README.md
@@ -0,0 +1,67 @@
+# microTVM Reference Virtual Machines
+
+This directory contains Vagrant specifications that create reference Virtual Machines for use with
+microTVM. These machines help microTVM users collaborate by providing a stable reference test
+environment.
+
+For more information on how to use them, see the microTVM Reference Virtual Machines tutorial.
+
+
+## Reference VM Developer Information
+
+Each RTOS or platform that integrates with microTVM can check-in a Reference VM in this directory to
+help the community collaborate. You should use the tools provided here to ensure a uniform release
+process across all platforms. Typically, releases need to be created by TVM committers.
+
+Generally speaking, it's expected that any integrated platform with a regression test checked-in to
+the tvm repository should also define a reference VM. If you want to integrate a new platform,
+please raise a discussion on [the forum](https://discuss.tvm.ai).
+
+### Organization
+
+Reference VMs are organized as follows:
+
+* `base-box-tool.py` - Reference VM build, test, and release tool
+* `<platform>/`
+** `Vagrantfile` - the Vagrantfile that end-users will invoke. Should be based on a base box
+ which contains dependencies other than the TVM python dependencies.
+** `base-box` - Top-level directory which defines the base box.
+*** `Vagrantfile.packer-template` - Packer template Vagrantfile which will be used to build the
+ base box.
+*** `test-config.json` - JSON file explaining how to perform release tests to `base-box-tool.py`
+
+## Creating Releases
+
+1. Build the base box for the given platform: `$ ./base-box-tool.py build <platform>`
+2. Run release tests for each platform:
+ 1. Connect any needed hardware to the VM host machine.
+   2. Run tests: `$ ./base-box-tool.py test <platform> [--test-device-serial=<serial>]`. This
+ command does the following for each provider:
+      1. Copies all files inside `./<platform>` except `.vagrant` and `base-box` to
+ `./release-test`. This is done to avoid reusing any VM the developer may have started.
+      2. Executes `$ vagrant up --provider=<provider>`.
+ 3. Finds an attached USB device matching the VID and PID specified in `test-config.json`,
+ and if `--test-device-serial` was given, that serial number (as reported to USB). Creates
+         a rule to autoconnect this device to the VM, and also attaches it to the VM.
+ 4. SSHs to the VM, `cd` to the TVM root directory, and runs `test_cmd` from
+ `test-config.json`. Nonzero status means failure.
+3. If release tests fail, fix them and restart from step 1.
+4. If release tests pass: `$ ./base-box-tool.py release <platform>`. Be sure you've logged
+ in to Vagrant Cloud using the `vagrant` tool.
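Assuming the `zephyr` platform directory added below and the `virtualbox` provider, the whole flow can be scripted; a minimal sketch (the release version is a placeholder):

```python
import subprocess

PLATFORM = "zephyr"      # must be a sub-directory of this directory
PROVIDER = "virtualbox"  # or "parallels"

# 1. Build the base box for the chosen provider.
subprocess.check_call(["./base-box-tool.py", f"--provider={PROVIDER}", "build", PLATFORM])

# 2. Run the release tests (connect the test hardware first).
subprocess.check_call(["./base-box-tool.py", f"--provider={PROVIDER}", "test", PLATFORM])

# 3. Publish the box once the tests pass (requires a `vagrant cloud` login).
subprocess.check_call(["./base-box-tool.py", "--release-version=0.0.1", "release", PLATFORM])
```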
diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py
new file mode 100755
index 000000000000..c317a373bd8b
--- /dev/null
+++ b/apps/microtvm/reference-vm/base-box-tool.py
@@ -0,0 +1,426 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import argparse
+import json
+import logging
+import os
+import re
+import shlex
+import shutil
+import subprocess
+import sys
+
+
+_LOG = logging.getLogger(__name__)
+
+
+THIS_DIR = os.path.realpath(os.path.dirname(__file__) or ".")
+
+
+# List of vagrant providers supported by this tool
+ALL_PROVIDERS = (
+ "parallels",
+ "virtualbox",
+)
+
+
+def parse_virtualbox_devices():
+ output = subprocess.check_output(["VBoxManage", "list", "usbhost"], encoding="utf-8")
+ devices = []
+ current_dev = {}
+ for line in output.split("\n"):
+ if not line.strip():
+ if current_dev:
+ if "VendorId" in current_dev and "ProductId" in current_dev:
+ devices.append(current_dev)
+ current_dev = {}
+
+ continue
+
+ key, value = line.split(":", 1)
+ value = value.lstrip(" ")
+ current_dev[key] = value
+
+ if current_dev:
+ devices.append(current_dev)
+ return devices
+
+
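+# Matches VBoxManage "list usbhost" id fields such as "0x0483 (0483)", capturing the four hex digits.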
+VIRTUALBOX_VID_PID_RE = re.compile(r"0x([0-9A-Fa-f]{4}).*")
+
+
+def attach_virtualbox(uuid, vid_hex=None, pid_hex=None, serial=None):
+ usb_devices = parse_virtualbox_devices()
+ for dev in usb_devices:
+ m = VIRTUALBOX_VID_PID_RE.match(dev["VendorId"])
+ if not m:
+ _LOG.warning("Malformed VendorId: %s", dev["VendorId"])
+ continue
+
+ dev_vid_hex = m.group(1).lower()
+
+ m = VIRTUALBOX_VID_PID_RE.match(dev["ProductId"])
+ if not m:
+ _LOG.warning("Malformed ProductId: %s", dev["ProductId"])
+ continue
+
+ dev_pid_hex = m.group(1).lower()
+
+ if (
+ vid_hex == dev_vid_hex
+ and pid_hex == dev_pid_hex
+ and (serial is None or serial == dev["SerialNumber"])
+ ):
+ rule_args = [
+ "VBoxManage",
+ "usbfilter",
+ "add",
+ "0",
+ "--action",
+ "hold",
+ "--name",
+ "test device",
+ "--target",
+ uuid,
+ "--vendorid",
+ vid_hex,
+ "--productid",
+ pid_hex,
+ ]
+ if serial is not None:
+ rule_args.extend(["--serialnumber", serial])
+ subprocess.check_call(rule_args)
+ subprocess.check_call(["VBoxManage", "controlvm", uuid, "usbattach", dev["UUID"]])
+ return
+
+ raise Exception(
+ f"Device with vid={vid_hex}, pid={pid_hex}, serial={serial!r} not found:\n{usb_devices!r}"
+ )
+
+
+def attach_parallels(uuid, vid_hex=None, pid_hex=None, serial=None):
+ usb_devices = json.loads(
+ subprocess.check_output(["prlsrvctl", "usb", "list", "-j"], encoding="utf-8")
+ )
+ for dev in usb_devices:
+ _, dev_vid_hex, dev_pid_hex, _, _, dev_serial = dev["System name"].split("|")
+ dev_vid_hex = dev_vid_hex.lower()
+ dev_pid_hex = dev_pid_hex.lower()
+ if (
+ vid_hex == dev_vid_hex
+ and pid_hex == dev_pid_hex
+ and (serial is None or serial == dev_serial)
+ ):
+ subprocess.check_call(["prlsrvctl", "usb", "set", dev["Name"], uuid])
+ if "Used-By-Vm-Name" in dev:
+ subprocess.check_call(
+ ["prlctl", "set", dev["Used-By-Vm-Name"], "--device-disconnect", dev["Name"]]
+ )
+ subprocess.check_call(["prlctl", "set", uuid, "--device-connect", dev["Name"]])
+ return
+
+ raise Exception(
+ f"Device with vid={vid_hex}, pid={pid_hex}, serial={serial!r} not found:\n{usb_devices!r}"
+ )
+
+
+ATTACH_USB_DEVICE = {
+ "parallels": attach_parallels,
+ "virtualbox": attach_virtualbox,
+}
+
+
+def generate_packer_config(file_path, providers):
+ builders = []
+ for provider_name in providers:
+ builders.append(
+ {
+ "type": "vagrant",
+ "output_dir": f"output-packer-{provider_name}",
+ "communicator": "ssh",
+ "source_path": "generic/ubuntu1804",
+ "provider": provider_name,
+ "template": "Vagrantfile.packer-template",
+ }
+ )
+
+ with open(file_path, "w") as f:
+ json.dump(
+ {
+ "builders": builders,
+ },
+ f,
+ sort_keys=True,
+ indent=2,
+ )
+
+
+def build_command(args):
+ generate_packer_config(
+ os.path.join(THIS_DIR, args.platform, "base-box", "packer.json"),
+        args.provider or ALL_PROVIDERS,
+ )
+ subprocess.check_call(
+ ["packer", "build", "packer.json"], cwd=os.path.join(THIS_DIR, args.platform, "base-box")
+ )
+
+
+REQUIRED_TEST_CONFIG_KEYS = {
+ "vid_hex": str,
+ "pid_hex": str,
+ "test_cmd": list,
+}
+
+
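+# Matches the 'config.vm.box = "..."' line in a Vagrantfile; do_build_release_test_vm rewrites it to point at the locally built box.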
+VM_BOX_RE = re.compile(r'(.*\.vm\.box) = "(.*)"')
+
+
+# Paths, relative to the platform box directory, which will not be copied to release-test dir.
+SKIP_COPY_PATHS = [".vagrant", "base-box"]
+
+
+def do_build_release_test_vm(release_test_dir, user_box_dir, base_box_dir, provider_name):
+ if os.path.exists(release_test_dir):
+ try:
+ subprocess.check_call(["vagrant", "destroy", "-f"], cwd=release_test_dir)
+ except subprocess.CalledProcessError:
+ _LOG.warning("vagrant destroy failed--removing dirtree anyhow", exc_info=True)
+
+ shutil.rmtree(release_test_dir)
+
+ for dirpath, _, filenames in os.walk(user_box_dir):
+ rel_path = os.path.relpath(dirpath, user_box_dir)
+ if any(
+ rel_path == scp or rel_path.startswith(f"{scp}{os.path.sep}") for scp in SKIP_COPY_PATHS
+ ):
+ continue
+
+ dest_dir = os.path.join(release_test_dir, rel_path)
+ os.makedirs(dest_dir)
+ for filename in filenames:
+ shutil.copy2(os.path.join(dirpath, filename), os.path.join(dest_dir, filename))
+
+ release_test_vagrantfile = os.path.join(release_test_dir, "Vagrantfile")
+ with open(release_test_vagrantfile) as f:
+ lines = list(f)
+
+ found_box_line = False
+ with open(release_test_vagrantfile, "w") as f:
+ for line in lines:
+ m = VM_BOX_RE.match(line)
+ if not m:
+ f.write(line)
+ continue
+
+ box_package = os.path.join(
+ base_box_dir, f"output-packer-{provider_name}", "package.box"
+ )
+ box_relpath = os.path.relpath(box_package, release_test_dir)
+ f.write(f'{m.group(1)} = "{box_relpath}"\n')
+ found_box_line = True
+
+ if not found_box_line:
+ _LOG.error(
+            "testing provider %s: couldn't find config.vm.box = line in Vagrantfile; unable to test",
+ provider_name,
+ )
+ return False
+
+ # Delete the old box registered with Vagrant, which may lead to a falsely-passing release test.
+ remove_args = ["vagrant", "box", "remove", box_relpath]
+ return_code = subprocess.call(remove_args, cwd=release_test_dir)
+ assert return_code in (0, 1), f'{" ".join(remove_args)} returned exit code {return_code}'
+ subprocess.check_call(["vagrant", "up", f"--provider={provider_name}"], cwd=release_test_dir)
+
+ return True
+
+
+def do_run_release_test(release_test_dir, provider_name, test_config, test_device_serial):
+ with open(
+ os.path.join(release_test_dir, ".vagrant", "machines", "default", provider_name, "id")
+ ) as f:
+ machine_uuid = f.read()
+ ATTACH_USB_DEVICE[provider_name](
+ machine_uuid,
+ vid_hex=test_config["vid_hex"],
+ pid_hex=test_config["pid_hex"],
+ serial=test_device_serial,
+ )
+ tvm_home = os.path.realpath(os.path.join(THIS_DIR, "..", "..", ".."))
+
+ def _quote_cmd(cmd):
+ return " ".join(shlex.quote(a) for a in cmd)
+
+ test_cmd = _quote_cmd(["cd", tvm_home]) + " && " + _quote_cmd(test_config["test_cmd"])
+ subprocess.check_call(["vagrant", "ssh", "-c", f"bash -ec '{test_cmd}'"], cwd=release_test_dir)
+
+
+def test_command(args):
+ user_box_dir = os.path.join(THIS_DIR, args.platform)
+ base_box_dir = os.path.join(THIS_DIR, args.platform, "base-box")
+ test_config_file = os.path.join(base_box_dir, "test-config.json")
+ with open(test_config_file) as f:
+ test_config = json.load(f)
+ for key, expected_type in REQUIRED_TEST_CONFIG_KEYS.items():
+ assert key in test_config and isinstance(
+ test_config[key], expected_type
+ ), f"Expected key {key} of type {expected_type} in {test_config_file}: {test_config!r}"
+
+ test_config["vid_hex"] = test_config["vid_hex"].lower()
+ test_config["pid_hex"] = test_config["pid_hex"].lower()
+
+ providers = args.provider
+ provider_passed = {p: False for p in providers}
+
+ release_test_dir = os.path.join(THIS_DIR, "release-test")
+
+ if args.skip_build:
+ assert len(providers) == 1, "--skip-build was given, but >1 provider specified"
+
+ for provider_name in providers:
+ try:
+ if not args.skip_build:
+ do_build_release_test_vm(
+ release_test_dir, user_box_dir, base_box_dir, provider_name
+ )
+ do_run_release_test(
+ release_test_dir, provider_name, test_config, args.test_device_serial
+ )
+ provider_passed[provider_name] = True
+
+ finally:
+ if not args.skip_build and len(providers) > 1:
+ subprocess.check_call(["vagrant", "destroy", "-f"], cwd=release_test_dir)
+ shutil.rmtree(release_test_dir)
+
+ if not all(provider_passed[p] for p in provider_passed.keys()):
+ sys.exit(
+ "some providers failed release test: "
+            + ",".join(name for name, passed in provider_passed.items() if not passed)
+ )
+
+
+def release_command(args):
+    if not args.release_version:
+        sys.exit("--release-version must be specified")
+
+    subprocess.check_call(
+        [
+            "vagrant",
+            "cloud",
+            "version",
+            "create",
+            f"tlcpack/microtvm-{args.platform}",
+            args.release_version,
+        ]
+    )
+
+ for provider_name in args.provider:
+ subprocess.check_call(
+ [
+ "vagrant",
+ "cloud",
+ "publish",
+ "-f",
+ f"tlcpack/microtvm-{args.platform}",
+ args.release_version,
+ provider_name,
+ os.path.join(
+ THIS_DIR,
+ args.platform,
+ "base-box",
+ f"output-packer-{provider_name}/package.box",
+ ),
+ ]
+ )
+
+
+ALL_COMMANDS = {
+ "build": build_command,
+ "test": test_command,
+ "release": release_command,
+}
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="Automates building, testing, and releasing a base box"
+ )
+    parser.add_argument(
+        "command",
+        help=(
+            "Action or actions (comma-separated) to perform; choices: "
+            + ", ".join(ALL_COMMANDS)
+        ),
+    )
+ parser.add_argument(
+ "platform",
+ help="Name of the platform VM to act on. Must be a sub-directory of this directory.",
+ )
+ parser.add_argument(
+ "--provider",
+ choices=ALL_PROVIDERS,
+ action="append",
+ default=[],
+ help="Name of the provider or providers to act on; if not specified, act on all",
+ )
+ parser.add_argument(
+ "--skip-build",
+ action="store_true",
+ help=(
+ "For use with the 'test' command. If given, assume a box has already been built in "
+ "the release-test subdirectory. Attach a USB device to this box and execute the "
+ "release test script--do not delete it."
+ ),
+ )
+ parser.add_argument(
+ "--test-device-serial",
+ help=(
+ "If given, attach the test device with this USB serial number. Corresponds to the "
+ "iSerial field from `lsusb -v` output."
+ ),
+ )
+ parser.add_argument(
+ "--release-version",
+ help="Version to release, in the form 'x.y.z'. Must be specified with release.",
+ )
+
+ return parser.parse_args()
+
+
+def main():
+ args = parse_args()
+ if os.path.sep in args.platform or not os.path.isdir(os.path.join(THIS_DIR, args.platform)):
+        sys.exit(f"<platform> must be a sub-directory of {THIS_DIR}; got {args.platform}")
+
+ if not args.provider:
+ args.provider = list(ALL_PROVIDERS)
+
+ todo = []
+ for phase in args.command.split(","):
+ if phase not in ALL_COMMANDS:
+ sys.exit(f"unknown command: {phase}")
+
+ todo.append(ALL_COMMANDS[phase])
+
+ for phase in todo:
+ phase(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/microtvm/reference-vm/zephyr/.gitignore b/apps/microtvm/reference-vm/zephyr/.gitignore
new file mode 100644
index 000000000000..dace7081e3f2
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/.gitignore
@@ -0,0 +1 @@
+/.vagrant
diff --git a/apps/microtvm/reference-vm/zephyr/Vagrantfile b/apps/microtvm/reference-vm/zephyr/Vagrantfile
new file mode 100644
index 000000000000..5a73d1f5e79b
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/Vagrantfile
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+Vagrant.configure("2") do |config|
+ config.vm.box = "tlcpack/microtvm-zephyr"
+
+ tvm_home = "../../../.."
+ dirs_to_mount = [Pathname.new(Pathname.new(tvm_home).expand_path())]
+ if ENV.has_key?("TVM_PROJECT_DIR") then
+ dirs_to_mount.append(ENV["TVM_PROJECT_DIR"])
+ puts "NOTE: also configuring project dir: %s" % [dirs_to_mount[-1]]
+ end
+
+ git_file = Pathname.new(tvm_home + "/.git")
+ if git_file.ftype() == "file" then
+    gitdir_match = Regexp.new('^gitdir: (?<gitdir>.*/.git).*\n$', Regexp::MULTILINE).match(git_file.read())
+ if !gitdir_match.nil? then
+ dirs_to_mount.append(Pathname.new(gitdir_match.named_captures["gitdir"]))
+ puts "NOTE: also configuring git-worktree gitdir: %s" % [dirs_to_mount[-1]]
+ end
+ end
+
+ config.vm.provision "shell", path: "setup.sh", env: {"TVM_HOME": dirs_to_mount[0]}, privileged: false
+
+ # Enable USB Controller on VirtualBox
+ vm_name = "microtvm-#{Time.now.tv_sec}"
+ config.vm.provider "virtualbox" do |vb, overrides|
+ vb.name = vm_name
+ vb.customize ["modifyvm", :id, "--usb", "on"]
+ vb.customize ["modifyvm", :id, "--usbehci", "on"]
+ vb.customize ["modifyvm", :id, "--usbxhci", "on"]
+ dirs_to_mount.each do |d|
+ overrides.vm.synced_folder d.to_s, d.to_s
+ end
+ end
+
+ config.vm.provider "parallels" do |prl, overrides|
+ prl.name = vm_name
+ prl.update_guest_tools = true
+ prl.customize ["set", :id, "--support-usb30", "on"]
+ dirs_to_mount.each do |d|
+ overrides.vm.synced_folder d.to_s, d.to_s, mount_options: ["share", "nosuid", "host_inodes"]
+ end
+ end
+
+end
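A note on the gitdir handling above: in a git worktree, `.git` is a plain file containing a `gitdir: <path>` pointer rather than a directory, so the pointed-to directory must also be synced into the VM for git to work there. A rough Python sketch of the same detection, for illustration only:

```python
import os
import re

def worktree_gitdir(tvm_home: str):
    """Return the pointed-to git dir if tvm_home is a git worktree, else None."""
    git_file = os.path.join(tvm_home, ".git")
    if not os.path.isfile(git_file):
        return None  # a normal checkout keeps .git as a directory
    with open(git_file) as f:
        m = re.match(r"gitdir: (?P<gitdir>.*/\.git)", f.read())
    return m.group("gitdir") if m else None
```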
diff --git a/apps/microtvm/reference-vm/zephyr/base-box/.gitignore b/apps/microtvm/reference-vm/zephyr/base-box/.gitignore
new file mode 100644
index 000000000000..e4406c4f61e2
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/base-box/.gitignore
@@ -0,0 +1,4 @@
+*.box
+.vagrant
+/output-packer-*
+/packer.json
diff --git a/conda/tvm/meta.yaml b/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template
similarity index 50%
rename from conda/tvm/meta.yaml
rename to apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template
index 9e8f94789394..b1fff9c63806 100644
--- a/conda/tvm/meta.yaml
+++ b/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template
@@ -15,48 +15,26 @@
# specific language governing permissions and limitations
# under the License.
-{% set version = "0.8.dev0" %}
+Vagrant.configure("2") do |config|
+ # From hashicorp default template:
+ # https://github.com/hashicorp/packer/blob/master/builder/vagrant/step_create_vagrantfile.go#L23-L37
-package:
- name: tvm
- version: {{ version }}
+ config.vm.define "source" do |source|
+ source.vm.box = "{{.SourceBox}}"
+ config.ssh.insert_key = {{.InsertKey}}
+ end
-source:
- path: ../..
+ config.vm.define "output" do |output|
+ output.vm.box = "{{.BoxName}}"
+ output.vm.box_url = "file://package.box"
+ config.ssh.insert_key = {{.InsertKey}}
+ end
-build:
- number: 0
+ {{ if ne .SyncedFolder "" -}}
+ config.vm.synced_folder "{{.SyncedFolder}}", "/vagrant"
+ {{- else -}}
+ config.vm.synced_folder ".", "/vagrant", disabled: true
+ {{- end}}
-requirements:
- build:
- - {{ compiler('cxx') }}
- host:
- - python {{ python }}
- - cython
- - numpy
- - setuptools
- - decorator
- - tvm-libs {{ version }}
- run:
- - python {{ python }}
- - {{ pin_compatible('numpy') }}
- - decorator
- - tvm-libs {{ version }}
- - psutil
-
-test:
- imports:
- - tvm
- requires:
- - pytest
- - scipy
- source_files:
- - tests/python
- commands:
- - python -m pytest -v tests/python/integration
-
-about:
- home: https://github.com/apache/incubator-tvm
- license: Apache-2.0
- license_family: Apache
- summary: a low level domain specific language for compiling tensor computation pipelines
+ config.vm.provision "shell", path: "../setup.sh", privileged: false
+end
diff --git a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh
new file mode 100644
index 000000000000..fd758064f4ca
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh
@@ -0,0 +1,105 @@
+#!/bin/bash -e
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+sudo apt update
+sudo apt install -y build-essential
+sudo apt-get --purge remove modemmanager # required to access serial ports.
+
+# Zephyr
+wget --no-verbose https://apt.kitware.com/keys/kitware-archive-latest.asc
+sudo apt-key add kitware-archive-latest.asc
+sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
+sudo apt update
+# NOTE: latest cmake cannot be installed due to
+# https://github.com/zephyrproject-rtos/zephyr/issues/30232
+sudo apt install -y --no-install-recommends git \
+ cmake=3.18.4-0kitware1 cmake-data=3.18.4-0kitware1 \
+ ninja-build gperf ccache dfu-util device-tree-compiler wget \
+ python3-dev python3-pip python3-setuptools python3-tk python3-wheel xz-utils file \
+ make gcc gcc-multilib g++-multilib libsdl2-dev
+
+# Avahi, so that ssh microtvm works.
+# apt install -y avahi-daemon
+
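+# Rename the guest so it is reachable under a predictable name ("microtvm").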
+OLD_HOSTNAME=$(hostname)
+sudo hostnamectl set-hostname microtvm
+sudo sed -i.bak "s/${OLD_HOSTNAME}/microtvm.localdomain/g" /etc/hosts
+
+# Poetry deps
+sudo apt install -y python3-venv
+
+# TVM deps
+sudo apt install -y llvm
+
+# ONNX deps
+sudo apt install -y protobuf-compiler libprotoc-dev
+
+# nrfjprog
+cd ~
+mkdir -p nrfjprog
+wget --no-verbose -O nRFCommandLineTools1090Linuxamd64.tar.gz https://www.nordicsemi.com/-/media/Software-and-other-downloads/Desktop-software/nRF-command-line-tools/sw/Versions-10-x-x/10-9-0/nRFCommandLineTools1090Linuxamd64tar.gz
+cd nrfjprog
+tar -xzvf ../nRFCommandLineTools1090Linuxamd64.tar.gz
+sudo apt install -y ./JLink_Linux_V680a_x86_64.deb
+sudo apt install -y ./nRF-Command-Line-Tools_10_9_0_Linux-amd64.deb
+source ~/.profile
+nrfjprog --help
+cd ..
+rm -rf nrfjprog nRFCommandLineTools1090Linuxamd64.tar.gz
+
+# Zephyr
+pip3 install --user -U west
+echo 'export PATH=$HOME/.local/bin:"$PATH"' >> ~/.profile
+source ~/.profile
+echo PATH=$PATH
+west init --mr v2.4.0 ~/zephyr
+cd ~/zephyr
+west update
+west zephyr-export
+
+cd ~
+echo "Downloading zephyr SDK..."
+wget --no-verbose https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v0.11.3/zephyr-sdk-0.11.3-setup.run
+chmod +x zephyr-sdk-0.11.3-setup.run
+./zephyr-sdk-0.11.3-setup.run -- -d ~/zephyr-sdk -y
+rm -rf zephyr-sdk-0.11.3-setup.run
+
+# GDB for Zephyr SDK depends on python3.8
+sudo add-apt-repository ppa:deadsnakes/ppa
+sudo apt install -y python3.8-dev
+
+sudo find ~/zephyr-sdk -name '*.rules' -exec cp {} /etc/udev/rules.d \;
+sudo udevadm control --reload
+
+# Poetry
+curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python3
+sed -i "/^# If not running interactively,/ i source \$HOME/.poetry/env" ~/.bashrc
+sed -i "/^# If not running interactively,/ i export ZEPHYR_BASE=$HOME/zephyr/zephyr" ~/.bashrc
+sed -i "/^# If not running interactively,/ i\\ " ~/.bashrc
+
+# Clean box for packaging as a base box
+sudo apt-get clean
+EMPTY_FILE="$HOME/EMPTY"
+dd if=/dev/zero "of=${EMPTY_FILE}" bs=1M || /bin/true
+if [ ! -e "${EMPTY_FILE}" ]; then
+ echo "failed to zero empty sectors on disk"
+ exit 2
+fi
+rm -f "${EMPTY_FILE}"
diff --git a/apps/microtvm/reference-vm/zephyr/base-box/test-config.json b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json
new file mode 100644
index 000000000000..78a6bd216e65
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json
@@ -0,0 +1,4 @@
+{"vid_hex": "0483",
+ "pid_hex": "374b",
+ "test_cmd": ["pytest", "tests/micro/qemu/test_zephyr.py", "--microtvm-platforms=stm32f746xx"]
+}
diff --git a/apps/microtvm/reference-vm/zephyr/pyproject.toml b/apps/microtvm/reference-vm/zephyr/pyproject.toml
new file mode 100644
index 000000000000..ed8182584e36
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/pyproject.toml
@@ -0,0 +1,141 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[tool.black]
+line-length = 100
+target-version = ['py36']
+include = '(\.pyi?$)'
+exclude = '''
+
+(
+ /(
+ \.github
+ | \.tvm
+ | \.tvm_test_data
+ | \.vscode
+ | \.venv
+ | 3rdparty
+ | build\/
+ | cmake\/
+ | conda\/
+ | docker\/
+ | docs\/
+ | golang\/
+ | include\/
+ | jvm\/
+ | licenses\/
+ | nnvm\/
+ | rust\/
+ | src\/
+ | vta\/
+ | web\/
+ )/
+)
+'''
+[tool.poetry]
+name = "tvm"
+version = "0.1.0"
+description = ""
+authors = ["Your Name "]
+packages = [
+ { include = "tvm", from = "../../../../python" },
+]
+
+[tool.poetry.dependencies]
+attrs = "^19"
+decorator = "^4.4"
+numpy = "~1.19"
+psutil = "^5"
+scipy = "^1.4"
+python = "^3.6"
+tornado = "^6"
+typed_ast = "^1.4"
+
+# AutoTVM
+xgboost = {version = "^1.1", optional = true}
+
+#############
+# Importers #
+#############
+
+# NOTE: Caffe frontend dependency is from torch package.
+
+# CoreML
+coremltools = {version = "^3.3", optional = true}
+
+# Darknet
+opencv-python = {version = "^4.2", optional = true}
+cffi = {version = "^1.14", optional = true}
+
+# NOTE: Keras provided by tensorflow package.
+# If TF versions conflict, maybe try: keras = "2.3.1"
+
+# MXNet frontend
+mxnet = {version = "^1.6.0", optional = true}
+
+# ONNX frontend
+onnx = {version = "1.6.0", optional = true}
+onnxruntime = {version = "1.0.0", optional = true}
+
+# Pytorch (also used by ONNX)
+# NOTE: cannot download this right now due to https://github.com/python-poetry/poetry/issues/2247
+# torch = {url = "https://download.pytorch.org/whl/cu101/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl", optional = true}
+# torchvision = {version = "0.5.0", optional = true}
+# NOTE: torch depends on a number of other packages but, unhelpfully, does not expose that in the
+# wheel!!!
+future = {version = "*", optional = true}
+
+# Tensorflow frontend
+tensorflow = {version = "^2.1", optional = true}
+tensorflow-estimator = {version = "^2.1", optional = true}
+
+# TFLite frontend
+tflite = {version = "2.1.0", optional = true}
+wheel = "*"
+
+
+[tool.poetry.extras]
+xgboost = ["xgboost"]
+importer-caffe2 = ["torch"]
+importer-coreml = ["coremltools"]
+importer-darknet = ["opencv-python"]
+importer-keras = ["tensorflow", "tensorflow-estimator"]
+importer-onnx = ["onnx", "onnxruntime", "torch", "torchvision", "future"]
+importer-pytorch = ["torch", "torchvision", "future"]
+importer-tensorflow = ["tensorflow", "tensorflow-estimator"]
+importer-tflite = ["tlfite", "tensorflow", "tensorflow-estimator"]
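+# Install an importer's optional deps with e.g.: poetry install -E importer-onnx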
+
+[tool.poetry.dev-dependencies]
+autodocsumm = "^0.1"
+black = "^19.10b0"
+sphinx = "^3.0"
+sphinx-gallery = "^0.4"
+sphinx-rtd-theme = "^0.4"
+matplotlib = "^3.2"
+Image = "^1.5"
+recommonmark = "^0.6"
+pillow = "< 7"
+pyformat = "^0.7"
+pylint = "^2.4"
+pytest = "^5.4"
+
+[build-system]
+requires = ["poetry>=0.12"]
+build-backend = "poetry.masonry.api"
+
+[tool.autopep8]
+max_line_length = 100
diff --git a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh
new file mode 100755
index 000000000000..df833042c670
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh
@@ -0,0 +1,34 @@
+#!/bin/bash -e
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+cd "$(dirname $0)"
+cd "$(git rev-parse --show-toplevel)"
+BUILD_DIR=build-microtvm
+
+if [ ! -e "${BUILD_DIR}" ]; then
+ mkdir "${BUILD_DIR}"
+fi
+cp cmake/config.cmake "${BUILD_DIR}"
+cd "${BUILD_DIR}"
+sed -i 's/USE_MICRO OFF/USE_MICRO ON/' config.cmake
+sed -i 's/USE_GRAPH_RUNTIME_DEBUG OFF/USE_GRAPH_RUNTIME_DEBUG ON/' config.cmake
+sed -i 's/USE_LLVM OFF/USE_LLVM ON/' config.cmake
+cmake ..
+make -j4
diff --git a/apps/microtvm/reference-vm/zephyr/setup.sh b/apps/microtvm/reference-vm/zephyr/setup.sh
new file mode 100644
index 000000000000..053e41e85256
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/setup.sh
@@ -0,0 +1,44 @@
+#!/bin/bash -e
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+# TVM
+# NOTE: TVM is presumed to be mounted already by Vagrantfile.
+cd "${TVM_HOME}"
+
+apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh
+
+cd apps/microtvm/reference-vm/zephyr
+
+poetry env use 3.6
+# NOTE: due to https://github.com/python-poetry/poetry/issues/2247, download torch here.
+poetry run pip3 install torch==1.4.0 torchvision==0.5.0
+
+echo "------------------------------[ TVM Message ]------------------------------"
+echo "WARNING: running 'poetry lock', which could take several minutes (depending"
+echo "on your network connection and the state of PyPI) as dependencies are"
+echo "downloaded and cached for future use."
+echo "------------------------------[ TVM Message ]------------------------------"
+poetry lock -vvv
+poetry install
+poetry run pip3 install -r ~/zephyr/zephyr/scripts/requirements.txt
+
+echo "export TVM_LIBRARY_PATH=\"$TVM_HOME\"/build-microtvm" >>~/.profile
+echo "VENV_PATH=\$((cd \"$TVM_HOME\"/apps/microtvm/reference-vm/zephyr && poetry env list --full-path) | sed -E 's/^(.*)[[:space:]]\(Activated\)\$/\1/g')" >>~/.profile
+echo "source \$VENV_PATH/bin/activate" >>~/.profile
diff --git a/apps/topi_recipe/conv/depthwise_conv2d_test.py b/apps/topi_recipe/conv/depthwise_conv2d_test.py
index 036f1a4240f2..94687edde5f9 100644
--- a/apps/topi_recipe/conv/depthwise_conv2d_test.py
+++ b/apps/topi_recipe/conv/depthwise_conv2d_test.py
@@ -22,7 +22,7 @@
from tvm.contrib import nvcc
from tvm import topi
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
from tvm.topi.cuda.depthwise_conv2d import (
schedule_depthwise_conv2d_nchw,
schedule_depthwise_conv2d_nhwc,
diff --git a/apps/topi_recipe/conv/test_conv2d_hwcn_map.py b/apps/topi_recipe/conv/test_conv2d_hwcn_map.py
index 1d2032d5c405..d67bfdc8952e 100644
--- a/apps/topi_recipe/conv/test_conv2d_hwcn_map.py
+++ b/apps/topi_recipe/conv/test_conv2d_hwcn_map.py
@@ -22,7 +22,7 @@
from tvm import te
from tvm.contrib import nvcc
from tvm import topi
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
TASK = "conv2d_hwcn_map"
USE_MANUAL_CODE = False
diff --git a/apps/topi_recipe/gemm/android_gemm_square.py b/apps/topi_recipe/gemm/android_gemm_square.py
index 522818842cfa..0e64dcd3844d 100644
--- a/apps/topi_recipe/gemm/android_gemm_square.py
+++ b/apps/topi_recipe/gemm/android_gemm_square.py
@@ -19,7 +19,7 @@
from tvm import te
import os
from tvm import rpc
-from tvm.contrib import util, ndk
+from tvm.contrib import utils, ndk
import numpy as np
# Set to be address of tvm proxy.
@@ -121,7 +121,7 @@ def test_gemm_gpu(N, times, bn, num_block, num_thread):
print(tvm.lower(s, [A, B, C], simple_mode=True))
f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
- temp = util.tempdir()
+ temp = utils.tempdir()
path_dso = temp.relpath("gemm_gpu.so")
f.export_library(path_dso, ndk.create_shared)
diff --git a/apps/wasm-standalone/wasm-graph/Cargo.toml b/apps/wasm-standalone/wasm-graph/Cargo.toml
index 9cdc8f599579..cea491b2f128 100644
--- a/apps/wasm-standalone/wasm-graph/Cargo.toml
+++ b/apps/wasm-standalone/wasm-graph/Cargo.toml
@@ -22,7 +22,7 @@ authors = ["TVM Contributors"]
edition = "2018"
description = "WebAssembly graph to deep learning frameworks using TVM"
readme = "README.md"
-repository = "https://github.com/apache/incubator-tvm"
+repository = "https://github.com/apache/tvm"
license = "Apache-2.0"
keywords = ["wasm", "machine learning", "tvm"]
diff --git a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
index cfea02a230d2..42695d28fadb 100644
--- a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
+++ b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
@@ -44,7 +44,7 @@ def build_graph_lib(model_file, opt_level):
# Compile the relay mod
mod, params = _get_mod_and_params(model_file)
- target = "llvm -target=wasm32-unknown-unknown -mattr=+simd128 --system-lib"
+ target = "llvm -mtriple=wasm32-unknown-unknown -mattr=+simd128 --system-lib"
with tvm.transform.PassContext(opt_level=opt_level):
graph_json, lib, params = relay.build(mod, target=target, params=params)
@@ -71,7 +71,7 @@ def build_graph_lib(model_file, opt_level):
"--opt-level",
type=int,
default=0,
- help="level of optimization. 0 is unoptimized and 3 is the highest level",
+ help="level of optimization. 0 is non-optimized and 3 is the highest level",
)
args = parser.parse_args()
diff --git a/apps/wasm-standalone/wasm-runtime/Cargo.toml b/apps/wasm-standalone/wasm-runtime/Cargo.toml
index db00a55c31b5..99f6db54431f 100644
--- a/apps/wasm-standalone/wasm-runtime/Cargo.toml
+++ b/apps/wasm-standalone/wasm-runtime/Cargo.toml
@@ -21,7 +21,7 @@ version = "0.1.0"
authors = ["TVM Contributors"]
edition = "2018"
description = "WebAssembly runtime to deep learning frameworks using wasmtime"
-repository = "https://github.com/apache/incubator-tvm"
+repository = "https://github.com/apache/tvm"
license = "Apache-2.0"
keywords = ["wasm", "machine learning", "wasmtime"]
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 1d465b2fe389..4a010d3ef099 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -113,15 +113,16 @@ set(USE_MICRO_STANDALONE_RUNTIME OFF)
#
# Possible values:
# - ON: enable llvm with cmake's find search
-# - OFF: disable llvm
+# - OFF: disable llvm; note this will disable CPU codegen,
+# which is needed for most use cases
# - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
-set(USE_LLVM OFF)
+set(USE_LLVM ON)
#---------------------------------------------
# Contrib libraries
#---------------------------------------------
# Whether to build with BYODT software emulated posit custom datatype
-#
+#
# Possible values:
# - ON: enable BYODT posit, requires setting UNIVERSAL_PATH
# - OFF: disable BYODT posit
@@ -222,6 +223,22 @@ set(USE_ETHOSN OFF)
# otherwise use ETHOSN_HW (OFF) to use the software test infrastructure
set(USE_ETHOSN_HW OFF)
+# Whether to build with TensorRT codegen or runtime
+# Examples are available here: docs/deploy/tensorrt.rst.
+#
+# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
+# offloaded to TensorRT. OFF/ON
+# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
+# the TensorRT library. OFF/ON/"path/to/TensorRT"
+set(USE_TENSORRT_CODEGEN OFF)
+set(USE_TENSORRT_RUNTIME OFF)
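+# For example (hypothetical local path):
+#   set(USE_TENSORRT_RUNTIME /path/to/TensorRT)  # or ON to search default paths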
+
+# Whether to use Vitis-AI codegen
+set(USE_VITIS_AI OFF)
+
+# Build Verilator codegen and runtime; an example is located in 3rdparty/vta-hw/apps/verilator
+set(USE_VERILATOR_HW OFF)
+
# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
@@ -260,7 +277,3 @@ set(USE_HEXAGON_SDK /path/to/sdk)
# Whether to use ONNX codegen
set(USE_TARGET_ONNX OFF)
-
-# Whether to compile the standalone C runtime.
-set(USE_STANDALONE_CRT ON)
-
diff --git a/cmake/modules/ClangFlags.cmake b/cmake/modules/ClangFlags.cmake
index 9a3ac05a2a5b..53d0e3631caf 100644
--- a/cmake/modules/ClangFlags.cmake
+++ b/cmake/modules/ClangFlags.cmake
@@ -21,7 +21,11 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_full_version)
string (REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION ${clang_full_version})
message(STATUS "CLANG_VERSION ${CLANG_VERSION}")
- if (CLANG_VERSION VERSION_GREATER_EQUAL 10.0)
+ # cmake 3.2 does not support VERSION_GREATER_EQUAL
+ set(CLANG_MINIMUM_VERSION 10.0)
+ if ((CLANG_VERSION VERSION_GREATER ${CLANG_MINIMUM_VERSION})
+ OR
+ (CLANG_VERSION VERSION_EQUAL ${CLANG_MINIMUM_VERSION}))
message(STATUS "Setting enhanced clang warning flags")
# These warnings are only enabled when clang's -Weverything flag is enabled
diff --git a/cmake/modules/LLVM.cmake b/cmake/modules/LLVM.cmake
index 5f8ace17111f..ac870b17faeb 100644
--- a/cmake/modules/LLVM.cmake
+++ b/cmake/modules/LLVM.cmake
@@ -16,7 +16,14 @@
# under the License.
# LLVM rules
-add_definitions(-DDMLC_USE_FOPEN64=0)
+# Due to LLVM debug symbols you can sometimes face linking issues on
+# certain compiler, platform combinations if you don't set NDEBUG.
+#
+# See https://github.com/imageworks/OpenShadingLanguage/issues/1069
+# for more discussion.
+add_definitions(-DDMLC_USE_FOPEN64=0 -DNDEBUG=1)
+# TODO(@jroesch, @tkonolige): if we actually use targets we can do this.
+# target_compile_definitions(tvm PRIVATE NDEBUG=1)
# Test if ${USE_LLVM} is not an explicit boolean false
# It may be a boolean or a string
diff --git a/cmake/modules/RustExt.cmake b/cmake/modules/RustExt.cmake
new file mode 100644
index 000000000000..2922bc48dee2
--- /dev/null
+++ b/cmake/modules/RustExt.cmake
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
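+# Builds the Rust compiler extension and links it into TVM. A sketch of the
+# expected invocation (values taken from the branches below):
+#   cmake -DUSE_RUST_EXT=STATIC ..   # or DYNAMIC, or OFF to disable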
+if(USE_RUST_EXT)
+ set(RUST_SRC_DIR "${CMAKE_SOURCE_DIR}/rust")
+ set(CARGO_OUT_DIR "${CMAKE_SOURCE_DIR}/rust/target")
+
+ if(USE_RUST_EXT STREQUAL "STATIC")
+ set(COMPILER_EXT_PATH "${CARGO_OUT_DIR}/release/libcompiler_ext.a")
+ elseif(USE_RUST_EXT STREQUAL "DYNAMIC")
+ set(COMPILER_EXT_PATH "${CARGO_OUT_DIR}/release/libcompiler_ext.so")
+ else()
+ message(FATAL_ERROR "invalid setting for USE_RUST_EXT, STATIC, DYNAMIC or OFF")
+ endif()
+
+ add_custom_command(
+ OUTPUT "${COMPILER_EXT_PATH}"
+ COMMAND cargo build --release
+ MAIN_DEPENDENCY "${RUST_SRC_DIR}"
+ WORKING_DIRECTORY "${RUST_SRC_DIR}/compiler-ext")
+
+ add_custom_target(rust_ext ALL DEPENDS "${COMPILER_EXT_PATH}")
+
+ # TODO(@jroesch, @tkonolige): move this to CMake target
+ # target_link_libraries(tvm "${COMPILER_EXT_PATH}" PRIVATE)
+ list(APPEND TVM_LINKER_LIBS ${COMPILER_EXT_PATH})
+
+ add_definitions(-DRUST_COMPILER_EXT=1)
+endif()
diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake
index 73c85d13e2ef..256ce2a48a6c 100644
--- a/cmake/modules/StandaloneCrt.cmake
+++ b/cmake/modules/StandaloneCrt.cmake
@@ -44,6 +44,7 @@ if(USE_MICRO)
"src/runtime/crt/include *.h -> include"
"src/runtime/crt/common *.c -> src/runtime/crt/common"
"src/runtime/crt/graph_runtime *.c -> src/runtime/crt/graph_runtime"
+ "src/runtime/crt/graph_runtime_module *.c -> src/runtime/crt/graph_runtime_module"
"src/runtime/crt/host crt_config.h -> src/runtime/crt/host"
"src/runtime/crt/utvm_rpc_common *.cc -> src/runtime/crt/utvm_rpc_common"
"src/runtime/crt/utvm_rpc_server *.cc -> src/runtime/crt/utvm_rpc_server"
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index a9fc66507d35..115216680fff 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -65,7 +65,7 @@ elseif(PYTHON)
target_compile_definitions(vta_fsim PUBLIC ${__strip_def})
endforeach()
if(APPLE)
- set_target_properties(vta_fsim PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+ set_property(TARGET vta_fsim APPEND PROPERTY LINK_FLAGS "-undefined dynamic_lookup")
endif(APPLE)
target_compile_definitions(vta_fsim PUBLIC USE_FSIM_TLPP)
endif()
@@ -86,7 +86,7 @@ elseif(PYTHON)
target_compile_definitions(vta_tsim PUBLIC ${__strip_def})
endforeach()
if(APPLE)
- set_target_properties(vta_tsim PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+ set_property(TARGET vta_tsim APPEND PROPERTY LINK_FLAGS "-undefined dynamic_lookup")
endif(APPLE)
endif()
diff --git a/cmake/modules/contrib/TensorRT.cmake b/cmake/modules/contrib/TensorRT.cmake
index 2615f1fe31e1..24a8241a2229 100644
--- a/cmake/modules/contrib/TensorRT.cmake
+++ b/cmake/modules/contrib/TensorRT.cmake
@@ -15,12 +15,26 @@
# specific language governing permissions and limitations
# under the License.
+# TensorRT Codegen only. This can be enabled independently of USE_TENSORRT_RUNTIME to enable
+# compilation of TensorRT modules without requiring TensorRT to be installed. The compiled modules
+# can only be executed using a TVM built with USE_TENSORRT_RUNTIME=ON.
+
+include (FindPackageHandleStandardArgs)
+
+if(USE_TENSORRT_CODEGEN)
+ message(STATUS "Build with TensorRT codegen")
+ file(GLOB COMPILER_TENSORRT_SRCS src/relay/backend/contrib/tensorrt/*.cc)
+ set_source_files_properties(${COMPILER_TENSORRT_SRCS} PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations")
+ file(GLOB RUNTIME_TENSORRT_SRCS src/runtime/contrib/tensorrt/tensorrt_runtime.cc)
+ set_source_files_properties(${RUNTIME_TENSORRT_SRCS} PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations")
+ list(APPEND COMPILER_SRCS ${COMPILER_TENSORRT_SRCS})
+ list(APPEND COMPILER_SRCS ${RUNTIME_TENSORRT_SRCS})
+endif()
+
# TensorRT Runtime
-if(USE_TENSORRT)
- # Enable codegen as well
- SET(USE_TENSORRT_CODEGEN ON)
- if(IS_DIRECTORY ${USE_TENSORRT})
- set(TENSORRT_ROOT_DIR ${USE_TENSORRT})
+if(USE_TENSORRT_RUNTIME)
+ if(IS_DIRECTORY ${USE_TENSORRT_RUNTIME})
+ set(TENSORRT_ROOT_DIR ${USE_TENSORRT_RUNTIME})
message(STATUS "Custom TensorRT path: " ${TENSORRT_ROOT_DIR})
endif()
find_path(TENSORRT_INCLUDE_DIR NvInfer.h HINTS ${TENSORRT_ROOT_DIR} PATH_SUFFIXES include)
@@ -33,21 +47,11 @@ if(USE_TENSORRT)
include_directories(${TENSORRT_INCLUDE_DIR})
list(APPEND TVM_RUNTIME_LINKER_LIBS ${TENSORRT_LIB_DIR})
- # Relay TRT runtime sources
- file(GLOB TENSORRT_RELAY_CONTRIB_SRC src/runtime/contrib/tensorrt/*.cc)
- list(APPEND RUNTIME_SRCS ${TENSORRT_RELAY_CONTRIB_SRC})
+ # TRT runtime sources
+ file(GLOB RUNTIME_TENSORRT_SRCS src/runtime/contrib/tensorrt/*.cc)
+ set_source_files_properties(${RUNTIME_TENSORRT_SRCS} PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations")
+ list(APPEND RUNTIME_SRCS ${RUNTIME_TENSORRT_SRCS})
# Set defines
add_definitions(-DTVM_GRAPH_RUNTIME_TENSORRT)
endif()
-# TensorRT Codegen only. This can be enabled independently of USE_TENSORRT to
-# enable compilation of TensorRT modules without requiring TensorRT to be
-# installed. The compiled modules will only be able to be executed using a TVM
-# built with USE_TENSORRT=ON.
-if(USE_TENSORRT_CODEGEN)
- message(STATUS "Build with TensorRT codegen")
- # Relay TRT codegen sources
- file(GLOB TENSORRT_RELAY_CONTRIB_SRC src/relay/backend/contrib/tensorrt/*.cc)
- list(APPEND COMPILER_SRCS ${TENSORRT_RELAY_CONTRIB_SRC})
- list(APPEND COMPILER_SRCS src/runtime/contrib/tensorrt/tensorrt_module.cc)
-endif()
diff --git a/cmake/modules/contrib/Verilator.cmake b/cmake/modules/contrib/Verilator.cmake
new file mode 100644
index 000000000000..d3c1a7161182
--- /dev/null
+++ b/cmake/modules/contrib/Verilator.cmake
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(USE_VERILATOR_HW STREQUAL "ON")
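+ # Build the verilator library in vta-hw first so find_library below can locate it.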
+ execute_process(COMMAND make --directory ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw/apps/verilator)
+ file(GLOB VERILATOR_RELAY_CONTRIB_SRC src/relay/backend/contrib/verilator/codegen.cc)
+ list(APPEND COMPILER_SRCS ${VERILATOR_RELAY_CONTRIB_SRC})
+ list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC})
+ find_library(EXTERN_LIBRARY_VERILATOR NAMES verilator PATHS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw/apps/verilator)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_LIBRARY_VERILATOR})
+ file(GLOB VERILATOR_CONTRIB_SRC src/runtime/contrib/verilator/verilator_runtime.cc)
+ list(APPEND RUNTIME_SRCS ${VERILATOR_CONTRIB_SRC})
+endif()
+
diff --git a/cmake/modules/contrib/VitisAI.cmake b/cmake/modules/contrib/VitisAI.cmake
new file mode 100644
index 000000000000..083bd6d7adc8
--- /dev/null
+++ b/cmake/modules/contrib/VitisAI.cmake
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(USE_VITIS_AI)
+ set(PYXIR_SHARED_LIB libpyxir.so)
+ find_package(PythonInterp 3.6 REQUIRED)
+ if(NOT PYTHON)
+ find_program(PYTHON NAMES python3 python3.6)
+ endif()
+ execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+ "import pyxir as px; print(px.get_include_dir()); print(px.get_lib_dir());"
+ RESULT_VARIABLE __result
+ OUTPUT_VARIABLE __output
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ if(__result MATCHES 0)
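+ # The script printed two lines (include dir, then lib dir); escape any
+ # semicolons, then turn the newlines into list separators for CMake.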
+ string(REGEX REPLACE ";" "\\\\;" __values ${__output})
+ string(REGEX REPLACE "\r?\n" ";" __values ${__values})
+ list(GET __values 0 PYXIR_INCLUDE_DIR)
+ list(GET __values 1 PYXIR_LIB_DIR)
+ else()
+ message(FATAL_ERROR "Can't build TVM with Vitis-AI because PyXIR can't be found")
+ endif()
+ message(STATUS "Build with contrib.vitisai")
+ include_directories(${PYXIR_INCLUDE_DIR})
+ file(GLOB VAI_CONTRIB_SRC src/runtime/contrib/vitis_ai/*.cc)
+ file(GLOB COMPILER_VITIS_AI_SRCS
+ CONFIGURE_DEPENDS src/relay/backend/contrib/vitis_ai/*)
+ list(APPEND COMPILER_SRCS ${COMPILER_VITIS_AI_SRCS})
+ link_directories(${PYXIR_LIB_DIR})
+ list(APPEND TVM_RUNTIME_LINKER_LIBS "pyxir")
+ list(APPEND RUNTIME_SRCS ${VAI_CONTRIB_SRC})
+endif(USE_VITIS_AI)
diff --git a/cmake/util/FindCUDA.cmake b/cmake/utils/FindCUDA.cmake
similarity index 93%
rename from cmake/util/FindCUDA.cmake
rename to cmake/utils/FindCUDA.cmake
index f7d9b5ed6d08..c95f8ce722f4 100644
--- a/cmake/util/FindCUDA.cmake
+++ b/cmake/utils/FindCUDA.cmake
@@ -87,15 +87,20 @@ macro(find_cuda use_cuda)
NO_DEFAULT_PATH)
find_library(CUDA_CUDNN_LIBRARY cudnn
${CUDA_TOOLKIT_ROOT_DIR}/lib64
- ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib
+ NO_DEFAULT_PATH)
+ # fall back to the default search path if cudnn is not found in the non-default locations
+ find_library(CUDA_CUDNN_LIBRARY cudnn)
find_library(CUDA_CUBLAS_LIBRARY cublas
${CUDA_TOOLKIT_ROOT_DIR}/lib64
- ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib
+ NO_DEFAULT_PATH)
find_library(CUDA_CUBLASLT_LIBRARY
NAMES cublaslt cublasLt
PATHS
${CUDA_TOOLKIT_ROOT_DIR}/lib64
- ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib
+ NO_DEFAULT_PATH)
endif(MSVC)
message(STATUS "Found CUDA_TOOLKIT_ROOT_DIR=" ${CUDA_TOOLKIT_ROOT_DIR})
message(STATUS "Found CUDA_CUDA_LIBRARY=" ${CUDA_CUDA_LIBRARY})
diff --git a/cmake/util/FindEthosN.cmake b/cmake/utils/FindEthosN.cmake
similarity index 100%
rename from cmake/util/FindEthosN.cmake
rename to cmake/utils/FindEthosN.cmake
diff --git a/cmake/util/FindLLVM.cmake b/cmake/utils/FindLLVM.cmake
similarity index 100%
rename from cmake/util/FindLLVM.cmake
rename to cmake/utils/FindLLVM.cmake
diff --git a/cmake/util/FindOpenCL.cmake b/cmake/utils/FindOpenCL.cmake
similarity index 100%
rename from cmake/util/FindOpenCL.cmake
rename to cmake/utils/FindOpenCL.cmake
diff --git a/cmake/util/FindROCM.cmake b/cmake/utils/FindROCM.cmake
similarity index 100%
rename from cmake/util/FindROCM.cmake
rename to cmake/utils/FindROCM.cmake
diff --git a/cmake/util/FindVulkan.cmake b/cmake/utils/FindVulkan.cmake
similarity index 100%
rename from cmake/util/FindVulkan.cmake
rename to cmake/utils/FindVulkan.cmake
diff --git a/cmake/util/Util.cmake b/cmake/utils/Utils.cmake
similarity index 100%
rename from cmake/util/Util.cmake
rename to cmake/utils/Utils.cmake
diff --git a/conda/Dockerfile.template b/conda/Dockerfile.template
index 1b5dc6fbef5e..342d532bbff5 100644
--- a/conda/Dockerfile.template
+++ b/conda/Dockerfile.template
@@ -17,30 +17,16 @@
FROM nvidia/cuda:{{ cuda_version }}-devel-ubuntu16.04
-RUN apt-get update && apt-get install -y --no-install-recommends \
- bzip2 curl sudo binutils && \
- rm -rf /var/lib/apt/lists/*
+RUN apt-get update --fix-missing && apt-get install -y bzip2 wget sudo binutils git
-RUN curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v{{ cudnn_short_version }}/cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz -O && \
+RUN wget -q http://developer.download.nvidia.com/compute/redist/cudnn/v{{ cudnn_short_version }}/cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz && \
tar --no-same-owner -xzf cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz -C /usr/local && \
rm cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz && \
ldconfig
-
-RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/miniconda.sh && \
- ~/miniconda.sh -b -p /opt/conda && \
- rm ~/miniconda.sh && \
- /opt/conda/bin/conda upgrade --all && \
- /opt/conda/bin/conda install conda-build conda-verify && \
- /opt/conda/bin/conda clean -ya
-
-RUN /opt/conda/bin/conda install --download-only cmake make zlib
-RUN /opt/conda/bin/conda install --download-only -c numba llvmdev=8.0.0
+COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh
+RUN bash /install/ubuntu_install_conda.sh
ENV PATH /opt/conda/bin:$PATH
ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV CONDA_BLD_PATH /tmp
-
-WORKDIR /workspace
-RUN chmod -R a+w /workspace
diff --git a/conda/build-environment.yaml b/conda/build-environment.yaml
new file mode 100644
index 000000000000..31b39bfafcd0
--- /dev/null
+++ b/conda/build-environment.yaml
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Build environment that can be used to build tvm.
+name: tvm-build
+
+# The conda channels to lookup the dependencies
+channels:
+ - anaconda
+ - conda-forge
+
+# The packages to install to the environment
+dependencies:
+ - conda-build
+ - git
+ - llvmdev ==10.0.0
+ - numpy
+ - pytest
+ - cython
+ - cmake
+ - bzip2
+ - make
+ - scipy
diff --git a/conda/build_cpu.sh b/conda/build_cpu.sh
index 992b1a369b96..48b93b23dc0f 100755
--- a/conda/build_cpu.sh
+++ b/conda/build_cpu.sh
@@ -26,6 +26,4 @@ mkdir -p /tmp/.conda/pkgs
touch /tmp/.conda/pkgs/urls.txt
touch /tmp/.conda/environments.txt
-
-conda build --output-folder=conda/pkg -c numba conda/tvm-libs
-conda build --output-folder=conda/pkg -m conda/conda_build_config.yaml conda/tvm
+conda build --output-folder=conda/pkg conda/recipe
diff --git a/conda/build_cuda.sh b/conda/build_cuda.sh
index 2c9a20ae66ae..ec4a144852b7 100755
--- a/conda/build_cuda.sh
+++ b/conda/build_cuda.sh
@@ -26,5 +26,4 @@ mkdir -p /tmp/.conda/pkgs
touch /tmp/.conda/pkgs/urls.txt
touch /tmp/.conda/environments.txt
-
-conda build --output-folder=conda/pkg --variants "{cuda: True, cuda_version: ${CUDA_VERSION%.*}}" -c numba conda/tvm-libs
+conda build --output-folder=conda/pkg --variants "{cuda: True, cuda_version: ${CUDA_VERSION%.*}}" conda/recipe
diff --git a/conda/build_win.bat b/conda/build_win.bat
new file mode 100644
index 000000000000..59d0d07340c7
--- /dev/null
+++ b/conda/build_win.bat
@@ -0,0 +1,18 @@
+:: Licensed to the Apache Software Foundation (ASF) under one
+:: or more contributor license agreements. See the NOTICE file
+:: distributed with this work for additional information
+:: regarding copyright ownership. The ASF licenses this file
+:: to you under the Apache License, Version 2.0 (the
+:: "License"); you may not use this file except in compliance
+:: with the License. You may obtain a copy of the License at
+::
+:: http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing,
+:: software distributed under the License is distributed on an
+:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+:: KIND, either express or implied. See the License for the
+:: specific language governing permissions and limitations
+:: under the License.
+
+conda build --output-folder=conda/pkg conda/recipe
diff --git a/conda/recipe/bld.bat b/conda/recipe/bld.bat
new file mode 100644
index 000000000000..9fc0469febc6
--- /dev/null
+++ b/conda/recipe/bld.bat
@@ -0,0 +1,38 @@
+:: Licensed to the Apache Software Foundation (ASF) under one
+:: or more contributor license agreements. See the NOTICE file
+:: distributed with this work for additional information
+:: regarding copyright ownership. The ASF licenses this file
+:: to you under the Apache License, Version 2.0 (the
+:: "License"); you may not use this file except in compliance
+:: with the License. You may obtain a copy of the License at
+::
+:: http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing,
+:: software distributed under the License is distributed on an
+:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+:: KIND, either express or implied. See the License for the
+:: specific language governing permissions and limitations
+:: under the License.
+echo on
+
+rd /s /q build
+mkdir build
+cd build
+
+cmake ^
+ -DCMAKE_PREFIX_PATH=%LIBRARY_PREFIX% ^
+ -DCMAKE_INSTALL_PREFIX:PATH=%LIBRARY_PREFIX% ^
+ -DUSE_LLVM=ON ^
+ -DUSE_RPC=ON ^
+ -DUSE_CPP_RPC=ON ^
+ -DUSE_SORT=ON ^
+ -DUSE_RANDOM=ON ^
+ -DUSE_GRAPH_RUNTIME_DEBUG=ON ^
+ -DINSTALL_DEV=ON ^
+ %SRC_DIR%
+
+cd ..
+:: defer build to install stage to avoid rebuild.
+:: sometimes windows msbuild is not very good at file
+:: caching and install would result in a rebuild
diff --git a/conda/tvm-libs/build.sh b/conda/recipe/build.sh
old mode 100644
new mode 100755
similarity index 63%
rename from conda/tvm-libs/build.sh
rename to conda/recipe/build.sh
index 94919c60e779..c9e76314da31
--- a/conda/tvm-libs/build.sh
+++ b/conda/recipe/build.sh
@@ -6,9 +6,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,35 +19,41 @@
set -e
set -u
+GPU_OPT=""
+TOOLCHAIN_OPT=""
+
if [ "$target_platform" == "osx-64" ]; then
# macOS 64 bits
- METAL_OPT="-DUSE_METAL=ON"
- TOOLCHAIN_OPT="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.11"
-else
- METAL_OPT=""
- if [ "$target_platform" == "linux-64" ]; then
- # Linux 64 bits
- TOOLCHAIN_OPT="-DCMAKE_TOOLCHAIN_FILE=${RECIPE_DIR}/../cross-linux.cmake"
- else
- # Windows (or 32 bits, which we don't support)
- TOOLCHAIN_OPT=""
- fi
+ GPU_OPT="-DUSE_METAL=ON"
+elif [ "$target_platform" == "linux-64" ]; then
+ TOOLCHAIN_OPT="-DCMAKE_TOOLCHAIN_FILE=${RECIPE_DIR}/cross-linux.cmake"
fi
# When cuda is not set, we default to False
cuda=${cuda:-False}
if [ "$cuda" == "True" ]; then
- CUDA_OPT="-DUSE_CUDA=ON -DUSE_CUBLAS=ON -DUSE_CUDNN=ON"
+ GPU_OPT="-DUSE_CUDA=ON -DUSE_CUBLAS=ON -DUSE_CUDNN=ON"
TOOLCHAIN_OPT=""
-else
- CUDA_OPT=""
fi
+# remove any stale config.cmake left over from previous builds
+rm -f config.cmake
rm -rf build || true
mkdir -p build
cd build
-cmake $METAL_OPT $CUDA_OPT -DUSE_LLVM=$PREFIX/bin/llvm-config -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" $TOOLCHAIN_OPT ..
-make -j${CPU_COUNT} VERBOSE=1
-make install
+
+cmake -DCMAKE_INSTALL_PREFIX="${PREFIX}" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DUSE_RPC=ON \
+ -DUSE_CPP_RPC=OFF \
+ -DUSE_SORT=ON \
+ -DUSE_RANDOM=ON \
+ -DUSE_GRAPH_RUNTIME_DEBUG=ON \
+ -DUSE_LLVM=ON \
+ -DINSTALL_DEV=ON \
+ ${GPU_OPT} ${TOOLCHAIN_OPT} \
+ ${SRC_DIR}
+
+make -j${CPU_COUNT}
cd ..
diff --git a/conda/conda_build_config.yaml b/conda/recipe/conda_build_config.yaml
similarity index 99%
rename from conda/conda_build_config.yaml
rename to conda/recipe/conda_build_config.yaml
index 79d6bfe3c175..938d294da556 100644
--- a/conda/conda_build_config.yaml
+++ b/conda/recipe/conda_build_config.yaml
@@ -16,9 +16,9 @@
# under the License.
python:
- - 3.5
- 3.6
- 3.7
+ - 3.8
cuda:
- False
diff --git a/conda/cross-linux.cmake b/conda/recipe/cross-linux.cmake
similarity index 100%
rename from conda/cross-linux.cmake
rename to conda/recipe/cross-linux.cmake
diff --git a/conda/recipe/install_libtvm.bat b/conda/recipe/install_libtvm.bat
new file mode 100644
index 000000000000..f423c521f84e
--- /dev/null
+++ b/conda/recipe/install_libtvm.bat
@@ -0,0 +1,22 @@
+:: Licensed to the Apache Software Foundation (ASF) under one
+:: or more contributor license agreements. See the NOTICE file
+:: distributed with this work for additional information
+:: regarding copyright ownership. The ASF licenses this file
+:: to you under the Apache License, Version 2.0 (the
+:: "License"); you may not use this file except in compliance
+:: with the License. You may obtain a copy of the License at
+::
+:: http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing,
+:: software distributed under the License is distributed on an
+:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+:: KIND, either express or implied. See the License for the
+:: specific language governing permissions and limitations
+:: under the License.
+
+cmake --build build --config Release --target install
+
+:: Copy files into library bin so that they can be found
+cp %LIBRARY_LIB%\tvm.dll %LIBRARY_BIN%\tvm.dll
+cp %LIBRARY_LIB%\tvm_runtime.dll %LIBRARY_BIN%\tvm_runtime.dll
diff --git a/conda/tvm/build.sh b/conda/recipe/install_libtvm.sh
old mode 100644
new mode 100755
similarity index 88%
rename from conda/tvm/build.sh
rename to conda/recipe/install_libtvm.sh
index 9bdbe0a6f509..b236c7dc2720
--- a/conda/tvm/build.sh
+++ b/conda/recipe/install_libtvm.sh
@@ -19,6 +19,5 @@
set -e
set -u
-cd python
-$PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt
-cd ..
+cd build
+make install
diff --git a/conda/recipe/install_tvm_python.bat b/conda/recipe/install_tvm_python.bat
new file mode 100644
index 000000000000..96187468c2b2
--- /dev/null
+++ b/conda/recipe/install_tvm_python.bat
@@ -0,0 +1,20 @@
+:: Licensed to the Apache Software Foundation (ASF) under one
+:: or more contributor license agreements. See the NOTICE file
+:: distributed with this work for additional information
+:: regarding copyright ownership. The ASF licenses this file
+:: to you under the Apache License, Version 2.0 (the
+:: "License"); you may not use this file except in compliance
+:: with the License. You may obtain a copy of the License at
+::
+:: http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing,
+:: software distributed under the License is distributed on an
+:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+:: KIND, either express or implied. See the License for the
+:: specific language governing permissions and limitations
+:: under the License.
+echo on
+
+cd %SRC_DIR%\python
+%PYTHON% setup.py install --single-version-externally-managed --record=%SRC_DIR%\record.txt
diff --git a/conda/recipe/install_tvm_python.sh b/conda/recipe/install_tvm_python.sh
new file mode 100755
index 000000000000..2c721c64a156
--- /dev/null
+++ b/conda/recipe/install_tvm_python.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+
+cd ${SRC_DIR}/python
+${PYTHON} setup.py install --single-version-externally-managed --record=/tmp/record.txt
diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
new file mode 100644
index 000000000000..0113850a6602
--- /dev/null
+++ b/conda/recipe/meta.yaml
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+{% set version = '0.8.dev0' %}
+{% set pkg_name = 'tvm' %}
+{% set cuda_tag = cuda_version | replace('.', '') %} # [cuda]
+{% set pkg_name = pkg_name + '-cu' + cuda_tag %} # [cuda]
+{% set build_tag = environ.get('GIT_BUILD_STR', 'unknown') %}
+{% set build_tag = build_tag + '_h' + PKG_HASH + '_' + PKG_BUILDNUM %}
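+# NOTE: the trailing "# [cuda]" markers above are conda-build selectors;
+# those lines only take effect when the cuda variant is being built.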
+
+package:
+ name: {{ pkg_name }}-package
+ version: {{ version }}
+
+source:
+ path: '../..'
+
+build:
+ number: 0
+ include_recipe: False
+ missing_dso_whitelist:
+ - "*libcuda.*" # [linux]
+
+requirements:
+ build:
+ # The anaconda compilers for OS X are old and annoying
+ # so we rely on the platform ones for now
+ - {{ compiler('cxx') }} # [not osx]
+ - cmake
+ - make # [not win]
+ host:
+ - zlib
+ - llvmdev ==10.0.0
+
+outputs:
+ - name: {{ pkg_name }}-libs
+ script: install_libtvm.bat # [win]
+ script: install_libtvm.sh # [not win]
+ string: {{ build_tag }}
+ requirements:
+ build:
+ - {{ compiler('cxx') }}
+ - cmake
+ - git
+ - make # [not win]
+ host:
+ - zlib
+ - llvmdev ==10.0.0
+ - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }} # [cuda]
+ - {{ pin_compatible('cudnn', lower_bound='7.6.0', max_pin='x') }} # [cuda]
+ run:
+ - llvmdev ==10.0.0
+ - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }} # [cuda]
+ - {{ pin_compatible('cudnn', lower_bound='7.6.0', max_pin='x') }} # [cuda]
+
+ - name: {{ pkg_name }}
+ script: install_tvm_python.sh # [not win]
+ script: install_tvm_python.bat # [win]
+ string: {{ build_tag }}_py{{ PY_VER | replace('.', '')}}
+ # skip pyc byte-compilation to speed up CI
+ skip_compile_pyc:
+ - "*/**/*.py"
+ requirements:
+ host:
+ - python
+ - setuptools
+ - cython
+ - {{ pin_subpackage(pkg_name + '-libs', exact=True) }}
+ run:
+ - python
+ - decorator
+ - psutil
+ - scipy
+ - {{ pin_compatible('numpy') }}
+ - {{ pin_subpackage(pkg_name + '-libs', exact=True) }}
+
+about:
+ home: https://tvm.apache.org
+ license: Apache-2.0
+ summary: An End to End Deep Learning Compiler Stack for CPUs, GPUs and accelerators.
diff --git a/conda/render_cuda.py b/conda/render_cuda_dockerfiles.py
similarity index 98%
rename from conda/render_cuda.py
rename to conda/render_cuda_dockerfiles.py
index efd616946314..d9d32f05fb5e 100644
--- a/conda/render_cuda.py
+++ b/conda/render_cuda_dockerfiles.py
@@ -48,7 +48,7 @@ def render_dockerfile(version):
)
fname = os.path.join(condadir, "../docker/Dockerfile.conda_cuda" + version.replace(".", ""))
with open(fname, "w") as f:
- f.write(txt)
+ f.write(txt + "\n")
return fname
diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml
deleted file mode 100644
index f151048e445b..000000000000
--- a/conda/tvm-libs/meta.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-{% set version = "0.8.dev0" %}
-
-package:
- name: tvm-libs
- version: {{ version }}
-
-source:
- path: ../..
-
-build:
- number: 0
- string: cuda{{ cuda_version | replace('.', '') }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda]
-
-requirements:
- build:
- # The anaconda compilers for OS X are old an annoying
- # so we rely on the platform ones for now
- - {{ compiler('cxx') }} # [linux]
- - cmake
- - make
- host:
- - llvmdev ==8.0.0
- - zlib # [linux]
- run:
- - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }} # [cuda]
- - {{ pin_compatible('cudnn', lower_bound='7.6.0', max_pin='x') }} # [cuda]
-
-about:
- home: https://github.com/apache/incubator-tvm
- license: Apache2
- summary: a low level domain specific language for compiling tensor computation pipelines
\ No newline at end of file
diff --git a/dmlc_tvm_commit_id.txt b/dmlc_tvm_commit_id.txt
index 3476b5d864e2..7b294e50bf70 100644
--- a/dmlc_tvm_commit_id.txt
+++ b/dmlc_tvm_commit_id.txt
@@ -1 +1 @@
-7d805b54d6adda82636d13bf7c46a2e9a933da5f
\ No newline at end of file
+9554e645922357af1d11679a102f3763b80b740f
\ No newline at end of file
diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm
new file mode 100644
index 000000000000..f5b2c2af0fbf
--- /dev/null
+++ b/docker/Dockerfile.ci_arm
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# CI docker arm env
+# tag: v0.10
+
+FROM ubuntu:18.04
+
+RUN apt-get update --fix-missing
+RUN apt-get install -y ca-certificates gnupg2
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
+RUN bash /install/ubuntu_install_llvm.sh
+
+COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
+RUN bash /install/ubuntu1804_install_python.sh
+
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
+COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
+RUN bash /install/ubuntu_install_python_package.sh
+
+# AutoTVM deps
+COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
+RUN bash /install/ubuntu_install_redis.sh
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 4823488a731a..a3805660b2b1 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -36,6 +36,10 @@ RUN bash /install/ubuntu1804_install_llvm.sh
COPY install/ubuntu_install_dnnl.sh /install/ubuntu_install_dnnl.sh
RUN bash /install/ubuntu_install_dnnl.sh
+# Install MxNet for access to the Gluon Model Zoo.
+COPY install/ubuntu_install_mxnet.sh /install/ubuntu_install_mxnet.sh
+RUN bash /install/ubuntu_install_mxnet.sh
+
# Rust env (build early; takes a while)
COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
RUN bash /install/ubuntu_install_rust.sh
@@ -60,9 +64,17 @@ ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
RUN bash /install/ubuntu_install_java.sh
+# BYODT deps
+COPY install/ubuntu_install_universal.sh /install/ubuntu_install_universal.sh
+RUN bash /install/ubuntu_install_universal.sh
+
# Chisel deps for TSIM
-COPY install/ubuntu_install_chisel.sh /install/ubuntu_install_chisel.sh
-RUN bash /install/ubuntu_install_chisel.sh
+COPY install/ubuntu_install_sbt.sh /install/ubuntu_install_sbt.sh
+RUN bash /install/ubuntu_install_sbt.sh
+
+# Verilator deps
+COPY install/ubuntu_install_verilator.sh /install/ubuntu_install_verilator.sh
+RUN bash /install/ubuntu_install_verilator.sh
# TFLite deps
COPY install/ubuntu_install_tflite.sh /install/ubuntu_install_tflite.sh
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 1197d8e4c7b6..ac76af6b0a1e 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -83,6 +83,13 @@ RUN bash /install/ubuntu_install_dgl.sh
COPY install/ubuntu_install_vulkan.sh /install/ubuntu_install_vulkan.sh
RUN bash /install/ubuntu_install_vulkan.sh
+# Rust env (build early; takes a while)
+COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
+RUN bash /install/ubuntu_install_rust.sh
+ENV RUSTUP_HOME /opt/rust
+ENV CARGO_HOME /opt/rust
+ENV PATH $PATH:$CARGO_HOME/bin
+
# AutoTVM deps
COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
RUN bash /install/ubuntu_install_redis.sh
@@ -91,6 +98,10 @@ RUN bash /install/ubuntu_install_redis.sh
COPY install/ubuntu_install_nnpack.sh /install/ubuntu_install_nnpack.sh
RUN bash /install/ubuntu_install_nnpack.sh
+# BYODT deps
+COPY install/ubuntu_install_universal.sh /install/ubuntu_install_universal.sh
+RUN bash /install/ubuntu_install_universal.sh
+
# Environment variables
ENV PATH=/usr/local/nvidia/bin:${PATH}
ENV PATH=/usr/local/cuda/bin:${PATH}
diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386
index a7d8308d4810..2cdf10c4369e 100644
--- a/docker/Dockerfile.ci_i386
+++ b/docker/Dockerfile.ci_i386
@@ -21,6 +21,7 @@
FROM ioft/i386-ubuntu:16.04
RUN apt-get update --fix-missing
+RUN apt-get install -y ca-certificates
COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
RUN bash /install/ubuntu_install_core.sh
@@ -31,6 +32,9 @@ RUN bash /install/ubuntu_install_llvm.sh
COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
RUN bash /install/ubuntu_install_python.sh
+COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh
+RUN bash /install/ubuntu_install_cmake_source.sh
+
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
RUN bash /install/ubuntu_install_python_package.sh
@@ -39,5 +43,9 @@ COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
RUN bash /install/ubuntu_install_redis.sh
# Chisel deps for TSIM
-COPY install/ubuntu_install_chisel.sh /install/ubuntu_install_chisel.sh
-RUN bash /install/ubuntu_install_chisel.sh
+COPY install/ubuntu_install_sbt.sh /install/ubuntu_install_sbt.sh
+RUN bash /install/ubuntu_install_sbt.sh
+
+# Verilator deps
+COPY install/ubuntu_install_verilator.sh /install/ubuntu_install_verilator.sh
+RUN bash /install/ubuntu_install_verilator.sh
diff --git a/docker/Dockerfile.conda_cpu b/docker/Dockerfile.conda_cpu
index 4e0c35a26e55..d2779afbdaf3 100644
--- a/docker/Dockerfile.conda_cpu
+++ b/docker/Dockerfile.conda_cpu
@@ -17,25 +17,12 @@
FROM ubuntu:16.04
-RUN apt-get update && apt-get install -y bzip2 curl sudo binutils && rm -rf /var/lib/apt/lists/*
+RUN apt-get update --fix-missing && apt-get install -y bzip2 wget sudo binutils git
-RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/miniconda.sh && \
- ~/miniconda.sh -b -p /opt/conda && \
- rm ~/miniconda.sh && \
- /opt/conda/bin/conda upgrade --all && \
- /opt/conda/bin/conda install conda-build conda-verify && \
- /opt/conda/bin/conda clean -ya
-
-# Cache some of the packages for the builds
-RUN /opt/conda/bin/conda install --download-only cmake make zlib && \
- /opt/conda/bin/conda install --download-only -c numba llvmdev=8.0.0 && \
- /opt/conda/bin/conda create -n py35 --download-only pytest scipy numpy=1.11 cython decorator python=3.5 && \
- /opt/conda/bin/conda create -n py36 --download-only pytest scipy numpy=1.11 cython decorator python=3.6 && \
- /opt/conda/bin/conda create -n py37 --download-only pytest scipy numpy=1.11 cython decorator python=3.7
+COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh
+RUN bash /install/ubuntu_install_conda.sh
ENV PATH /opt/conda/bin:$PATH
ENV CONDA_BLD_PATH /tmp
-
-WORKDIR /workspace
-RUN chmod -R a+w /workspace
+ENV CONDA_PKGS_DIRS /workspace/.conda/pkgs
+ENV CONDA_ENVS_DIRS /workspace/.conda/env
diff --git a/docker/Dockerfile.conda_cuda100 b/docker/Dockerfile.conda_cuda100
index d6e1cddbfd37..7705c8548b52 100644
--- a/docker/Dockerfile.conda_cuda100
+++ b/docker/Dockerfile.conda_cuda100
@@ -17,30 +17,16 @@
FROM nvidia/cuda:10.0-devel-ubuntu16.04
-RUN apt-get update && apt-get install -y --no-install-recommends \
- bzip2 curl sudo binutils && \
- rm -rf /var/lib/apt/lists/*
+RUN apt-get update --fix-missing && apt-get install -y bzip2 wget sudo binutils git
-RUN curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.0/cudnn-10.0-linux-x64-v7.6.0.64.tgz -O && \
+RUN wget -q http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.0/cudnn-10.0-linux-x64-v7.6.0.64.tgz && \
tar --no-same-owner -xzf cudnn-10.0-linux-x64-v7.6.0.64.tgz -C /usr/local && \
rm cudnn-10.0-linux-x64-v7.6.0.64.tgz && \
ldconfig
-
-RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/miniconda.sh && \
- ~/miniconda.sh -b -p /opt/conda && \
- rm ~/miniconda.sh && \
- /opt/conda/bin/conda upgrade --all && \
- /opt/conda/bin/conda install conda-build conda-verify && \
- /opt/conda/bin/conda clean -ya
-
-RUN /opt/conda/bin/conda install --download-only cmake make zlib
-RUN /opt/conda/bin/conda install --download-only -c numba llvmdev=8.0.0
+COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh
+RUN bash /install/ubuntu_install_conda.sh
ENV PATH /opt/conda/bin:$PATH
ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV CONDA_BLD_PATH /tmp
-
-WORKDIR /workspace
-RUN chmod -R a+w /workspace
\ No newline at end of file
diff --git a/docker/Dockerfile.conda_cuda90 b/docker/Dockerfile.conda_cuda90
index f55aa1bf2e12..372167438141 100644
--- a/docker/Dockerfile.conda_cuda90
+++ b/docker/Dockerfile.conda_cuda90
@@ -17,30 +17,16 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
-RUN apt-get update && apt-get install -y --no-install-recommends \
- bzip2 curl sudo binutils && \
- rm -rf /var/lib/apt/lists/*
+RUN apt-get update --fix-missing && apt-get install -y bzip2 wget sudo binutils git
-RUN curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.0/cudnn-9.0-linux-x64-v7.6.0.64.tgz -O && \
+RUN wget -q http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.0/cudnn-9.0-linux-x64-v7.6.0.64.tgz && \
tar --no-same-owner -xzf cudnn-9.0-linux-x64-v7.6.0.64.tgz -C /usr/local && \
rm cudnn-9.0-linux-x64-v7.6.0.64.tgz && \
ldconfig
-
-RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/miniconda.sh && \
- ~/miniconda.sh -b -p /opt/conda && \
- rm ~/miniconda.sh && \
- /opt/conda/bin/conda upgrade --all && \
- /opt/conda/bin/conda install conda-build conda-verify && \
- /opt/conda/bin/conda clean -ya
-
-RUN /opt/conda/bin/conda install --download-only cmake make zlib
-RUN /opt/conda/bin/conda install --download-only -c numba llvmdev=8.0.0
+COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh
+RUN bash /install/ubuntu_install_conda.sh
ENV PATH /opt/conda/bin:$PATH
ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV CONDA_BLD_PATH /tmp
-
-WORKDIR /workspace
-RUN chmod -R a+w /workspace
\ No newline at end of file
diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android
index cf13daa9734e..039439a937e9 100644
--- a/docker/Dockerfile.demo_android
+++ b/docker/Dockerfile.demo_android
@@ -53,7 +53,7 @@ RUN git clone https://github.com/KhronosGroup/OpenCL-Headers /usr/local/OpenCL-H
# Build TVM
RUN cd /usr && \
- git clone --depth=1 https://github.com/apache/incubator-tvm tvm --recursive && \
+ git clone --depth=1 https://github.com/apache/tvm tvm --recursive && \
cd /usr/tvm && \
mkdir -p build && \
cd build && \
diff --git a/docker/Dockerfile.demo_opencl b/docker/Dockerfile.demo_opencl
index e39ee4128c96..2f534d8b5b5c 100644
--- a/docker/Dockerfile.demo_opencl
+++ b/docker/Dockerfile.demo_opencl
@@ -62,7 +62,7 @@ RUN echo "Cloning TVM source & submodules"
ENV TVM_PAR_DIR="/usr"
RUN mkdir -p TVM_PAR_DIR && \
cd ${TVM_PAR_DIR} && \
- git clone --depth=1 https://github.com/apache/incubator-tvm tvm --recursive
+ git clone --depth=1 https://github.com/apache/tvm tvm --recursive
#RUN git submodule update --init --recursive
diff --git a/docker/README.md b/docker/README.md
index dffaf3a5ba4f..ae972f954668 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -52,7 +52,7 @@ Then inside the docker container, you can type the following command to start th
jupyter notebook
```
-You can find some un-official prebuilt images in https://hub.docker.com/r/tvmai/ .
+You can find some unofficial prebuilt images at https://hub.docker.com/r/tlcpack/ .
Note that these are convenience images and are not part of the ASF release.
diff --git a/docker/bash.sh b/docker/bash.sh
index d2424f170219..7420e6f9024c 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -70,7 +70,7 @@ else
CUDA_ENV=""
fi
-if [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* ]]; then
+if [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* || "${DOCKER_IMAGE_NAME}" == *"cuda"* ]]; then
if ! type "nvidia-docker" 1> /dev/null 2> /dev/null
then
DOCKER_BINARY="docker"
@@ -83,9 +83,9 @@ else
fi
if [[ "${DOCKER_IMAGE_NAME}" == *"ci"* ]]; then
- CI_PY_ENV="-e PYTHONPATH=/workspace/python"
+ CI_ADDON_ENV="-e PYTHONPATH=/workspace/python"
else
- CI_PY_ENV=""
+ CI_ADDON_ENV=""
fi
# If the Vitis-AI docker image is selected, expose the Xilinx FPGA devices and required volumes containing e.g. DSA's and overlays
@@ -143,7 +143,8 @@ ${DOCKER_BINARY} run --rm --pid=host\
-e "CI_BUILD_GROUP=$(id -g -n)" \
-e "CI_BUILD_GID=$(id -g)" \
-e "CI_PYTEST_ADD_OPTIONS=$CI_PYTEST_ADD_OPTIONS" \
- ${CI_PY_ENV} \
+ -e "CI_IMAGE_NAME=${DOCKER_IMAGE_NAME}" \
+ ${CI_ADDON_ENV} \
${CUDA_ENV} \
"${CI_DOCKER_EXTRA_PARAMS[@]}" \
${DOCKER_IMAGE_NAME} \
diff --git a/docker/build.sh b/docker/build.sh
index 43f0a08700a4..bd13937b2571 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -91,7 +91,7 @@ if [ "$#" -lt 1 ] || [ ! -e "${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" ]; then
fi
# Use nvidia-docker if the container is GPU.
-if [[ "${CONTAINER_TYPE}" == *"gpu"* ]]; then
+if [[ "${CONTAINER_TYPE}" == *"gpu"* || "${CONTAINER_TYPE}" == *"cuda"* ]]; then
if ! type "nvidia-docker" 1> /dev/null 2> /dev/null
then
DOCKER_BINARY="docker"
@@ -164,6 +164,7 @@ ${DOCKER_BINARY} run --rm --pid=host \
-e "CI_BUILD_GROUP=$(id -g -n)" \
-e "CI_BUILD_GID=$(id -g)" \
-e "CI_PYTEST_ADD_OPTIONS=$CI_PYTEST_ADD_OPTIONS" \
+ -e "CI_IMAGE_NAME=${DOCKER_IMAGE_NAME}" \
${CUDA_ENV}\
${CI_DOCKER_EXTRA_PARAMS[@]} \
${DOCKER_IMG_NAME} \
diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh
index b11c9791fb2d..c3a15fa26b6d 100755
--- a/docker/install/install_tvm_cpu.sh
+++ b/docker/install/install_tvm_cpu.sh
@@ -21,7 +21,7 @@ set -u
set -o pipefail
cd /usr
-git clone https://github.com/apache/incubator-tvm tvm --recursive
+git clone https://github.com/apache/tvm tvm --recursive
cd /usr/tvm
# checkout a hash-tag
git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470
diff --git a/docker/install/install_tvm_gpu.sh b/docker/install/install_tvm_gpu.sh
index 2dbf8e17398d..fe2214da8409 100755
--- a/docker/install/install_tvm_gpu.sh
+++ b/docker/install/install_tvm_gpu.sh
@@ -21,7 +21,7 @@ set -u
set -o pipefail
cd /usr
-git clone https://github.com/apache/incubator-tvm tvm --recursive
+git clone https://github.com/apache/tvm tvm --recursive
cd /usr/tvm
# checkout a hash-tag
git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470
diff --git a/docker/install/ubuntu_install_cmake_source.sh b/docker/install/ubuntu_install_cmake_source.sh
new file mode 100644
index 000000000000..f818fba9721b
--- /dev/null
+++ b/docker/install/ubuntu_install_cmake_source.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+v=3.13
+version=3.13.5
+wget https://cmake.org/files/v${v}/cmake-${version}.tar.gz
+tar xvf cmake-${version}.tar.gz
+cd cmake-${version}
+./bootstrap
+make -j$(nproc)
+make install
+cd ..
+rm -rf cmake-${version} cmake-${version}.tar.gz
diff --git a/docker/install/ubuntu_install_conda.sh b/docker/install/ubuntu_install_conda.sh
new file mode 100755
index 000000000000..6f6019340293
--- /dev/null
+++ b/docker/install/ubuntu_install_conda.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+cd /tmp && wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+chmod +x Miniconda3-latest-Linux-x86_64.sh
+/tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda
+rm /tmp/Miniconda3-latest-Linux-x86_64.sh
+/opt/conda/bin/conda upgrade --all
+/opt/conda/bin/conda clean -ya
+/opt/conda/bin/conda install conda-build conda-verify
+chmod -R a+w /opt/conda/
diff --git a/docker/install/ubuntu_install_darknet.sh b/docker/install/ubuntu_install_darknet.sh
index c48724c6065b..37adf4a30270 100755
--- a/docker/install/ubuntu_install_darknet.sh
+++ b/docker/install/ubuntu_install_darknet.sh
@@ -6,9 +6,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -23,7 +23,4 @@ set -o pipefail
#install the necessary dependancies, cffi, opencv
wget -q 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so
debian_version=`cat /etc/debian_version`
-if [ "$debian_version" == "stretch/sid" ]; then
- pip2 install opencv-python cffi
-fi
pip3 install opencv-python cffi
diff --git a/docker/install/ubuntu_install_dgl.sh b/docker/install/ubuntu_install_dgl.sh
old mode 100644
new mode 100755
diff --git a/docker/install/ubuntu_install_emscripten.sh b/docker/install/ubuntu_install_emscripten.sh
index 2e48cccbe2a6..fa44e1c70f1d 100755
--- a/docker/install/ubuntu_install_emscripten.sh
+++ b/docker/install/ubuntu_install_emscripten.sh
@@ -23,5 +23,5 @@ set -o pipefail
cd /
git clone https://github.com/emscripten-core/emsdk.git
cd emsdk
-./emsdk install latest
-./emsdk activate latest
+./emsdk install 2.0.7
+./emsdk activate 2.0.7
diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh
index 2ad601983fa2..a92a0244d707 100755
--- a/docker/install/ubuntu_install_onnx.sh
+++ b/docker/install/ubuntu_install_onnx.sh
@@ -28,4 +28,4 @@ pip3 install onnxruntime==1.0.0
# not expose that in the wheel!!!
pip3 install future
-pip3 install torch==1.4.0 torchvision==0.5.0
+pip3 install torch==1.7.0 torchvision==0.8.1
diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
index 2b8df74dab7b..7989a49a4826 100755
--- a/docker/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -21,4 +21,4 @@ set -u
set -o pipefail
# install libraries for python package on ubuntu
-pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest mypy orderedset attrs requests Pillow packaging
+pip3 install six numpy pytest cython decorator scipy tornado pytest-xdist pytest-profiling mypy orderedset attrs requests Pillow packaging cloudpickle synr
diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh
index 310e6507e3f3..5716b11db6c4 100755
--- a/docker/install/ubuntu_install_rust.sh
+++ b/docker/install/ubuntu_install_rust.sh
@@ -26,10 +26,11 @@ export RUSTUP_HOME=/opt/rust
export CARGO_HOME=/opt/rust
# this rustc is one supported by the installed version of rust-sgx-sdk
curl -s -S -L https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain stable
-. $CARGO_HOME/env
+export PATH=$CARGO_HOME/bin:$PATH
rustup component add rustfmt
# install wasmtime
+apt-get install -y --no-install-recommends libc6-dev-i386
export WASMTIME_HOME=/opt/wasmtime
curl https://wasmtime.dev/install.sh -sSf | bash
export PATH="${WASMTIME_HOME}/bin:${PATH}"
diff --git a/docker/install/ubuntu_install_chisel.sh b/docker/install/ubuntu_install_sbt.sh
similarity index 80%
rename from docker/install/ubuntu_install_chisel.sh
rename to docker/install/ubuntu_install_sbt.sh
index d6776634ffe0..b02186e3263a 100755
--- a/docker/install/ubuntu_install_chisel.sh
+++ b/docker/install/ubuntu_install_sbt.sh
@@ -22,20 +22,12 @@ set -o pipefail
# The https:// source added below required an apt https transport
# support.
-apt-get update && apt-get install -y apt-transport-https flex bison
+apt-get update && apt-get install -y apt-transport-https
-# Install the necessary dependencies for Chisel
+# Install the necessary dependencies for sbt
echo "deb https://dl.bintray.com/sbt/debian /" | tee -a /etc/apt/sources.list.d/sbt.list
apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
# Note: The settings in vta/hardware/chisel/project/build.properties
# file determines required sbt version.
apt-get update && apt-get install -y sbt=1.1.1
-
-# Install the Verilator with major version 4.0
-wget https://www.veripool.org/ftp/verilator-4.010.tgz
-tar xf verilator-4.010.tgz
-cd verilator-4.010/
-./configure
-make -j4
-make install
diff --git a/docker/install/ubuntu_install_sphinx.sh b/docker/install/ubuntu_install_sphinx.sh
index 2555164e2292..33757a0d4d57 100755
--- a/docker/install/ubuntu_install_sphinx.sh
+++ b/docker/install/ubuntu_install_sphinx.sh
@@ -20,4 +20,4 @@ set -e
set -u
set -o pipefail
-pip3 install sphinx sphinx-gallery==0.4.0 autodocsumm sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image commonmark>=0.7.3 docutils>=0.11
+pip3 install sphinx sphinx-gallery==0.4.0 autodocsumm sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image "commonmark>=0.7.3" "docutils>=0.11"
diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh
index 25543909d78b..286a086abd82 100755
--- a/docker/install/ubuntu_install_tensorflow.sh
+++ b/docker/install/ubuntu_install_tensorflow.sh
@@ -20,4 +20,7 @@ set -e
set -u
set -o pipefail
-pip3 install tensorflow==2.1.0 keras==2.3.1 h5py
+# h5py is pinned to a version below 3 due to issues with
+# tensorflow:
+# https://github.com/tensorflow/tensorflow/issues/44467
+pip3 install tensorflow==2.3.1 keras==2.4.3 "h5py<3.0"
diff --git a/docker/install/ubuntu_install_tflite.sh b/docker/install/ubuntu_install_tflite.sh
index 123ff520d725..2dfbb0681a80 100755
--- a/docker/install/ubuntu_install_tflite.sh
+++ b/docker/install/ubuntu_install_tflite.sh
@@ -33,14 +33,14 @@ pip3 install flatbuffers
# Build the TFLite static library, necessary for building with TFLite ON.
# The library is built at:
# tensorflow/tensorflow/lite/tools/make/gen/*/lib/libtensorflow-lite.a.
-git clone https://github.com/tensorflow/tensorflow --branch=r2.1
+git clone https://github.com/tensorflow/tensorflow --branch=r2.3
./tensorflow/tensorflow/lite/tools/make/download_dependencies.sh
./tensorflow/tensorflow/lite/tools/make/build_lib.sh
# Setup tflite from schema
mkdir tflite
cd tflite
-wget -q https://raw.githubusercontent.com/tensorflow/tensorflow/r2.1/tensorflow/lite/schema/schema.fbs
+wget -q https://raw.githubusercontent.com/tensorflow/tensorflow/r2.3/tensorflow/lite/schema/schema.fbs
flatc --python schema.fbs
cat <setup.py
@@ -48,7 +48,7 @@ import setuptools
setuptools.setup(
name="tflite",
- version="2.1.0",
+ version="2.3.1",
author="google",
author_email="google@google.com",
description="TFLite",
diff --git a/docker/install/ubuntu_install_universal.sh b/docker/install/ubuntu_install_universal.sh
new file mode 100644
index 000000000000..a054aafdd5f7
--- /dev/null
+++ b/docker/install/ubuntu_install_universal.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+git clone https://github.com/stillwater-sc/universal.git /opt/universal
+
+# Use specific versioning tag.
+(cd /opt/universal && git checkout e32899d551b53d758865fabd5fdd69eed35bfb0f)
\ No newline at end of file
diff --git a/docker/install/ubuntu_install_verilator.sh b/docker/install/ubuntu_install_verilator.sh
new file mode 100644
index 000000000000..1c5193c053c1
--- /dev/null
+++ b/docker/install/ubuntu_install_verilator.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+# Verilator version
+version="4.104"
+
+# Install dependencies
+apt-get update && apt-get install -y autoconf g++ flex bison
+
+# Install Verilator
+wget "https://github.com/verilator/verilator/archive/v$version.tar.gz"
+tar xf "v$version.tar.gz"
+cd "verilator-$version"
+autoconf
+./configure
+make -j4
+make install
diff --git a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
index d4077bc67b44..c34ed3addce2 100644
--- a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
+++ b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
@@ -25,5 +25,5 @@ mkdir "$PYXIR_HOME"
pip3 install progressbar
-git clone --recursive --branch v0.1.2 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}"
+git clone --recursive --branch v0.1.3 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}"
cd "${PYXIR_HOME}" && python3 setup.py install
diff --git a/docker/with_the_same_user b/docker/with_the_same_user
index 2bcbb6f49201..459978409be5 100644
--- a/docker/with_the_same_user
+++ b/docker/with_the_same_user
@@ -56,5 +56,6 @@ PATH=${PATH} \
JAVA_HOME=${JAVA_HOME} \
LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \
PYTHONPATH=${PYTHONPATH} \
+CI_IMAGE_NAME=${CI_IMAGE_NAME} \
HOME=${CI_BUILD_HOME} \
"${COMMAND[@]}"
diff --git a/docs/README.txt b/docs/README.txt
index 09c8e9b7e557..e409107b78a6 100644
--- a/docs/README.txt
+++ b/docs/README.txt
@@ -3,7 +3,7 @@ TVM Documentations
This folder contains the source of TVM documents
- A hosted version of doc is at https://tvm.apache.org/docs
-- pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark "Pillow<7" autodocsumm
+- pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark "Pillow<7" autodocsumm tlcpack-sphinx-addon
- Build tvm first in the root folder.
- Run the following command
```bash
@@ -51,3 +51,8 @@ You will need a gpu CI environment.
```bash
./tests/scripts/task_python_docs.sh
```
+
+Define the Order of Tutorials
+-----------------------------
+You can define the order of tutorials with `conf.py::subsection_order` and `conf.py::within_subsection_order`.
+By default, the tutorials within one subsection are sorted by filename.
\ No newline at end of file
diff --git a/docs/api/python/contrib.rst b/docs/api/python/contrib.rst
index 8ac4e1ff7d3a..0eb3024c2d08 100644
--- a/docs/api/python/contrib.rst
+++ b/docs/api/python/contrib.rst
@@ -122,9 +122,9 @@ tvm.contrib.tar
:members:
-tvm.contrib.util
-~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.util
+tvm.contrib.utils
+~~~~~~~~~~~~~~~~~
+.. automodule:: tvm.contrib.utils
:members:
diff --git a/docs/conf.py b/docs/conf.py
index 259d9c3fa0e2..a7198bf22355 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -48,7 +48,7 @@
project = "tvm"
author = "Apache Software Foundation"
copyright = "2020, %s" % author
-github_doc_root = "https://github.com/apache/incubator-tvm/tree/main/docs/"
+github_doc_root = "https://github.com/apache/tvm/tree/main/docs/"
os.environ["TVM_BUILD_DOC"] = "1"
# Version information.
@@ -204,6 +204,75 @@
]
)
+# Explicitly define the order within a subsection.
+# The listed files are sorted according to the list.
+# The unlisted files are sorted by filenames.
+# The unlisted files always appear after listed files.
+within_subsection_order = {
+ "get_started": [
+ "relay_quick_start.py",
+ "tensor_expr_get_started.py",
+ "tvmc_command_line_driver.py",
+ "cross_compilation_and_rpc.py",
+ ],
+ "frontend": [
+ "from_pytorch.py",
+ "from_tensorflow.py",
+ "from_mxnet.py",
+ "from_onnx.py",
+ "from_keras.py",
+ "from_tflite.py",
+ "from_coreml.py",
+ "from_darknet.py",
+ "from_caffe2.py",
+ ],
+ "language": [
+ "schedule_primitives.py",
+ "reduciton.py",
+ "intrin_math.py",
+ "scan.py",
+ "extern_op.py",
+ "tensorize.py",
+ "tuple_inputs.py",
+ "tedd.py",
+ ],
+ "optimize": [
+ "opt_gemm.py",
+ "opt_conv_cuda.py",
+ "opt_conv_tensorcore.py",
+ "opt_matmul_auto_tensorcore.py",
+ ],
+ "autotvm": [
+ "tune_simple_template.py",
+ "tune_conv2d_cuda.py",
+ "tune_relay_cuda.py",
+ "tune_relay_x86.py",
+ "tune_relay_arm.py",
+ "tune_relay_mobile_gpu.py",
+ ],
+ "auto_scheduler": ["tune_matmul_x86.py", "tune_conv2d_layer_cuda.py"],
+ "dev": ["low_level_custom_pass.py", "use_pass_infra.py", "bring_your_own_datatypes.py"],
+}
+
+
+class WithinSubsectionOrder:
+ def __init__(self, src_dir):
+ self.src_dir = src_dir.split("/")[-1]
+
+ def __call__(self, filename):
+ # If the order is provided, use the provided order
+ if (
+ self.src_dir in within_subsection_order
+ and filename in within_subsection_order[self.src_dir]
+ ):
+ index = within_subsection_order[self.src_dir].index(filename)
+ assert index < 1e10
+ return "\0%010d" % index
+
+ # Otherwise, sort by filename
+ return filename
+
+
sphinx_gallery_conf = {
"backreferences_dir": "gen_modules/backreferences",
"doc_module": ("tvm", "numpy"),
@@ -213,6 +282,7 @@
"numpy": "https://numpy.org/doc/stable",
},
"examples_dirs": examples_dirs,
+ "within_subsection_order": WithinSubsectionOrder,
"gallery_dirs": gallery_dirs,
"subsection_order": subsection_order,
"filename_pattern": os.environ.get("TVM_TUTORIAL_EXEC_PATTERN", ".py"),
@@ -234,6 +304,57 @@
"tvm.relay": ["tvm.ir", "tvm.tir"],
}
+## Setup header and other configs
+import tlcpack_sphinx_addon
+
+footer_copyright = "© 2020 Apache Software Foundation | All rights reserved"
+footer_note = " ".join(
+ """
+Copyright © 2020 The Apache Software Foundation. Apache TVM, Apache, the Apache feather,
+and the Apache TVM project logo are either trademarks or registered trademarks of
+the Apache Software Foundation.""".split(
+ "\n"
+ )
+).strip()
+
+header_logo = "https://tvm.apache.org/assets/images/logo.svg"
+header_logo_link = "https://tvm.apache.org/"
+
+header_links = [
+ ("Community", "https://tvm.apache.org/community"),
+ ("Download", "https://tvm.apache.org/download"),
+ ("VTA", "https://tvm.apache.org/vta"),
+ ("Blog", "https://tvm.apache.org/blog"),
+ ("Docs", "https://tvm.apache.org/docs"),
+ ("Conference", "https://tvmconf.org"),
+ ("Github", "https://github.com/apache/tvm/"),
+]
+
+header_dropdown = {
+ "name": "ASF",
+ "items": [
+ ("Apache Homepage", "https://apache.org/"),
+ ("License", "https://www.apache.org/licenses/"),
+ ("Sponsorship", "https://www.apache.org/foundation/sponsorship.html"),
+ ("Security", "https://www.apache.org/security/"),
+ ("Thanks", "https://www.apache.org/foundation/thanks.html"),
+ ("Events", "https://www.apache.org/events/current-event"),
+ ],
+}
+
+html_context = {
+ "footer_copyright": footer_copyright,
+ "footer_note": footer_note,
+ "header_links": header_links,
+ "header_dropdown": header_dropdown,
+ "header_logo": header_logo,
+ "header_logo_link": header_logo_link,
+}
+
+# add additional overrides
+templates_path += [tlcpack_sphinx_addon.get_templates_path()]
+html_static_path += [tlcpack_sphinx_addon.get_static_path()]
+
def update_alias_docstring(name, obj, lines):
"""Update the docstring of alias functions.
@@ -282,4 +403,3 @@ def process_docstring(app, what, name, obj, options, lines):
def setup(app):
app.connect("autodoc-process-docstring", process_docstring)
- app.add_css_file("css/tvm_theme.css")
diff --git a/docs/contribute/community.rst b/docs/contribute/community.rst
index fd6df0f991bd..8867202a674c 100644
--- a/docs/contribute/community.rst
+++ b/docs/contribute/community.rst
@@ -20,7 +20,7 @@
TVM Community Guideline
=======================
-TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. See `CONTRIBUTORS.md `_ for the current list of contributors.
+TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. See `CONTRIBUTORS.md `_ for the current list of contributors.
diff --git a/docs/contribute/document.rst b/docs/contribute/document.rst
index 1bfab1e1c061..3652a2891b37 100644
--- a/docs/contribute/document.rst
+++ b/docs/contribute/document.rst
@@ -68,7 +68,7 @@ Be careful to leave blank lines between sections of your documents.
In the above case, there has to be a blank line before `Parameters`, `Returns` and `Examples`
in order for the doc to be built correctly. To add a new function to the doc,
we need to add the `sphinx.autodoc `_
-rules to the `docs/api/python `_).
+rules to the `docs/api/python `_).
You can refer to the existing files under this folder on how to add the functions.
@@ -96,7 +96,7 @@ to add comments about code logics to improve readability.
Write Tutorials
---------------
We use the `sphinx-gallery `_ to build python tutorials.
-You can find the source code under `tutorials `_ quite self explanatory.
+You can find the source code under `tutorials `_; it is quite self-explanatory.
One thing that worth noting is that the comment blocks are written in reStructuredText instead of markdown so be aware of the syntax.
The tutorial code will run on our build server to generate the document page.
diff --git a/docs/contribute/error_handling.rst b/docs/contribute/error_handling.rst
index 8f71ee61aeb6..d31b401ea654 100644
--- a/docs/contribute/error_handling.rst
+++ b/docs/contribute/error_handling.rst
@@ -37,14 +37,14 @@ raise an error of the corresponding type.
Note that you do not have to add a new type
:py:class:`tvm.error.TVMError` will be raised by default when
there is no error type prefix in the message.
-This mechanism works for both ``LOG(FATAL)`` and ``CHECK`` macros.
+This mechanism works for both ``LOG(FATAL)`` and ``ICHECK`` macros.
The following code gives an example on how to do so.
.. code:: c
// src/api_test.cc
void ErrorTest(int x, int y) {
- CHECK_EQ(x, y) << "ValueError: expect x and y to be equal."
+    ICHECK_EQ(x, y) << "ValueError: expect x and y to be equal.";
if (x == 1) {
LOG(FATAL) << "InternalError: cannot reach here";
}
diff --git a/docs/contribute/release_process.rst b/docs/contribute/release_process.rst
index 0f1e5151f5a9..f330a7ddd3e6 100644
--- a/docs/contribute/release_process.rst
+++ b/docs/contribute/release_process.rst
@@ -17,8 +17,8 @@
.. _release_process:
-Apache TVM (incubating) Release Process
-=======================================
+Apache TVM Release Process
+==========================
The release manager role in TVM means you are responsible for a few different things:
@@ -64,13 +64,13 @@ The last step is to update the KEYS file with your code signing key https://www.
.. code-block:: bash
# the --depth=files will avoid checkout existing folders
- svn co --depth=files "https://dist.apache.org/repos/dist/dev/incubator/tvm" svn-tvm
+ svn co --depth=files "https://dist.apache.org/repos/dist/dev/tvm" svn-tvm
cd svn-tvm
# edit KEYS file
svn ci --username $ASF_USERNAME --password "$ASF_PASSWORD" -m "Update KEYS"
# update downloads.apache.org
- svn rm --username $ASF_USERNAME --password "$ASF_PASSWORD" https://dist.apache.org/repos/dist/release/incubator/tvm/KEYS -m "Update KEYS"
- svn cp --username $ASF_USERNAME --password "$ASF_PASSWORD" https://dist.apache.org/repos/dist/dev/incubator/tvm/KEYS https://dist.apache.org/repos/dist/release/incubator/tvm/ -m "Update KEYS"
+ svn rm --username $ASF_USERNAME --password "$ASF_PASSWORD" https://dist.apache.org/repos/dist/release/tvm/KEYS -m "Update KEYS"
+ svn cp --username $ASF_USERNAME --password "$ASF_PASSWORD" https://dist.apache.org/repos/dist/dev/tvm/KEYS https://dist.apache.org/repos/dist/release/tvm/ -m "Update KEYS"
Cut a Release Candidate
@@ -80,8 +80,8 @@ To cut a release candidate, one needs to first cut a branch using selected versi
.. code-block:: bash
- git clone https://github.com/apache/incubator-tvm.git
- cd incubator-tvm/
+ git clone https://github.com/apache/tvm.git
+ cd tvm/
git branch v0.6.0
git push --set-upstream origin v0.6.0
@@ -107,8 +107,8 @@ Create source code artifacts,
.. code-block:: bash
- git clone git@github.com:apache/incubator-tvm.git apache-tvm-src-v0.6.0.rc0-incubating
- cd apache-tvm-src-v0.6.0.rc0-incubating
+ git clone git@github.com:apache/tvm.git apache-tvm-src-v0.6.0.rc0
+ cd apache-tvm-src-v0.6.0.rc0
git checkout v0.6
git submodule update --init --recursive
git checkout v0.6.0.rc0
@@ -116,7 +116,7 @@ Create source code artifacts,
find . -name ".git*" -print0 | xargs -0 rm -rf
cd ..
brew install gnu-tar
- gtar -czvf apache-tvm-src-v0.6.0.rc0-incubating.tar.gz apache-tvm-src-v0.6.0.rc0-incubating
+ gtar -czvf apache-tvm-src-v0.6.0.rc0.tar.gz apache-tvm-src-v0.6.0.rc0
Use your GPG key to sign the created artifact. First make sure your GPG is set to use the correct private key,
@@ -129,8 +129,8 @@ Create GPG signature as well as the hash of the file,
.. code-block:: bash
- gpg --armor --output apache-tvm-src-v0.6.0.rc0-incubating.tar.gz.asc --detach-sig apache-tvm-src-v0.6.0.rc0-incubating.tar.gz
- shasum -a 512 apache-tvm-src-v0.6.0.rc0-incubating.tar.gz > apache-tvm-src-v0.6.0.rc0-incubating.tar.gz.sha512
+ gpg --armor --output apache-tvm-src-v0.6.0.rc0.tar.gz.asc --detach-sig apache-tvm-src-v0.6.0.rc0.tar.gz
+ shasum -a 512 apache-tvm-src-v0.6.0.rc0.tar.gz > apache-tvm-src-v0.6.0.rc0.tar.gz.sha512
Upload the Release Candidate
@@ -143,7 +143,7 @@ The release manager also needs to upload the artifacts to ASF SVN,
.. code-block:: bash
# the --depth=files will avoid checkout existing folders
- svn co --depth=files "https://dist.apache.org/repos/dist/dev/incubator/tvm" svn-tvm
+ svn co --depth=files "https://dist.apache.org/repos/dist/dev/tvm" svn-tvm
cd svn-tvm
mkdir tvm-v0.6.0-rc0
# copy files into it
@@ -154,9 +154,7 @@ The release manager also needs to upload the artifacts to ASF SVN,
Call a Vote on the Release Candidate
------------------------------------
-As an incubator project, it requires voting on both dev@ and general@.
-
-The first voting takes place on the Apache TVM (incubator) developers list (dev@tvm.apache.org). To get more attention, one can create a github issue start with "[VOTE]" instead, it will be mirrored to dev@ automatically. Look at past voting threads to see how this proceeds. The email should follow this format.
+The first voting takes place on the Apache TVM developers list (dev@tvm.apache.org). To get more attention, one can create a github issue starting with "[VOTE]" instead; it will be mirrored to dev@ automatically. Look at past voting threads to see how this proceeds. The email should follow this format.
- Provide the link to the draft of the release notes in the email
- Provide the link to the release candidate artifacts
@@ -164,14 +162,9 @@ The first voting takes place on the Apache TVM (incubator) developers list (dev@
For the dev@ vote, there must be at least 3 binding +1 votes and more +1 votes than -1 votes. Once the vote is done, you should also send out a summary email with the totals, with a subject that looks something like [VOTE][RESULT] ....
-The voting then moves onto the general@incubator.apache.org. Anyone can contribute a vote, but only "Incubator PMC" (IPMC) votes are binding.
-To pass, there must be 3 binding +1 votes and more +1 votes than -1 votes.
-
In ASF, votes are open "at least" 72hrs (3 days). If you don't get enough number of binding votes within that time, you cannot close the voting deadline. You need to extend it.
-Same as the one on dev@, send out a summary email to general@ once the vote passes.
-
-If either voting fails, the community needs to modified the release accordingly, create a new release candidate and re-run the voting process.
+If the voting fails, the community needs to modify the release accordingly, create a new release candidate, and re-run the voting process.
Post the Release
@@ -182,12 +175,12 @@ After the vote passes, to upload the binaries to Apache mirrors, you move the bi
.. code-block:: bash
export SVN_EDITOR=vim
- svn mkdir https://dist.apache.org/repos/dist/release/incubator/tvm
- svn mv https://dist.apache.org/repos/dist/dev/incubator/tvm/tvm-v0.6.0-rc2 https://dist.apache.org/repos/dist/release/incubator/tvm/tvm-v0.6.0
+ svn mkdir https://dist.apache.org/repos/dist/release/tvm
+ svn mv https://dist.apache.org/repos/dist/dev/tvm/tvm-v0.6.0-rc2 https://dist.apache.org/repos/dist/release/tvm/tvm-v0.6.0
# If you've added your signing key to the KEYS file, also update the release copy.
- svn co --depth=files "https://dist.apache.org/repos/dist/release/incubator/tvm" svn-tvm
- curl "https://dist.apache.org/repos/dist/dev/incubator/tvm/KEYS" > svn-tvm/KEYS
+ svn co --depth=files "https://dist.apache.org/repos/dist/release/tvm" svn-tvm
+ curl "https://dist.apache.org/repos/dist/dev/tvm/KEYS" > svn-tvm/KEYS
(cd svn-tvm && svn ci --username $ASF_USERNAME --password "$ASF_PASSWORD" -m"Update KEYS")
Remember to create a new release TAG (v0.6.0 in this case) on Github and remove the pre-release candidate TAG.
@@ -200,10 +193,10 @@ Remember to create a new release TAG (v0.6.0 in this case) on Github and remove
Update the TVM Website
----------------------
-The website repository is located at `https://github.com/apache/incubator-tvm-site `_. Modify the download page to include the release artifacts as well as the GPG signature and SHA hash.
+The website repository is located at `https://github.com/apache/tvm-site `_. Modify the download page to include the release artifacts as well as the GPG signature and SHA hash.
Post the Announcement
---------------------
-Send out an announcement email to general@incubator.apache.org, announce@apache.org, and dev@tvm.apache.org. The announcement should include the link to release note and download page.
+Send out an announcement email to announce@apache.org, and dev@tvm.apache.org. The announcement should include the link to release note and download page.
diff --git a/docs/deploy/android.rst b/docs/deploy/android.rst
index e28eef383164..8c8fcfb49679 100644
--- a/docs/deploy/android.rst
+++ b/docs/deploy/android.rst
@@ -38,5 +38,5 @@ deploy_lib.so, deploy_graph.json, deploy_param.params will go to android target.
TVM Runtime for Android Target
------------------------------
-Refer `here `_ to build CPU/OpenCL version flavor TVM runtime for android target.
-From android java TVM API to load model & execute can be referred at this `java `_ sample source.
+Refer `here `_ to build CPU/OpenCL version flavor TVM runtime for android target.
+From android java TVM API to load model & execute can be referred at this `java `_ sample source.
diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst
index 5dd00764bcbc..a2eaa5fb5662 100644
--- a/docs/deploy/arm_compute_lib.rst
+++ b/docs/deploy/arm_compute_lib.rst
@@ -36,7 +36,7 @@ determine the architecture by looking online.
We recommend two different ways to build and install ACL:
-* Use the script located at `docker/install/ubuntu_install_arm_compute_library.sh`. You can use this
+* Use the script located at `docker/install/ubuntu_install_arm_compute_lib.sh`. You can use this
script for building ACL from source natively or for cross-compiling the library on an x86 machine.
You may need to change the architecture of the device you wish to compile for by altering the
`target_arch` variable. Binaries will be built from source and installed to the location denoted by
diff --git a/docs/deploy/cpp_deploy.rst b/docs/deploy/cpp_deploy.rst
index f3de69db2d1c..44df1e55b58e 100644
--- a/docs/deploy/cpp_deploy.rst
+++ b/docs/deploy/cpp_deploy.rst
@@ -19,7 +19,7 @@
Deploy TVM Module using C++ API
===============================
-We provide an example on how to deploy TVM modules in `apps/howto_deploy `_
+We provide an example on how to deploy TVM modules in `apps/howto_deploy `_
To run the example, you can use the following command
@@ -38,17 +38,17 @@ TVM provides a minimum runtime, which costs around 300K to 600K depending on how
In most cases, we can use ``libtvm_runtime.so`` that comes with the build.
If somehow you find it is hard to build ``libtvm_runtime``, checkout
-`tvm_runtime_pack.cc `_.
+`tvm_runtime_pack.cc `_.
It is an example all in one file that gives you TVM runtime.
You can compile this file using your build system and include this into your project.
-You can also checkout `apps `_ for example applications build with TVM on iOS, Android and others.
+You can also checkout `apps `_ for example applications build with TVM on iOS, Android and others.
Dynamic Library vs. System Module
---------------------------------
TVM provides two ways to use the compiled library.
-You can checkout `prepare_test_libs.py `_
-on how to generate the library and `cpp_deploy.cc `_ on how to use them.
+You can checkout `prepare_test_libs.py `_
+on how to generate the library and `cpp_deploy.cc `_ on how to use them.
- Store library as a shared library and dynamically load the library into your project.
- Bundle the compiled library into your project in system module mode.
diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst
index b38a7f561ab3..2b37f734c3c3 100644
--- a/docs/deploy/index.rst
+++ b/docs/deploy/index.rst
@@ -38,7 +38,7 @@ on a Linux based embedded system such as Raspberry Pi:
.. code:: bash
- git clone --recursive https://github.com/apache/incubator-tvm tvm
+ git clone --recursive https://github.com/apache/tvm tvm
cd tvm
mkdir build
cp cmake/config.cmake build
@@ -69,3 +69,5 @@ target device without relying on RPC. see the following resources on how to do s
integrate
hls
arm_compute_lib
+ tensorrt
+ vitis_ai
diff --git a/docs/deploy/tensorrt.rst b/docs/deploy/tensorrt.rst
new file mode 100644
index 000000000000..27f11e9b5377
--- /dev/null
+++ b/docs/deploy/tensorrt.rst
@@ -0,0 +1,297 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+Relay TensorRT Integration
+==========================
+**Author**: `Trevor Morris `_
+
+Introduction
+------------
+
+NVIDIA TensorRT is a library for optimized deep learning inference. This integration will offload as
+many operators as possible from Relay to TensorRT, providing a performance boost on NVIDIA GPUs
+without the need to tune schedules.
+
+This guide will demonstrate how to install TensorRT and build TVM with TensorRT BYOC and runtime
+enabled. It will also provide example code to compile and run a ResNet-18 model using TensorRT, and
+show how to configure the compilation and runtime settings. Finally, we document the supported operators
+and how to extend the integration to support other operators.
+
+Installing TensorRT
+-------------------
+
+In order to download TensorRT, you will need to create an NVIDIA Developer program account. Please
+see NVIDIA's documentation for more info:
+https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html. If you have a Jetson device
+such as a TX1, TX2, Xavier, or Nano, TensorRT will already be installed on the device via the
+JetPack SDK.
+
+There are two methods to install TensorRT:
+
+* System install via deb or rpm package.
+* Tar file installation.
+
+With the tar file installation method, you must provide the path of the extracted tar archive to
+USE_TENSORRT_RUNTIME=/path/to/TensorRT. With the system install method,
+USE_TENSORRT_RUNTIME=ON will automatically locate your installation.
+
+Building TVM with TensorRT support
+----------------------------------
+
+There are two separate build flags for TensorRT integration in TVM. These flags also enable
+cross-compilation: USE_TENSORRT_CODEGEN=ON will allow you to build a module with TensorRT support on
+a host machine, while USE_TENSORRT_RUNTIME=ON will enable the TVM runtime on an edge device to
+execute the TensorRT module. You should enable both if you want to compile and also execute models
+with the same TVM build.
+
+* USE_TENSORRT_CODEGEN=ON/OFF - This flag will enable compiling a TensorRT module, which does not require any
+ TensorRT library.
+* USE_TENSORRT_RUNTIME=ON/OFF/path-to-TensorRT - This flag will enable the TensorRT runtime module.
+ This will build TVM against the installed TensorRT library.
+
+Example setting in config.cmake file:
+
+.. code:: cmake
+
+ set(USE_TENSORRT_CODEGEN ON)
+ set(USE_TENSORRT_RUNTIME /home/ubuntu/TensorRT-7.0.0.11)
+
+
+Build and Deploy ResNet-18 with TensorRT
+----------------------------------------
+
+Create a Relay graph from a MXNet ResNet-18 model.
+
+.. code:: python
+
+ import tvm
+ from tvm import relay
+ import mxnet
+ from mxnet.gluon.model_zoo.vision import get_model
+
+ dtype = "float32"
+ input_shape = (1, 3, 224, 224)
+ block = get_model('resnet18_v1', pretrained=True)
+ mod, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
+
+
+Annotate and partition the graph for TensorRT. All ops which are supported by the TensorRT
+integration will be marked and offloaded to TensorRT. The rest of the ops will go through the
+regular TVM CUDA compilation and code generation.
+
+.. code:: python
+
+ from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt
+ mod, config = partition_for_tensorrt(mod, params)
+
+
+Build the Relay graph, using the new module and config returned by partition_for_tensorrt. The
+target must always be a cuda target. ``partition_for_tensorrt`` will automatically fill out the
+required values in the config, so there is no need to modify it - just pass it along to the
+PassContext so the values can be read during compilation.
+
+.. code:: python
+
+ target = "cuda"
+ with tvm.transform.PassContext(opt_level=3, config={'relay.ext.tensorrt.options': config}):
+ lib = relay.build(mod, target=target, params=params)
+
+
+Export the module.
+
+.. code:: python
+
+ lib.export_library('compiled.so')
+
+
+Load the module and run inference on the target machine, which must use a TVM build with
+``USE_TENSORRT_RUNTIME`` enabled. The first run will take longer because the TensorRT engine will
+have to be built.
+
+.. code:: python
+
+    import numpy as np
+
+    ctx = tvm.gpu(0)
+ loaded_lib = tvm.runtime.load_module('compiled.so')
+ gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx))
+ input_data = np.random.uniform(0, 1, input_shape).astype(dtype)
+ gen_module.run(data=input_data)
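+
+The output can then be fetched from the graph runtime module. A minimal sketch
+(``get_output`` is the standard graph runtime accessor; index ``0`` assumes a
+single-output model such as this one):
+
+.. code:: python
+
+    # Fetch the first output as a NumPy array; for an ImageNet classifier
+    # such as ResNet-18 this holds the (1, 1000) class scores.
+    tvm_output = gen_module.get_output(0).asnumpy()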
+
+
+Partitioning and Compilation Settings
+-------------------------------------
+
+There are some options which can be configured in ``partition_for_tensorrt``; a usage sketch follows the list.
+
+* ``version`` - TensorRT version to target as tuple of (major, minor, patch). If TVM is compiled
+ with USE_TENSORRT_RUNTIME=ON, the linked TensorRT version will be used instead. The version
+ will affect which ops can be partitioned to TensorRT.
+* ``use_implicit_batch`` - Use TensorRT implicit batch mode (default true). Setting to false will
+  enable explicit batch mode, which widens the set of supported operators to include those which modify the
+ batch dimension, but may reduce performance for some models.
+* ``remove_no_mac_subgraphs`` - A heuristic to improve performance. Removes subgraphs which have
+ been partitioned for TensorRT if they do not have any multiply-accumulate operations. The removed
+ subgraphs will go through TVM's standard compilation instead.
+* ``max_workspace_size`` - How many bytes of workspace size to allow each subgraph to use for
+  TensorRT engine creation. See TensorRT documentation for more info. Can be overridden at runtime.
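+
+As a usage sketch, all four options can be passed as keyword arguments (the
+names mirror the list above; check the signature in
+``tvm.relay.op.contrib.tensorrt`` before relying on them):
+
+.. code:: python
+
+    mod, config = partition_for_tensorrt(
+        mod,
+        params,
+        version=(7, 0, 0),             # target TensorRT version when not linked
+        use_implicit_batch=False,      # use explicit batch mode instead
+        remove_no_mac_subgraphs=True,  # drop MAC-free TensorRT subgraphs
+        max_workspace_size=1 << 30,    # 1 GiB of scratch space per subgraph
+    )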
+
+
+Runtime Settings
+----------------
+
+There are some additional options which can be configured at runtime using environment variables.
+
+* Automatic FP16 Conversion - Environment variable ``TVM_TENSORRT_USE_FP16=1`` can be set to
+ automatically convert the TensorRT components of your model to 16-bit floating point precision.
+ This can greatly increase performance, but may cause some slight loss in the model accuracy.
+* Caching TensorRT Engines - During the first inference, the runtime will invoke the TensorRT API
+ to build an engine. This can be time consuming, so you can set ``TVM_TENSORRT_CACHE_DIR`` to
+  point to a directory where these built engines will be saved on disk. The next time you load the model
+ and give it the same directory, the runtime will load the already built engines to avoid the long
+ warmup time. A unique directory is required for each model.
+* Maximum Workspace Size - TensorRT has a parameter to configure the maximum amount of scratch space
+  that each layer in the model can use. It is generally best to use the highest value which does not
+  cause you to run out of memory. You can use ``TVM_TENSORRT_MAX_WORKSPACE_SIZE`` to override this by
+  specifying the workspace size in bytes you would like to use.
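+
+For example, the variables can be set from Python before the module is loaded
+(a sketch; the cache directory path is hypothetical):
+
+.. code:: python
+
+    import os
+
+    # These are read by the TensorRT runtime when an engine is built or loaded.
+    os.environ["TVM_TENSORRT_USE_FP16"] = "1"
+    os.environ["TVM_TENSORRT_CACHE_DIR"] = "/tmp/tvm_trt_cache"  # hypothetical path
+    os.environ["TVM_TENSORRT_MAX_WORKSPACE_SIZE"] = str(1 << 30)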
+
+
+Operator support
+----------------
++------------------------+------------------------------------+
+| Relay Node | Remarks |
++========================+====================================+
+| nn.relu | |
++------------------------+------------------------------------+
+| sigmoid | |
++------------------------+------------------------------------+
+| tanh | |
++------------------------+------------------------------------+
+| nn.batch_norm | |
++------------------------+------------------------------------+
+| nn.softmax | |
++------------------------+------------------------------------+
+| nn.conv2d | |
++------------------------+------------------------------------+
+| nn.dense | |
++------------------------+------------------------------------+
+| nn.bias_add | |
++------------------------+------------------------------------+
+| add | |
++------------------------+------------------------------------+
+| subtract | |
++------------------------+------------------------------------+
+| multiply | |
++------------------------+------------------------------------+
+| divide | |
++------------------------+------------------------------------+
+| power | |
++------------------------+------------------------------------+
+| maximum | |
++------------------------+------------------------------------+
+| minimum | |
++------------------------+------------------------------------+
+| nn.max_pool2d | |
++------------------------+------------------------------------+
+| nn.avg_pool2d | |
++------------------------+------------------------------------+
+| nn.global_max_pool2d | |
++------------------------+------------------------------------+
+| nn.global_avg_pool2d | |
++------------------------+------------------------------------+
+| exp | |
++------------------------+------------------------------------+
+| log | |
++------------------------+------------------------------------+
+| sqrt | |
++------------------------+------------------------------------+
+| abs | |
++------------------------+------------------------------------+
+| negative | |
++------------------------+------------------------------------+
+| nn.batch_flatten | |
++------------------------+------------------------------------+
+| expand_dims | |
++------------------------+------------------------------------+
+| squeeze | |
++------------------------+------------------------------------+
+| concatenate | |
++------------------------+------------------------------------+
+| nn.conv2d_transpose | |
++------------------------+------------------------------------+
+| transpose | |
++------------------------+------------------------------------+
+| layout_transform | |
++------------------------+------------------------------------+
+| reshape | |
++------------------------+------------------------------------+
+| nn.pad | |
++------------------------+------------------------------------+
+| sum | |
++------------------------+------------------------------------+
+| prod | |
++------------------------+------------------------------------+
+| max | |
++------------------------+------------------------------------+
+| min | |
++------------------------+------------------------------------+
+| mean | |
++------------------------+------------------------------------+
+| nn.adaptive_max_pool2d | |
++------------------------+------------------------------------+
+| nn.adaptive_avg_pool2d | |
++------------------------+------------------------------------+
+| clip | Requires TensorRT 5.1.5 or greater |
++------------------------+------------------------------------+
+| nn.leaky_relu | Requires TensorRT 5.1.5 or greater |
++------------------------+------------------------------------+
+| sin | Requires TensorRT 5.1.5 or greater |
++------------------------+------------------------------------+
+| cos | Requires TensorRT 5.1.5 or greater |
++------------------------+------------------------------------+
+| atan | Requires TensorRT 5.1.5 or greater |
++------------------------+------------------------------------+
+| ceil | Requires TensorRT 5.1.5 or greater |
++------------------------+------------------------------------+
+| floor | Requires TensorRT 5.1.5 or greater |
++------------------------+------------------------------------+
+| strided_slice | Requires TensorRT 5.1.5 or greater |
++------------------------+------------------------------------+
+| nn.conv3d | Requires TensorRT 6.0.1 or greater |
++------------------------+------------------------------------+
+| nn.max_pool3d | Requires TensorRT 6.0.1 or greater |
++------------------------+------------------------------------+
+| nn.avg_pool3d | Requires TensorRT 6.0.1 or greater |
++------------------------+------------------------------------+
+| nn.conv3d_transpose | Requires TensorRT 6.0.1 or greater |
++------------------------+------------------------------------+
+
+
+Adding a new operator
+---------------------
+To add support for a new operator, we need to make changes to a series of files:
+
+* `src/runtime/contrib/tensorrt/tensorrt_ops.cc` Create a new op converter class which
+ implements the ``TensorRTOpConverter`` interface. You must implement the constructor to specify how
+ many inputs there are and whether they are tensors or weights. You must also implement the
+ ``Convert`` method to perform the conversion. This is done by using the inputs, attributes, and
+ network from params to add the new TensorRT layers and push the layer outputs. You can use the
+ existing converters as an example. Finally, register your new op converter in the
+ ``GetOpConverters()`` map.
+* `python/relay/op/contrib/tensorrt.py` This file contains the annotation rules for TensorRT,
+ which determine which operators and attributes are supported. You must register an
+ annotation function for the Relay operator that checks whether the attributes are supported
+ by your converter and returns true or false (see the sketch after this list).
+* `tests/python/contrib/test_tensorrt.py` Add unit tests for the given operator.
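+
+For illustration only, a new annotation rule in `python/relay/op/contrib/tensorrt.py`
+could follow the sketch below. The operator name, attribute check, and registration
+helper are assumptions for the example rather than the actual integration code:
+
+.. code:: python
+
+ import tvm.ir
+
+ @tvm.ir.register_op_attr("nn.softmax", "target.tensorrt")
+ def softmax_annotate_fn(expr):
+     """Hypothetical rule: offload softmax only for axis values the converter handles."""
+     # Reject attribute values the TensorRT converter cannot convert.
+     if int(expr.attrs.axis) not in (1, -1):
+         return False
+     return True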
diff --git a/docs/deploy/vitis_ai.rst b/docs/deploy/vitis_ai.rst
new file mode 100755
index 000000000000..df29f16f9d8d
--- /dev/null
+++ b/docs/deploy/vitis_ai.rst
@@ -0,0 +1,652 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+
+Vitis-AI Integration
+====================
+
+`Vitis-AI `__ is Xilinx's
+development stack for hardware-accelerated AI inference on Xilinx
+platforms, including both edge devices and Alveo cards. It consists of
+optimized IP, tools, libraries, models, and example designs. It is
+designed with high efficiency and ease of use in mind, unleashing the
+full potential of AI acceleration on Xilinx FPGA and ACAP.
+
+The current Vitis-AI BYOC flow inside TVM enables acceleration of neural
+network model inference on edge and cloud. The identifiers for the
+supported edge and cloud Deep Learning Processor Units (DPUs) are
+DPUCZDX8G and DPUCADX8G, respectively. DPUCZDX8G and DPUCADX8G are hardware
+accelerators for convolutional neural networks (CNNs) on top of the
+Xilinx `Zynq Ultrascale+
+MPSoc `__
+and
+`Alveo `__
+(U200/U250) platforms, respectively. For more information about the DPU identifiers
+see the section on `DPU naming information <#dpu-naming-information>`__.
+
+On this page you will find information on how to
+`build <#build-instructions>`__ TVM with Vitis-AI and on how to `get
+started <#getting-started>`__ with an example.
+
+DPU naming information
+----------------------
+
++---------------------------------+-----------------+-------------------------------------------------------------------------+------------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------+
+| DPU | Application | HW Platform | Quantization Method | Quantization Bitwidth | Design Target |
++=================================+=================+=========================================================================+============================================================+===================================================+==========================================================================+
+| Deep Learning Processing Unit | C: CNN R: RNN | AD: Alveo DDR AH: Alveo HBM VD: Versal DDR with AIE & PL ZD: Zynq DDR | X: DECENT I: Integer threshold F: Float threshold R: RNN | 4: 4-bit 8: 8-bit 16: 16-bit M: Mixed Precision | G: General purpose H: High throughput L: Low latency C: Cost optimized |
++---------------------------------+-----------------+-------------------------------------------------------------------------+------------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------+
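+
+As a worked reading of this table, the edge identifier DPUCZDX8G decodes as:
+DPU, C (CNN application), ZD (Zynq DDR hardware platform), X (DECENT
+quantization method), 8 (8-bit quantization) and G (general purpose design
+target).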
+
+Build instructions
+------------------
+
+This section lists the instructions for building TVM with Vitis-AI for
+both `cloud <#cloud-dpucadx8g>`__ and `edge <#edge-dpuczdx8g>`__.
+
+Cloud (DPUCADX8G)
+~~~~~~~~~~~~~~~~~
+
+For Vitis-AI acceleration in the cloud, TVM has to be built on top of
+the Xilinx Alveo platform.
+
+System requirements
+^^^^^^^^^^^^^^^^^^^
+
+The following table lists system requirements for running docker
+containers as well as Alveo cards.
+
++-----------------------------------------------------+----------------------------------------------------------+
+| **Component** | **Requirement** |
++=====================================================+==========================================================+
+| Motherboard | PCI Express 3.0-compliant with one dual-width x16 slot |
++-----------------------------------------------------+----------------------------------------------------------+
+| System Power Supply | 225W |
++-----------------------------------------------------+----------------------------------------------------------+
+| Operating System | Ubuntu 16.04, 18.04 |
++-----------------------------------------------------+----------------------------------------------------------+
+| | CentOS 7.4, 7.5 |
++-----------------------------------------------------+----------------------------------------------------------+
+| | RHEL 7.4, 7.5 |
++-----------------------------------------------------+----------------------------------------------------------+
+| CPU | Intel i3/i5/i7/i9/Xeon 64-bit CPU |
++-----------------------------------------------------+----------------------------------------------------------+
+| GPU (Optional to accelerate quantization) | NVIDIA GPU with a compute capability > 3.0 |
++-----------------------------------------------------+----------------------------------------------------------+
+| CUDA Driver (Optional to accelerate quantization) | nvidia-410 |
++-----------------------------------------------------+----------------------------------------------------------+
+| FPGA | Xilinx Alveo U200 or U250 |
++-----------------------------------------------------+----------------------------------------------------------+
+| Docker Version | 19.03.1 |
++-----------------------------------------------------+----------------------------------------------------------+
+
+Hardware setup and docker build
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. Clone the Vitis AI repository:
+
+ .. code:: bash
+
+ git clone --recurse-submodules https://github.com/Xilinx/Vitis-AI
+
+2. Install Docker and add your user to the docker group. Docker
+ installation and post-install instructions can be found on the
+ following pages of Docker's website:
+
+ - https://docs.docker.com/install/linux/docker-ce/ubuntu/
+ - https://docs.docker.com/install/linux/docker-ce/centos/
+ - https://docs.docker.com/install/linux/linux-postinstall/
+
+3. Download the latest Vitis AI Docker with the following command. This container runs on CPU.
+
+ .. code:: bash
+
+ docker pull xilinx/vitis-ai:latest
+
+ To accelerate the quantization, you can optionally use the Vitis-AI GPU docker image. Use the following commands to build the Vitis-AI GPU docker container:
+
+ .. code:: bash
+
+ cd Vitis-AI/docker
+ ./docker_build_gpu.sh
+
+4. Set up Vitis AI to target Alveo cards. To target Alveo cards with
+ Vitis AI for machine learning workloads, you must install the
+ following software components:
+
+ - Xilinx Runtime (XRT)
+ - Alveo Deployment Shells (DSAs)
+ - Xilinx Resource Manager (XRM) (xbutler)
+ - Xilinx Overlaybins (Accelerators to Dynamically Load - binary
+ programming files)
+
+ While it is possible to install all of these software components
+ individually, a script has been provided to automatically install
+ them at once. To do so:
+
+ - Run the following commands:
+
+ .. code:: bash
+
+ cd Vitis-AI/alveo/packages
+ sudo su
+ ./install.sh
+
+ - Power cycle the system.
+
+5. Clone the TVM and PyXIR repositories:
+
+ .. code:: bash
+
+ git clone --recursive https://github.com/apache/tvm.git
+ git clone --recursive https://github.com/Xilinx/pyxir.git
+
+6. Build and start the tvm runtime Vitis-AI Docker Container.
+
+ .. code:: bash
+
+ ./tvm/docker/build.sh demo_vitis_ai bash
+ ./tvm/docker/bash.sh tvm.demo_vitis_ai
+
+ #Setup inside container
+ source /opt/xilinx/xrt/setup.sh
+ . $VAI_ROOT/conda/etc/profile.d/conda.sh
+ conda activate vitis-ai-tensorflow
+
+7. Install PyXIR
+
+ .. code:: bash
+
+ cd pyxir
+ python3 setup.py install --use_vai_rt_dpucadx8g --user
+
+
+8. Build TVM inside the container with Vitis-AI
+
+ .. code:: bash
+
+ cd tvm
+ mkdir build
+ cp cmake/config.cmake build
+ cd build
+ echo set\(USE_LLVM ON\) >> config.cmake
+ echo set\(USE_VITIS_AI ON\) >> config.cmake
+ cmake ..
+ make -j$(nproc)
+
+9. Install TVM
+
+ .. code:: bash
+
+ cd tvm/python
+ pip3 install -e . --user
+
+Edge (DPUCZDX8G)
+~~~~~~~~~~~~~~~~
+
+
+For edge deployment we make use of two systems referred to as host and
+edge. The `host <#host-requirements>`__ system is responsible for
+quantization and compilation of the neural network model in an initial
+offline step. Afterwards, the model will be deployed on the
+`edge <#edge-requirements>`__ system.
+
+Host requirements
+^^^^^^^^^^^^^^^^^
+
+The following table lists system requirements for running the TVM -
+Vitis-AI docker container.
+
++-----------------------------------------------------+----------------------------------------------+
+| **Component** | **Requirement** |
++=====================================================+==============================================+
+| Operating System | Ubuntu 16.04, 18.04 |
++-----------------------------------------------------+----------------------------------------------+
+| | CentOS 7.4, 7.5 |
++-----------------------------------------------------+----------------------------------------------+
+| | RHEL 7.4, 7.5 |
++-----------------------------------------------------+----------------------------------------------+
+| CPU | Intel i3/i5/i7/i9/Xeon 64-bit CPU |
++-----------------------------------------------------+----------------------------------------------+
+| GPU (Optional to accelerate quantization) | NVIDIA GPU with a compute capability > 3.0 |
++-----------------------------------------------------+----------------------------------------------+
+| CUDA Driver (Optional to accelerate quantization) | nvidia-410 |
++-----------------------------------------------------+----------------------------------------------+
+| FPGA | Not necessary on host |
++-----------------------------------------------------+----------------------------------------------+
+| Docker Version | 19.03.1 |
++-----------------------------------------------------+----------------------------------------------+
+
+Host setup and docker build
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. Clone the TVM repo:
+
+ .. code:: bash
+
+ git clone --recursive https://github.com/apache/tvm.git
+
+2. Build and start the TVM runtime Vitis-AI docker container.
+
+ .. code:: bash
+
+ cd tvm
+ ./docker/build.sh demo_vitis_ai bash
+ ./docker/bash.sh tvm.demo_vitis_ai
+
+ #Setup inside container
+ . $VAI_ROOT/conda/etc/profile.d/conda.sh
+ conda activate vitis-ai-tensorflow
+
+3. Install PyXIR
+
+ .. code:: bash
+
+ git clone --recursive https://github.com/Xilinx/pyxir.git
+ cd pyxir
+ python3 setup.py install --user
+
+
+4. Build TVM inside the container with Vitis-AI.
+
+ .. code:: bash
+
+ cd tvm
+ mkdir build
+ cp cmake/config.cmake build
+ cd build
+ echo set\(USE_LLVM ON\) >> config.cmake
+ echo set\(USE_VITIS_AI ON\) >> config.cmake
+ cmake ..
+ make -j$(nproc)
+
+5. Install TVM
+
+ .. code:: bash
+
+ cd tvm/python
+ pip3 install -e . --user
+
+Edge requirements
+^^^^^^^^^^^^^^^^^
+
+The DPUCZDX8G can be deployed on the `Zynq Ultrascale+
+MPSoc `__
+platform. The following development boards can be used out-of-the-box:
+
++--------------------+----------------------+-----------------------------------------------------------------------+
+| **Target board** | **TVM identifier** | **Info** |
++====================+======================+=======================================================================+
+| Ultra96 | DPUCZDX8G-ultra96 | https://www.xilinx.com/products/boards-and-kits/1-vad4rl.html |
++--------------------+----------------------+-----------------------------------------------------------------------+
+| ZCU104 | DPUCZDX8G-zcu104 | https://www.xilinx.com/products/boards-and-kits/zcu104.html |
++--------------------+----------------------+-----------------------------------------------------------------------+
+| ZCU102 | DPUCZDX8G-zcu102 | https://www.xilinx.com/products/boards-and-kits/ek-u1-zcu102-g.html |
++--------------------+----------------------+-----------------------------------------------------------------------+
+
+Edge hardware setup
+^^^^^^^^^^^^^^^^^^^
+.. note::
+
+ This section provides instructions for setting up with the `Pynq `__ platform, but
+ PetaLinux-based flows are also supported.
+
+1. Download the Pynq v2.5 image for your target (use Z1 or Z2 for
+ Ultra96 target depending on board version). Link to image:
+ https://github.com/Xilinx/PYNQ/releases/tag/v2.5
+2. Follow Pynq instructions for setting up the board: `pynq
+ setup `__
+3. After connecting to the board, make sure to run as root. Execute
+ ``su``
+4. Set up DPU on Pynq by following the steps here: `DPU Pynq
+ setup `__
+5. Run the following command to download the DPU bitstream:
+
+ .. code:: bash
+
+ python3 -c 'from pynq_dpu import DpuOverlay ; overlay = DpuOverlay("dpu.bit")'
+
+6. Check whether the DPU kernel is alive:
+
+ .. code:: bash
+
+ dexplorer -w
+
+Edge TVM setup
+^^^^^^^^^^^^^^
+
+.. note::
+
+ When working on PetaLinux instead of Pynq, the following steps might take more manual work (e.g. building
+ hdf5 from source). Also, TVM has a scipy dependency which you then might have to build from source or
+ circumvent. We don't depend on scipy in our flow.
+
+Building TVM depends on the Xilinx
+`PyXIR `__ package. PyXIR acts as an
+interface between TVM and Vitis-AI tools.
+
+1. First install PyXIR's h5py and pydot dependencies:
+
+ .. code:: bash
+
+ apt-get install libhdf5-dev
+ pip3 install pydot h5py
+
+2. Install PyXIR
+
+ .. code:: bash
+
+ git clone --recursive https://github.com/Xilinx/pyxir.git
+ cd pyxir
+ sudo python3 setup.py install --use_vai_rt_dpuczdx8g
+
+3. Build TVM with Vitis-AI
+
+ .. code:: bash
+
+ git clone --recursive https://github.com/apache/tvm
+ cd tvm
+ mkdir build
+ cp cmake/config.cmake build
+ cd build
+ echo set\(USE_VITIS_AI ON\) >> config.cmake
+ cmake ..
+ make
+
+4. Install TVM
+
+ .. code:: bash
+
+ cd tvm/python
+ pip3 install -e . --user
+
+5. Check whether the setup was successful in the Python shell:
+
+ .. code:: bash
+
+ python3 -c 'import pyxir; import tvm'
+
+
+Getting started
+---------------
+
+This section shows how to use TVM with Vitis-AI. For this it's important
+to understand that neural network models are quantized for Vitis-AI
+execution in fixed point arithmetic. The approach we take here is to
+quantize on-the-fly using the first N inputs as explained in the next
+section.
+
+On-the-fly quantization
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Usually, to be able to accelerate inference of neural network models
+with Vitis-AI DPU accelerators, those models need to be quantized upfront.
+In the TVM - Vitis-AI flow, we make use of on-the-fly quantization to remove
+this additional preprocessing step. In this flow, one doesn't need to
+quantize their model upfront but can make use of the typical inference
+execution calls (module.run) to quantize the model on-the-fly using the
+first N inputs that are provided (see more information below). This will
+set up and calibrate the Vitis-AI DPU and from that point onwards
+inference will be accelerated for all subsequent inputs. Note that the edge
+flow deviates slightly from the flow described here in that inference won't
+be accelerated after the first N inputs; instead, the model will have been
+quantized and compiled and can be moved to the edge device for
+deployment. Please check out the `edge <#Edge%20usage>`__ usage
+instructions below for more information.
+
+Config/Settings
+~~~~~~~~~~~~~~~
+
+A couple of environment variables can be used to customize the Vitis-AI
+Byoc flow.
+
++----------------------------+----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| **Environment Variable** | **Default if unset** | **Explanation** |
++============================+========================================+============================================================================================================================================================================================================================================================================================================================================+
+| PX\_QUANT\_SIZE | 128 | The number of inputs that will be used for quantization (necessary for Vitis-AI acceleration) |
++----------------------------+----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| PX\_BUILD\_DIR | Use the on-the-fly quantization flow | Loads the quantization and compilation information from the provided build directory and immediately starts Vitis-AI hardware acceleration. This configuration can be used if the model has been executed before using on-the-fly quantization during which the quantization and compilation information was cached in a build directory. |
++----------------------------+----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
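+
+For example, to calibrate with 64 inputs instead of the default 128, or to reuse
+a previously cached build directory (the path below is a placeholder), set the
+variables before running the TVM module:
+
+.. code:: bash
+
+ export PX_QUANT_SIZE=64
+ export PX_BUILD_DIR=/path/to/cached/build_dir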
+
+Cloud usage
+~~~~~~~~~~~
+
+This section shows how to accelerate a convolutional neural network
+model in TVM with Vitis-AI on the cloud.
+
+To be able to target the Vitis-AI cloud DPUCADX8G target, we first have
+to import the target in PyXIR. The PyXIR package is the interface
+used by TVM to integrate with the Vitis-AI stack. Additionally, import
+the typical TVM and Relay modules and the Vitis-AI contrib module inside
+TVM.
+
+.. code:: python
+
+ import pyxir
+ import pyxir.contrib.target.DPUCADX8G
+
+ import tvm
+ import tvm.relay as relay
+ from tvm.contrib.target import vitis_ai
+ from tvm.contrib import util, graph_runtime
+ from tvm.relay.build_module import bind_params_by_name
+ from tvm.relay.op.contrib.vitis_ai import annotation
+
+After importing a convolutional neural network model using the usual
+Relay APIs, annotate the Relay expression for the given Vitis-AI DPU
+target and partition the graph.
+
+.. code:: python
+
+ mod["main"] = bind_params_by_name(mod["main"], params)
+ mod = annotation(mod, params, target)
+ mod = relay.transform.MergeCompilerRegions()(mod)
+ mod = relay.transform.PartitionGraph()(mod)
+
+Now, we can build the TVM runtime library for executing the model. The
+TVM target is 'llvm' as the operations that can't be handled by the DPU
+are executed on the CPU. The Vitis-AI target is DPUCADX8G, as we are
+targeting the cloud DPU; this target is passed as a config to the TVM
+build call.
+
+.. code:: python
+
+ tvm_target = 'llvm'
+ target = 'DPUCADX8G'
+
+ with tvm.transform.PassContext(opt_level=3, config={'relay.ext.vitis_ai.options.target': target}):
+     lib = relay.build(mod, tvm_target, params=params)
+
+As one more step before we can accelerate a model with Vitis-AI in TVM
+we have to quantize and compile the model for execution on the DPU. We
+make use of on-the-fly quantization for this. Using this method one
+doesn’t need to quantize their model upfront and can make use of the
+typical inference execution calls (module.run) to calibrate the model
+on-the-fly using the first N inputs that are provided. After the first N
+iterations, computations will be accelerated on the DPU. So now we will
+feed N inputs to the TVM runtime module. Note that these first N inputs
+will take a substantial amount of time.
+
+.. code:: python
+
+ module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
+
+ # First N (default = 128) inputs are used for quantization calibration and will
+ # be executed on the CPU
+ # This config can be changed by setting the 'PX_QUANT_SIZE' (e.g. export PX_QUANT_SIZE=64)
+ for i in range(128):
+     module.set_input(input_name, inputs[i])
+     module.run()
+
+Afterwards, inference will be accelerated on the DPU.
+
+.. code:: python
+
+ module.set_input(name, data)
+ module.run()
+
+To save and load the built module, one can use the typical TVM APIs:
+
+.. code:: python
+
+ lib_path = "deploy_lib.so"
+ lib.export_library(lib_path)
+
+Load the module from compiled files and run inference:
+
+.. code:: python
+
+ # load the module into memory
+ loaded_lib = tvm.runtime.load_module(lib_path)
+
+ module = graph_runtime.GraphModule(loaded_lib["default"](tvm.cpu()))
+ module.set_input(name, data)
+ module.run()
+
+Edge usage
+~~~~~~~~~~
+
+This section shows how to accelerate a convolutional neural network
+model in TVM with Vitis-AI at the edge. The first couple of steps will
+have to be run on the host machine and take care of quantization and
+compilation for deployment at the edge.
+
+Host steps
+^^^^^^^^^^
+
+To be able to target the Vitis-AI edge DPUCZDX8G target, we first have
+to import the target in PyXIR. The PyXIR package is the interface
+used by TVM to integrate with the Vitis-AI stack. Additionally, import
+the typical TVM and Relay modules and the Vitis-AI contrib module inside
+TVM.
+
+.. code:: python
+
+ import pyxir
+ import pyxir.contrib.target.DPUCZDX8G
+
+ import tvm
+ import tvm.relay as relay
+ from tvm.contrib.target import vitis_ai
+ from tvm.contrib import util, graph_runtime
+ from tvm.relay.build_module import bind_params_by_name
+ from tvm.relay.op.contrib.vitis_ai import annotation
+
+After importing a convolutional neural network model using the usual
+Relay APIs, annotate the Relay expression for the given Vitis-AI DPU
+target and partition the graph.
+
+.. code:: python
+
+ mod["main"] = bind_params_by_name(mod["main"], params)
+ mod = annotation(mod, params, target)
+ mod = relay.transform.MergeCompilerRegions()(mod)
+ mod = relay.transform.PartitionGraph()(mod)
+
+Now, we can build the TVM runtime library for executing the model. The
+TVM target is 'llvm' as the operations that can't be handled by the DPU
+are executed on the CPU. At this point that means the CPU on the host machine.
+The Vitis-AI target is DPUCZDX8G-zcu104 as we are targeting the edge DPU
+on the ZCU104 board and this target is passed as a config to the TVM
+build call. Note that different identifiers can be passed for different
+targets, see `edge targets info <#edge-requirements>`__. Additionally, we
+provide the 'export_runtime_module' config that points to a file to which we
+can export the Vitis-AI runtime module. We have to do this because we will
+first be compiling and quantizing the model on the host machine before building
+the model for edge deployment. As you will see later on, the exported runtime
+module will be passed to the edge build so that the Vitis-AI runtime module
+can be included.
+
+.. code:: python
+
+ from tvm.contrib import util
+
+ temp = util.tempdir()
+
+ tvm_target = 'llvm'
+ target='DPUCZDX8G-zcu104'
+ export_rt_mod_file = temp.relpath("vitis_ai.rtmod")
+
+ with tvm.transform.PassContext(opt_level=3,
+                                config={'relay.ext.vitis_ai.options.target': target,
+                                        'relay.ext.vitis_ai.options.export_runtime_module': export_rt_mod_file}):
+     lib = relay.build(mod, tvm_target, params=params)
+
+We will quantize and compile the model for execution on the DPU using on-the-fly
+quantization on the host machine. This makes use of TVM inference calls
+(module.run) to quantize the model on the host with the first N inputs.
+
+.. code:: python
+
+ module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
+
+ # First N (default = 128) inputs are used for quantization calibration and will
+ # be executed on the CPU
+ # This config can be changed by setting the 'PX_QUANT_SIZE' (e.g. export PX_QUANT_SIZE=64)
+ for i in range(128):
+     module.set_input(input_name, inputs[i])
+     module.run()
+
+Save the TVM lib module so that the Vitis-AI runtime module will also be exported
+(to the 'export_runtime_module' path we previously passed as a config).
+
+.. code:: python
+
+ from tvm.contrib import util
+
+ temp = util.tempdir()
+ lib.export_library(temp.relpath("tvm_lib.so"))
+
+After quantizing and compiling the model for Vitis-AI acceleration using the
+first N inputs we can build the model for execution on the ARM edge device.
+Here we pass the previously exported Vitis-AI runtime module so it can be included
+in the TVM build.
+
+.. code:: python
+
+ # Export lib for aarch64 target
+ from tvm.contrib import cc
+
+ tvm_target = tvm.target.arm_cpu('ultra96')
+ lib_kwargs = {
+     'fcompile': cc.create_shared,
+     'cc': "/usr/aarch64-linux-gnu/bin/ld"
+ }
+
+ with tvm.transform.PassContext(opt_level=3,
+                                config={'relay.ext.vitis_ai.options.load_runtime_module': export_rt_mod_file}):
+     lib_arm = relay.build(mod, tvm_target, params=params)
+
+ lib_arm.export_library('tvm_dpu_arm.so', **lib_kwargs)
+
+Now, move the TVM build files (tvm\_dpu\_arm.json, tvm\_dpu\_arm.so,
+tvm\_dpu\_arm.params) to the edge device. For information on setting
+up the edge device check out the `edge setup <#edge-dpuczdx8g>`__
+section.
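+
+For example, assuming the board is reachable over the network (the address and
+destination path below are placeholders):
+
+.. code:: bash
+
+ scp tvm_dpu_arm.* root@<board-ip>:/home/root/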
+
+Edge steps
+^^^^^^^^^^
+
+After setting up TVM with Vitis-AI on the edge device, you can now load
+the TVM runtime module into memory and feed inputs for inference.
+
+.. code:: python
+
+ ctx = tvm.cpu()
+
+ # load the module into memory
+ lib = tvm.runtime.load_module("tvm_dpu_arm.so")
+
+ module = graph_runtime.GraphModule(lib["default"](ctx))
+ module.set_input(name, data)
+ module.run()
diff --git a/docs/dev/convert_layout.rst b/docs/dev/convert_layout.rst
index 07ebc2048dd3..53038e9605e8 100644
--- a/docs/dev/convert_layout.rst
+++ b/docs/dev/convert_layout.rst
@@ -157,7 +157,7 @@ First example is for layout agnostic operators. These operators do not have any
Layout ret;
if (new_in_layouts.defined()) {
- CHECK_GE(new_in_layouts.size(), 1);
+ ICHECK_GE(new_in_layouts.size(), 1);
ret = new_in_layouts[0];
} else {
for (size_t i = 0; i < old_in_layouts.size(); ++i) {
@@ -227,6 +227,7 @@ Second example is for a lightly-layout sensitive operator - batch normalization.
********
4. Usage
********
+.. _convert-layout-usage:
ConvertLayout pass is extremely easy to use. The pass is not a part of default relay.build pipeline. The intended usage is to call it between the framework-to-relay parser and relay.build module call.
@@ -264,5 +265,5 @@ The ordering of the layouts is defined by the implementation of `register_conver
Current implementation has support for almost all the operators commonly used in image classification models. However, if one encounters too many data layout transforms in the graph, it is highly likely that there is an operator whose layouts need special handling as described in Section 3. Some pull requests that can help in such a situation are
-- Layout inference for `Batch Norm `_ - Batch normalization falls into the category of lightly-sensitive operator. The PR shows how to handle the layout inference for batch norm.
-- Python Callback for `Convolution `_- For highly-sensitive operators, one might have to do python callback as well. The PR shows how to define a python callback function for Convolution operator.
+- Layout inference for `Batch Norm `_ - Batch normalization falls into the category of lightly-sensitive operator. The PR shows how to handle the layout inference for batch norm.
+- Python Callback for `Convolution `_- For highly-sensitive operators, one might have to do python callback as well. The PR shows how to define a python callback function for Convolution operator.
diff --git a/docs/dev/frontend/tensorflow.rst b/docs/dev/frontend/tensorflow.rst
index b234ed7b0466..dde7179d90db 100644
--- a/docs/dev/frontend/tensorflow.rst
+++ b/docs/dev/frontend/tensorflow.rst
@@ -57,7 +57,7 @@ Export
TensorFlow frontend expects a frozen protobuf (.pb) or saved model as input. It currently does not support checkpoint (.ckpt). The graphdef needed by the TensorFlow frontend can be extracted from the active session, or by using the `TFParser`_ helper class.
-.. _TFParser: https://github.com/apache/incubator-tvm/blob/main/python/tvm/relay/frontend/tensorflow_parser.py
+.. _TFParser: https://github.com/apache/tvm/blob/main/python/tvm/relay/frontend/tensorflow_parser.py
The model should be exported with a number of transformations to prepare the model for inference. It is also important to set ```add_shapes=True```, as this will embed the output shapes of each node into the graph. Here is one function to export a model as a protobuf given a session:
@@ -101,7 +101,7 @@ Import the Model
Explicit Shape:
~~~~~~~~~~~~~~~
-To ensure shapes can be known throughout the entire graph, pass the ```shape``` argument to ```from_tensorflow```. This dictionary maps input names to input shapes. Please refer to these `test cases `_ for examples.
+To ensure shapes can be known throughout the entire graph, pass the ```shape``` argument to ```from_tensorflow```. This dictionary maps input names to input shapes. Please refer to these `test cases `_ for examples.
Data Layout
~~~~~~~~~~~
diff --git a/docs/dev/inferbound.rst b/docs/dev/inferbound.rst
index 7d0127a6c039..010d0d42d37e 100644
--- a/docs/dev/inferbound.rst
+++ b/docs/dev/inferbound.rst
@@ -22,7 +22,7 @@ InferBound Pass
*******************************************
-The InferBound pass is run after normalize, and before ScheduleOps `build_module.py `_. The main job of InferBound is to create the bounds map, which specifies a Range for each IterVar in the program. These bounds are then passed to ScheduleOps, where they are used to set the extents of For loops, see `MakeLoopNest `_, and to set the sizes of allocated buffers (`BuildRealize `_), among other uses.
+The InferBound pass is run after normalize, and before ScheduleOps `build_module.py `_. The main job of InferBound is to create the bounds map, which specifies a Range for each IterVar in the program. These bounds are then passed to ScheduleOps, where they are used to set the extents of For loops, see `MakeLoopNest `_, and to set the sizes of allocated buffers (`BuildRealize `_), among other uses.
The output of InferBound is a map from IterVar to Range:
@@ -53,9 +53,9 @@ Therefore, let's review the Range and IterVar classes:
};
}
-Note that IterVarNode also contains a Range ``dom``. This ``dom`` may or may not have a meaningful value, depending on when the IterVar was created. For example, when ``tvm.compute`` is called, an `IterVar is created `_ for each axis and reduce axis, with dom's equal to the shape supplied in the call to ``tvm.compute``.
+Note that IterVarNode also contains a Range ``dom``. This ``dom`` may or may not have a meaningful value, depending on when the IterVar was created. For example, when ``tvm.compute`` is called, an `IterVar is created `_ for each axis and reduce axis, with dom's equal to the shape supplied in the call to ``tvm.compute``.
-On the other hand, when ``tvm.split`` is called, `IterVars are created `_ for the inner and outer axes, but these IterVars are not given a meaningful ``dom`` value.
+On the other hand, when ``tvm.split`` is called, `IterVars are created `_ for the inner and outer axes, but these IterVars are not given a meaningful ``dom`` value.
In any case, the ``dom`` member of an IterVar is never modified during InferBound. However, keep in mind that the ``dom`` member of an IterVar is sometimes used as default value for the Ranges InferBound computes.
@@ -117,7 +117,7 @@ Tensors haven't been mentioned yet, but in the context of TVM, a Tensor represen
int value_index;
};
-In the Operation class declaration above, we can see that each operation also has a list of InputTensors. Thus the stages of the schedule form a DAG, where each stage is a node in the graph. There is an edge in the graph from Stage A to Stage B, if the operation of Stage B has an input tensor whose source operation is the op of Stage A. Put simply, there is an edge from A to B, if B consumes a tensor produced by A. See the diagram below. This graph is created at the beginning of InferBound, by a call to `CreateReadGraph `_.
+In the Operation class declaration above, we can see that each operation also has a list of InputTensors. Thus the stages of the schedule form a DAG, where each stage is a node in the graph. There is an edge in the graph from Stage A to Stage B, if the operation of Stage B has an input tensor whose source operation is the op of Stage A. Put simply, there is an edge from A to B, if B consumes a tensor produced by A. See the diagram below. This graph is created at the beginning of InferBound, by a call to `CreateReadGraph `_.
.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/stage_graph.png
:align: center
diff --git a/docs/dev/introduction_to_module_serialization.rst b/docs/dev/introduction_to_module_serialization.rst
index 5451b84c9b8c..6b2f2addaf9a 100644
--- a/docs/dev/introduction_to_module_serialization.rst
+++ b/docs/dev/introduction_to_module_serialization.rst
@@ -32,7 +32,7 @@ Let us build one ResNet-18 workload for GPU as an example first.
from tvm import relay
from tvm.relay import testing
- from tvm.contrib import util
+ from tvm.contrib import utils
import tvm
# Resnet18 workload
@@ -43,7 +43,7 @@ Let us build one ResNet-18 workload for GPU as an example first.
_, resnet18_lib, _ = relay.build_module.build(resnet18_mod, "cuda", params=resnet18_params)
# create one tempory directory
- temp = util.tempdir()
+ temp = utils.tempdir()
# path lib
file_name = "deploy.so"
diff --git a/docs/dev/pass_infra.rst b/docs/dev/pass_infra.rst
index 1427608a4574..3680cb886952 100644
--- a/docs/dev/pass_infra.rst
+++ b/docs/dev/pass_infra.rst
@@ -276,12 +276,12 @@ order that they were appended to the pass list.
const PassContext& pass_ctx) const {
Module mod = module;
for (const Pass& pass : passes) {
- CHECK(pass.defined()) << "Found undefined pass for optimization.";
+ ICHECK(pass.defined()) << "Found undefined pass for optimization.";
const PassInfo& pass_info = pass->Info();
if (!PassEnabled(pass_info)) continue;
for (const auto& it : pass_info->required) {
const auto* name = it.as();
- CHECK(name);
+ ICHECK(name);
mod = GetPass(name->value)(mod, pass_ctx);
}
mod = pass(mod, pass_ctx);
@@ -306,7 +306,7 @@ pass is registered with an API endpoint as we will show later.
using tvm::runtime::Registry;
std::string fpass_name = "relay._transform." + pass_name;
const auto* f = Registry::Get(fpass_name);
- CHECK(f != nullptr) << "Cannot find " << fpass_name
+ ICHECK(f != nullptr) << "Cannot find " << fpass_name
<< "to create the pass " << pass_name;
return (*f)();
}
@@ -528,22 +528,22 @@ optimization pipeline and debug Relay and tir passes, please refer to the
.. _Sequential: https://pytorch.org/docs/stable/nn.html?highlight=sequential#torch.nn.Sequential
-.. _Block: https://mxnet.incubator.apache.org/api/python/docs/api/gluon/block.html#gluon-block
+.. _Block: https://mxnet.apache.org/api/python/docs/api/gluon/block.html#gluon-block
-.. _include/tvm/ir/transform.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/ir/transform.h
+.. _include/tvm/ir/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/ir/transform.h
-.. _src/relay/ir/transform.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/ir/transform.cc
+.. _src/relay/ir/transform.cc: https://github.com/apache/tvm/blob/main/src/relay/ir/transform.cc
-.. _src/ir/transform.cc: https://github.com/apache/incubator-tvm/blob/main/src/ir/transform.cc
+.. _src/ir/transform.cc: https://github.com/apache/tvm/blob/main/src/ir/transform.cc
-.. _src/relay/pass/fold_constant.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/pass/fold_constant.cc
+.. _src/relay/pass/fold_constant.cc: https://github.com/apache/tvm/blob/main/src/relay/pass/fold_constant.cc
-.. _python/tvm/relay/transform.py: https://github.com/apache/incubator-tvm/blob/main/python/tvm/relay/transform.py
+.. _python/tvm/relay/transform.py: https://github.com/apache/tvm/blob/main/python/tvm/relay/transform.py
-.. _include/tvm/relay/transform.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/relay/transform.h
+.. _include/tvm/relay/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/relay/transform.h
-.. _python/tvm/ir/transform.py: https://github.com/apache/incubator-tvm/blob/main/python/tvm/ir/transform.py
+.. _python/tvm/ir/transform.py: https://github.com/apache/tvm/blob/main/python/tvm/ir/transform.py
-.. _src/tir/transforms/unroll_loop.cc: https://github.com/apache/incubator-tvm/blob/main/src/tir/transforms/unroll_loop.cc
+.. _src/tir/transforms/unroll_loop.cc: https://github.com/apache/tvm/blob/main/src/tir/transforms/unroll_loop.cc
-.. _use pass infra: https://github.com/apache/incubator-tvm/blob/main/tutorials/dev/use_pass_infra.py
+.. _use pass infra: https://github.com/apache/tvm/blob/main/tutorials/dev/use_pass_infra.py
diff --git a/docs/dev/relay_add_op.rst b/docs/dev/relay_add_op.rst
index 7dca251dd532..0697939be162 100644
--- a/docs/dev/relay_add_op.rst
+++ b/docs/dev/relay_add_op.rst
@@ -231,7 +231,7 @@ Adding a Gradient in C++
Adding a gradient in C++ is similar to adding one in Python, but the
interface for registering is slightly different.
-First, make sure ``src/relay/pass/pattern_util.h`` is included. It provides
+First, make sure ``src/relay/pass/pattern_utils.h`` is included. It provides
helper functions for creating nodes in the Relay AST. Then, define the
gradient in a similar fashion as in the Python example:
diff --git a/docs/dev/relay_add_pass.rst b/docs/dev/relay_add_pass.rst
index 02c0ba2808ad..0661df0ae35a 100644
--- a/docs/dev/relay_add_pass.rst
+++ b/docs/dev/relay_add_pass.rst
@@ -399,8 +399,8 @@ information about the pass manager interface can be found in :ref:`pass-infra`.
Relay's standard passes are listed in `include/tvm/relay/transform.h`_ and implemented
in `src/relay/pass/`_.
-.. _include/tvm/relay/transform.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/relay/transform.h
+.. _include/tvm/relay/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/relay/transform.h
-.. _src/relay/pass/: https://github.com/apache/incubator-tvm/tree/main/src/relay/pass
+.. _src/relay/pass/: https://github.com/apache/tvm/tree/main/src/relay/pass
-.. _src/relay/transforms/fold_constant.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/transforms/fold_constant.cc
+.. _src/relay/transforms/fold_constant.cc: https://github.com/apache/tvm/blob/main/src/relay/transforms/fold_constant.cc
diff --git a/docs/dev/relay_bring_your_own_codegen.rst b/docs/dev/relay_bring_your_own_codegen.rst
index f4ee58a6902b..3fcd3365c82f 100644
--- a/docs/dev/relay_bring_your_own_codegen.rst
+++ b/docs/dev/relay_bring_your_own_codegen.rst
@@ -137,7 +137,7 @@ Here we highlight the notes marked in the above code:
* **Note 3** is a TVM runtime compatible wrapper function. It accepts a list of input tensors and one output tensor (the last argument), casts them to the right data type, and invokes the subgraph function described in Note 2. In addition, ``TVM_DLL_EXPORT_TYPED_FUNC`` is a TVM macro that generates another function ``gcc_0`` with unified the function arguments by packing all tensors to ``TVMArgs``. As a result, the TVM runtime can directly invoke ``gcc_0`` to execute the subgraph without additional efforts. With the above code generated, TVM is able to compile it along with the rest parts of the graph and export a single library for deployment.
-In the rest of this section, we will implement a codegen step-by-step to generate the above code. Your own codegen has to be located at ``src/relay/backend/contrib//``. In our example, we name our codegen "codegen_c" and put it under `/src/relay/backend/contrib/codegen_c/ `_. Feel free to check this file for a complete implementation.
+In the rest of this section, we will implement a codegen step-by-step to generate the above code. Your own codegen has to be located at ``src/relay/backend/contrib//``. In our example, we name our codegen "codegen_c" and put it under `/src/relay/backend/contrib/codegen_c/ `_. Feel free to check this file for a complete implementation.
Specifically, we are going to implement two classes in this file and here is their relationship:
@@ -296,7 +296,7 @@ As mentioned in the previous step, in addition to the subgraph input and output
// This example only supports single output.
auto type_node = call->checked_type().as();
- CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32))
+ ICHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32))
<< "Only support single output tensor with float type";
// Generate a unique buffer name.
@@ -410,7 +410,7 @@ Implement GenCFunc
.. code-block:: c++
void GenCFunc(const Function& func) {
- CHECK(func.defined()) << "Input error: expect a Relay function.";
+ ICHECK(func.defined()) << "Input error: expect a Relay function.";
// Record the external symbol for runtime lookup.
auto sid = GetExtSymbol(func);
@@ -474,7 +474,7 @@ This function creates a runtime module for the external library. In this example
// Create a CSourceModule
const auto* pf = runtime::Registry::Get("module.csource_module_create");
- CHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module";
+ ICHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module";
return (*pf)(code_stream_.str(), "cc");
}
@@ -556,7 +556,7 @@ In this section, our goal is to implement the following customized TVM runtime m
ExampleJsonCodeGen codegen(ref);
std::string code = codegen.gen(); // Note 1
const auto* pf = runtime::Registry::Get("module.examplejson_module_create"); // Note 2
- CHECK(pf != nullptr) << "Cannot find ExampleJson module to create the external runtime module";
+ ICHECK(pf != nullptr) << "Cannot find ExampleJson module to create the external runtime module";
return (*pf)(code);
}
TVM_REGISTER_GLOBAL("relay.ext.examplejsoncompiler").set_body_typed(ExampleJsonCompiler);
@@ -785,7 +785,7 @@ After the construction, we should have the above class variables ready. We then
// Copy input tensors to corresponding data entries.
for (auto i = 0; i < args.size(); ++i) {
- CHECK(args[i].type_code() == kNDArrayContainer || args[i].type_code() == kArrayHandle)
+ ICHECK(args[i].type_code() == kNDArrayContainer || args[i].type_code() == kArrayHandle)
<< "Expect NDArray or DLTensor as inputs\n";
if (args[i].type_code() == kArrayHandle) {
DLTensor* arg = args[i];
@@ -800,7 +800,7 @@ After the construction, we should have the above class variables ready. We then
for (const auto& it : this->graph_[this->curr_subgraph_]) {
this->Run(it.id, it.inputs, it.output);
}
- CHECK_GT(graph_.count(this->curr_subgraph_), 0U);
+ ICHECK_GT(graph_.count(this->curr_subgraph_), 0U);
// Copy the output from a data entry back to TVM runtime argument.
auto out_idx = graph_[this->curr_subgraph_].back().output;
diff --git a/docs/dev/runtime.rst b/docs/dev/runtime.rst
index 91b19eee3230..c77b693f0749 100644
--- a/docs/dev/runtime.rst
+++ b/docs/dev/runtime.rst
@@ -45,7 +45,7 @@ PackedFunc
`PackedFunc`_ is a simple but elegant solution
we find to solve the challenges listed. The following code block provides an example in C++
-.. _PackedFunc: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/packed_func.h
+.. _PackedFunc: https://github.com/apache/tvm/blob/main/include/tvm/runtime/packed_func.h
.. code:: c
@@ -131,9 +131,9 @@ which allows us to embed the PackedFunc into any languages. Besides python, so f
`java`_ and `javascript`_.
This philosophy of embedded API is very like Lua, except that we don't have a new language but use C++.
-.. _minimum C API: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/c_runtime_api.h
-.. _java: https://github.com/apache/incubator-tvm/tree/main/jvm
-.. _javascript: https://github.com/apache/incubator-tvm/tree/main/web
+.. _minimum C API: https://github.com/apache/tvm/blob/main/include/tvm/runtime/c_runtime_api.h
+.. _java: https://github.com/apache/tvm/tree/main/jvm
+.. _javascript: https://github.com/apache/tvm/tree/main/web
One fun fact about PackedFunc is that we use it for both compiler and deployment stack.
@@ -141,7 +141,7 @@ One fun fact about PackedFunc is that we use it for both compiler and deployment
- All TVM's compiler pass functions are exposed to frontend as PackedFunc, see `here`_
- The compiled module also returns the compiled function as PackedFunc
-.. _here: https://github.com/apache/incubator-tvm/tree/main/src/api
+.. _here: https://github.com/apache/tvm/tree/main/src/api
To keep the runtime minimum, we isolated the IR Object support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules (e.g., CUDA) get included.
@@ -162,7 +162,7 @@ TVM defines the compiled object as `Module`_.
The user can get the compiled function from Module as PackedFunc.
The generated compiled code can dynamically get function from Module in runtime. It caches the function handle in the first call and reuses in subsequent calls. We use this to link device code and callback into any PackedFunc(e.g., python) from generated code.
-.. _Module: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/module.h
+.. _Module: https://github.com/apache/tvm/blob/main/include/tvm/runtime/module.h
The ModuleNode is an abstract class that can be implemented by each type of device.
So far we support modules for CUDA, Metal, OpenCL and loading dynamic shared libraries. This abstraction makes introduction
@@ -198,7 +198,7 @@ All the language object in the compiler stack is a subclass of ``Object``. Each
the type of object. We choose string instead of int as type key so new ``Object`` class can be added in the decentralized fashion without
adding the code back to the central repo. To ease the speed of dispatching, we allocate an integer type_index at runtime for each type_key.
-.. _Object: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/object.h
+.. _Object: https://github.com/apache/tvm/blob/main/include/tvm/runtime/object.h
Since usually one ``Object`` could be referenced in multiple places in the language, we use a shared_ptr to keep
track of reference. We use ``ObjectRef`` class to represent a reference to the ``Object``.
@@ -279,17 +279,17 @@ Each argument in PackedFunc contains a union value `TVMValue`_
and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and statically typed language to
do runtime type checking during conversion.
-.. _TVMValue: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/c_runtime_api.h#L122
+.. _TVMValue: https://github.com/apache/tvm/blob/main/include/tvm/runtime/c_runtime_api.h#L122
The relevant files are
- `packed_func.h`_ for C++ API
- `c_runtime_api.cc`_ for C API and how to provide callback.
-.. _packed_func.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/packed_func.h
-.. _c_runtime_api.cc: https://github.com/apache/incubator-tvm/blob/main/src/runtime/c_runtime_api.cc#L262
+.. _packed_func.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/packed_func.h
+.. _c_runtime_api.cc: https://github.com/apache/tvm/blob/main/src/runtime/c_runtime_api.cc#L262
To support extension types, we used a registry system to register type related information, like support of any
in C++, see `Extension types`_ for more details.
-.. _Extension types: https://github.com/apache/incubator-tvm/tree/main/apps/extension
+.. _Extension types: https://github.com/apache/tvm/tree/main/apps/extension
diff --git a/docs/dev/virtual_machine.rst b/docs/dev/virtual_machine.rst
index 0986328811dc..9081d50b92ef 100644
--- a/docs/dev/virtual_machine.rst
+++ b/docs/dev/virtual_machine.rst
@@ -278,11 +278,11 @@ to represent tensor, tuple/list, and closure data, respectively. More details
for each of them can be found at `include/tvm/runtime/ndarray.h`_,
`include/tvm/runtime/vm/vm.h`_, and `include/tvm/runtime/container.h`_, respectively.
-.. _include/tvm/runtime/ndarray.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/ndarray.h
+.. _include/tvm/runtime/ndarray.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/ndarray.h
-.. _include/tvm/runtime/vm/vm.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/vm/vm.h
+.. _include/tvm/runtime/vm/vm.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/vm/vm.h
-.. _include/tvm/runtime/container.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/container.h
+.. _include/tvm/runtime/container.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/container.h
Stack and State
~~~~~~~~~~~~~~~
@@ -326,7 +326,7 @@ The functions contain metadata about the function as well as its compiled byteco
object then can be loaded and run by a ``tvm::relay::vm::VirtualMachine`` object. For full definitions of the
data structures, please see `include/tvm/runtime/vm/executable.h`_ and `include/tvm/runtime/vm/vm.h`_.
-.. _include/tvm/runtime/vm/executable.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/vm/executable.h
+.. _include/tvm/runtime/vm/executable.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/vm/executable.h
Optimizations
~~~~~~~~~~~~~
@@ -343,11 +343,11 @@ Optimizations marked with `TODO` are not implemented yet.
- Tail Call Optimization (TODO)
- Liveness Analysis (TODO)
-.. _src/relay/vm/lambda_lift.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/backend/vm/lambda_lift.cc
+.. _src/relay/vm/lambda_lift.cc: https://github.com/apache/tvm/blob/main/src/relay/backend/vm/lambda_lift.cc
-.. _src/relay/vm/inline_primitives.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/backend/vm/inline_primitives.cc
+.. _src/relay/vm/inline_primitives.cc: https://github.com/apache/tvm/blob/main/src/relay/backend/vm/inline_primitives.cc
-.. _src/relay/backend/vm/compiler.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/backend/vm/compiler.cc
+.. _src/relay/backend/vm/compiler.cc: https://github.com/apache/tvm/blob/main/src/relay/backend/vm/compiler.cc
Serialization
~~~~~~~~~~~~~
@@ -386,7 +386,7 @@ load the serialized kernel binary and executable related binary code, which will
instantiate a VM object. Please refer to the `test_vm_serialization.py`_ file for more
examples.
-.. _test_vm_serialization.py: https://github.com/apache/incubator-tvm/blob/main/tests/python/relay/test_vm_serialization.py
+.. _test_vm_serialization.py: https://github.com/apache/tvm/blob/main/tests/python/relay/test_vm_serialization.py
Unresolved Questions
~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/index.rst b/docs/index.rst
index 18b2da7fc387..f407fa2d4f29 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -25,7 +25,7 @@ Get Started
-----------
- Follow the :doc:`instructions ` to install TVM.
-- Checkout the :doc:`Tutorials `.
+- Checkout the :doc:`tutorials `.
For Developers
--------------
diff --git a/docs/install/docker.rst b/docs/install/docker.rst
index 243e438b6d0c..768cad2057f8 100644
--- a/docs/install/docker.rst
+++ b/docs/install/docker.rst
@@ -28,7 +28,7 @@ Get a tvm source distribution or clone the github repo to get the auxiliary scri
.. code:: bash
- git clone --recursive https://github.com/apache/incubator-tvm tvm
+ git clone --recursive https://github.com/apache/tvm tvm
We can then use the following command to launch a docker image.
@@ -67,7 +67,7 @@ with ``localhost`` when pasting it into browser.
Docker Source
-------------
-Check out `The docker source `_ if you are interested in
+Check out `The docker source `_ if you are interested in
building your own docker images.
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 2bb6e551b1a0..3cf0a78f244f 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -34,7 +34,7 @@ It is important to clone the submodules along, with ``--recursive`` option.
.. code:: bash
- git clone --recursive https://github.com/apache/incubator-tvm tvm
+ git clone --recursive https://github.com/apache/tvm tvm
For windows users who use github tools, you can open the git shell, and type the following command.
@@ -90,7 +90,7 @@ The configuration of TVM can be modified by `config.cmake`.
you want to build for (OpenCL, RCOM, METAL, VULKAN, ...).
- To help with debugging, ensure the embedded graph runtime and debugging functions are enabled with ``set(USE_GRAPH_RUNTIME ON)`` and ``set(USE_GRAPH_RUNTIME_DEBUG ON)``
-- TVM optionally depends on LLVM. LLVM is required for CPU codegen that needs LLVM.
+- TVM requires LLVM for CPU codegen. We highly recommend building with LLVM support on.
- LLVM 4.0 or higher is needed for build with LLVM. Note that version of LLVM from default apt may lower than 4.0.
- Since LLVM takes long time to build from source, you can download pre-built version of LLVM from
@@ -102,7 +102,7 @@ The configuration of TVM can be modified by `config.cmake`.
- You can also use `LLVM Nightly Ubuntu Build `_
- Note that apt-package append ``llvm-config`` with version number.
- For example, set ``set(LLVM_CONFIG llvm-config-4.0)`` if you installed 4.0 package
+ For example, set ``set(USE_LLVM llvm-config-10)`` if you installed LLVM 10 package
- We can then build tvm and related libraries.
@@ -122,27 +122,58 @@ The configuration of TVM can be modified by `config.cmake`.
If everything goes well, we can go to :ref:`python-package-installation`
+.. _build-with-conda:
+
+Building with a Conda Environment
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Conda is a very handy way to obtain the dependencies needed for running TVM.
+First, follow `conda's installation guide `_
+to install miniconda or anaconda if you do not yet have conda on your system. Then run the following commands to create and activate the build environment:
+
+.. code:: bash
+
+ # Create a conda environment with the dependencies specified by the yaml
+ conda env create --file conda/build-environment.yaml
+ # Activate the created environment
+ conda activate tvm-build
+
+The above command will install all necessary build dependencies such as cmake and LLVM. You can then run the standard build process in the last section.
+
+If you want to use the compiled binary outside the conda environment,
+you can set LLVM to static linking mode ``set(USE_LLVM "llvm-config --link-static")``.
+In this way, the resulting library won't depend on the dynamic LLVM libraries in the conda environment.
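+
+For example, the setting can be appended to your build's ``config.cmake`` (a
+sketch following the echo pattern used elsewhere in this guide; adjust the path
+to your build directory):
+
+.. code:: bash
+
+ echo 'set(USE_LLVM "llvm-config --link-static")' >> build/config.cmake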
+
+The above instructions show how to use conda to provide the necessary build dependencies to build libtvm.
+If you are already using conda as your package manager and wish to directly build and install tvm as a conda package, you can follow the instructions below:
+
+.. code:: bash
+
+ conda build --output-folder=conda/pkg conda/recipe
+ # Run conda/build_cuda.sh to build with cuda enabled
+ conda install tvm -c ./conda/pkg
+
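+A quick smoke test of the installed conda package (a sketch; it only assumes
+the ``tvm`` package from the local channel is importable) is:
+
+.. code:: bash
+
+    python -c "import tvm; print(tvm.__file__)"
+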
Building on Windows
~~~~~~~~~~~~~~~~~~~
-
-TVM support build via MSVC using cmake. The minimum required VS version is **Visual Studio Community 2015 Update 3**.
-In order to generate the VS solution file using cmake, make sure you have a recent version of cmake added to your path and then from the TVM directory:
+TVM supports building via MSVC using cmake. You will need to obtain a Visual Studio compiler.
+The minimum required VS version is **Visual Studio Community 2015 Update 3**.
+We recommend following :ref:`build-with-conda` to obtain the necessary dependencies and
+an activated tvm-build environment. Then you can run the following command to build:
.. code:: bash
- mkdir build
- cd build
- cmake -G "Visual Studio 14 2015 Win64" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" ..
+ mkdir build
+ cd build
+ cmake -A x64 -Thost=x64 ..
+ cd ..
-Starting with Visual Studio 2019 the architecture is specified differently so use this command
+The above command generates the solution file under the build directory.
+You can then run the following command to build:
.. code:: bash
- cmake -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" ..
+ cmake --build build --config Release -- /m
-This will generate the VS project using the MSVC 64 bit generator.
-Open the .sln file in the build directory and build with Visual Studio.
-In order to build with LLVM in windows, you will need to build LLVM from source.
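+
+If the build succeeds, a quick sanity check (a sketch; the file names assume
+the default CMake targets) is to list the produced libraries:
+
+.. code:: bash
+
+    ls build/Release   # expect tvm.dll and tvm_runtime.dll
+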
Building ROCm support
~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/install/nnpack.rst b/docs/install/nnpack.rst
index 10497ba05654..2afd95a5ef3f 100644
--- a/docs/install/nnpack.rst
+++ b/docs/install/nnpack.rst
@@ -105,7 +105,7 @@ Build TVM with NNPACK support
.. code:: bash
-   git clone --recursive https://github.com/apache/incubator-tvm tvm
+   git clone --recursive https://github.com/apache/tvm tvm
- Set `set(USE_NNPACK ON)` in config.cmake.
- Set `NNPACK_PATH` to the $(YOUR_NNPACK_INSTALL_PATH)
diff --git a/docs/langref/relay_adt.rst b/docs/langref/relay_adt.rst
index a53c7515c62a..dab2e3e70678 100644
--- a/docs/langref/relay_adt.rst
+++ b/docs/langref/relay_adt.rst
@@ -387,7 +387,7 @@ The following left fold flattens a list of lists (using concatenation):
Note that these iteration constructs can be implemented directly in Relay's
source language and more can easily be defined (and for more data types, like trees),
rather than being constructs built into the language (e.g.,
-`"foreach" in MXNet `__).
+`"foreach" in MXNet `__).
ADTs and their extensibility allow for a broad range of iterations and data structures to be expressed
in Relay and supported by the type system without having to modify the language implementation.
diff --git a/docs/langref/relay_pattern.rst b/docs/langref/relay_pattern.rst
index 17282e142b2a..8b34b7619840 100644
--- a/docs/langref/relay_pattern.rst
+++ b/docs/langref/relay_pattern.rst
@@ -35,7 +35,7 @@ There are quite a few properties of operators that are worth matching. Below we
demonstrate how to write patterns. It is recommended to check `tests/python/relay/test_dataflow_pattern.py`_
for more use cases.
-.. _tests/python/relay/test_dataflow_pattern.py: https://github.com/apache/incubator-tvm/blob/main/tests/python/relay/test_dataflow_pattern.py
+.. _tests/python/relay/test_dataflow_pattern.py: https://github.com/apache/tvm/blob/main/tests/python/relay/test_dataflow_pattern.py
.. note::
diff --git a/docs/vta/dev/hardware.rst b/docs/vta/dev/hardware.rst
index c8d543330728..1e3c0acdb185 100644
--- a/docs/vta/dev/hardware.rst
+++ b/docs/vta/dev/hardware.rst
@@ -36,7 +36,7 @@ In addition the design adopts decoupled access-execute to hide memory access lat
To a broader extent, VTA can serve as a template deep learning accelerator design for full stack optimization, exposing a generic tensor computation interface to the compiler stack.
-.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/blogpost/vta_overview.png
+.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/blogpost/vta_overview.png
:align: center
:width: 80%
@@ -175,7 +175,7 @@ Finally, the ``STORE`` instructions are executed by the store module exclusively
The fields of each instruction are described in the figure below.
The meaning of each field will be further explained in the :ref:`vta-uarch` section.
-.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/vta_instructions.png
+.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/vta_instructions.png
:align: center
:width: 100%
@@ -191,7 +191,7 @@ VTA relies on dependence FIFO queues between hardware modules to synchronize the
The figure below shows how a given hardware module can execute concurrently from its producer and consumer modules in a dataflow fashion through the use of dependence FIFO queues, and single-reader/single-writer SRAM buffers.
Each module is connected to its consumer and producer via read-after-write (RAW) and write-after-read (WAR) dependence queues.
-.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/dataflow.png
+.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/dataflow.png
:align: center
:width: 100%
@@ -258,7 +258,7 @@ There are two types of compute micro-ops: ALU and GEMM operations.
To minimize the footprint of micro-op kernels, while avoiding the need for control-flow instructions such as conditional jumps, the compute module executes micro-op sequences inside a two-level nested loop that computes the location of each tensor register location via an affine function.
This compression approach helps reduce the micro-kernel instruction footprint, and applies to both matrix multiplication and 2D convolution, commonly found in neural network operators.
-.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/gemm_core.png
+.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/gemm_core.png
:align: center
:width: 100%
@@ -269,7 +269,7 @@ This tensorization intrinsic is defined by the dimensions of the input, weight a
Each data type can have a different integer precision: typically both weight and input types are low-precision (8-bits or less), while the accumulator tensor has a wider type to prevent overflows (32-bits).
In order to keep the GEMM core busy, each of the input buffer, weight buffer, and register file have to expose sufficient read/write bandwidth.
-.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/alu_core.png
+.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/alu_core.png
:align: center
:width: 100%
@@ -289,7 +289,7 @@ The micro-code in the context of tensor ALU computation only takes care of speci
Load and Store Modules
~~~~~~~~~~~~~~~~~~~~~~
-.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/2d_dma.png
+.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/2d_dma.png
:align: center
:width: 100%
diff --git a/docs/vta/dev/index.rst b/docs/vta/dev/index.rst
index d95f6e23d90d..2b715740ed29 100644
--- a/docs/vta/dev/index.rst
+++ b/docs/vta/dev/index.rst
@@ -20,7 +20,7 @@ VTA Design and Developer Guide
This developer guide details the complete VTA-TVM hardware-software stack.
-.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/blogpost/vta_stack.png
+.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/blogpost/vta_stack.png
:align: center
:width: 60%
diff --git a/docs/vta/install.rst b/docs/vta/install.rst
index 4cd1ee93a6e6..2248975b61b1 100644
--- a/docs/vta/install.rst
+++ b/docs/vta/install.rst
@@ -135,7 +135,7 @@ Because the direct board-to-computer connection prevents the board from directly
    mkdir <mountpoint>
    sshfs xilinx@192.168.2.99:/home/xilinx <mountpoint>
    cd <mountpoint>
-   git clone --recursive https://github.com/apache/incubator-tvm tvm
+   git clone --recursive https://github.com/apache/tvm tvm
    # When finished, you can leave the mountpoint and unmount the directory
    cd ~
    sudo umount <mountpoint>
@@ -202,7 +202,7 @@ This time again, we will run the 2D convolution testbench.
Beforehand, we need to program the Pynq board FPGA with a VTA bitstream, and build the VTA runtime via RPC.
The following ``test_program_rpc.py`` script will perform two operations:
-* FPGA programming, by downloading a pre-compiled bitstream from a `VTA bitstream repository <https://github.com/uwsaml/vta-distro>`_ that matches the default ``vta_config.json`` configuration set by the host, and sending it over to the Pynq via RPC to program the Pynq's FPGA.
+* FPGA programming, by downloading a pre-compiled bitstream from a `VTA bitstream repository <https://github.com/uwsampl/vta-distro>`_ that matches the default ``vta_config.json`` configuration set by the host, and sending it over to the Pynq via RPC to program the Pynq's FPGA.
* Runtime building on the Pynq, which needs to be run every time the ``vta_config.json`` configuration is modified. This ensures that the VTA software runtime that generates the accelerator's executable via just-in-time (JIT) compilation matches the specifications of the VTA design that is programmed on the FPGA. The build process takes about 30 seconds to complete so be patient!
.. code:: bash
@@ -466,7 +466,7 @@ This would add quartus binary path into your ``PATH`` environment variable, so y
Chisel-based Custom VTA Bitstream Compilation for DE10-Nano
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file `Configs.scala <https://github.com/apache/incubator-tvm/blob/main/3rdparty/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala>`_, and they can be customized by the user.
+Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file `Configs.scala <https://github.com/apache/tvm/blob/main/3rdparty/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala>`_, and they can be customized by the user.
For Intel FPGA, bitstream generation is driven by a top-level ``Makefile`` under ``/3rdparty/vta-hw/hardware/intel``.
diff --git a/golang/sample/deploy.py b/golang/sample/deploy.py
index a0553cfe0211..98820195511c 100644
--- a/golang/sample/deploy.py
+++ b/golang/sample/deploy.py
@@ -51,7 +51,7 @@
# Save Compiled Module
# --------------------
from tvm.contrib import cc
-from tvm.contrib import util
+from tvm.contrib import utils
fadd.save("deploy.o")
cc.create_shared("deploy.so", ["deploy.o"])
diff --git a/golang/src/tvm_runtime_pack.cc b/golang/src/tvm_runtime_pack.cc
index 644249fa75c9..7dd6dd5e94c5 100644
--- a/golang/src/tvm_runtime_pack.cc
+++ b/golang/src/tvm_runtime_pack.cc
@@ -23,7 +23,7 @@
*/
#include "src/runtime/c_runtime_api.cc"
#include "src/runtime/cpu_device_api.cc"
-#include "src/runtime/file_util.cc"
+#include "src/runtime/file_utils.cc"
#include "src/runtime/library_module.cc"
#include "src/runtime/module.cc"
#include "src/runtime/ndarray.cc"
diff --git a/golang/src/util.go b/golang/src/utils.go
similarity index 98%
rename from golang/src/util.go
rename to golang/src/utils.go
index d3846d1db452..2da4138a1e66 100644
--- a/golang/src/util.go
+++ b/golang/src/utils.go
@@ -19,7 +19,7 @@
/*!
* \brief gotvm package source for common utilities
- * \file util.go
+ * \file utils.go
*/
package gotvm
diff --git a/include/tvm/arith/analyzer.h b/include/tvm/arith/analyzer.h
index a9a0bed6712a..cd20bdcf4d1a 100644
--- a/include/tvm/arith/analyzer.h
+++ b/include/tvm/arith/analyzer.h
@@ -320,10 +320,10 @@ class CanonicalSimplifier {
* arith::Analyzer analyzer;
* {
 * With<ConstraintContext> scope(&analyzer, x % 3 == 0);
- * CHECK_EQ(analyzer.modular_set(x)->coeff, 3);
+ * ICHECK_EQ(analyzer.modular_set(x)->coeff, 3);
* }
* // constraint no longer in effect.
- * CHECK_NE(analyzer.modular_set(x)->coeff, 3);
+ * ICHECK_NE(analyzer.modular_set(x)->coeff, 3);
*
* \endcode
*/
diff --git a/include/tvm/arith/iter_affine_map.h b/include/tvm/arith/iter_affine_map.h
new file mode 100644
index 000000000000..e2e081d2be89
--- /dev/null
+++ b/include/tvm/arith/iter_affine_map.h
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/arith/iter_affine_map.h
+ * \brief Iterator quasi-affine mapping patterns.
+ *
+ * This file defines a collection of mapping patterns
+ * that map a collection of independent iterators to another
+ * collection of independent iterators.
+ *
+ * There are two main kinds of mapping patterns:
+ *
+ * - Fuse: fuse a collection of iterators into a single one
+ *
+ * domain(x0) = [0, 4), domain(x1) = [0, 3), domain(x2) = [0, 2)
+ * fuse(x0, x1, x2): y = x2 * 12 + x1 * 4 + x0
+ * domain(y) = [0, 24)
+ *
+ * - Split: split an iterator into multiple ones
+ *
+ * domain(x) = [0, 24)
+ * split(x, 3, 12): [y0, y1, y2] = [x % 3, (x % 12) / 3, x / 12]
+ * domain(y0) = [0, 3), domain(y1) = [0, 4), domain(y2) = [0, 2)
+ *
+ * We use the name "(quasi)affine" to be consistent with
+ * the terminology used in polyhedral compilation.
+ * Notably, fuse is an affine transformation,
+ * while split corresponds to additional floordiv/mod operations
+ * that can appear in quasi-affine transformations.
+ */
+#ifndef TVM_ARITH_ITER_AFFINE_MAP_H_
+#define TVM_ARITH_ITER_AFFINE_MAP_H_
+
+#include <tvm/arith/analyzer.h>
+#include <tvm/ir/expr.h>
+#include <tvm/tir/var.h>
+
+namespace tvm {
+namespace arith {
+
+/*!
+ * \brief Base class of all iter map expressions.
+ *
+ * An IterMapExpr is a special expression to store
+ * the result of IterMapDetection.
+ * It should not appear in a legal TIR PrimFunc.
+ */
+class IterMapExprNode : public PrimExprNode {
+ public:
+ // overrides
+ void VisitAttrs(tvm::AttrVisitor* v) {}
+
+ static constexpr const char* _type_key = "arith.IterMapExpr";
+ static constexpr const uint32_t _type_child_slots = 3;
+ TVM_DECLARE_BASE_OBJECT_INFO(IterMapExprNode, PrimExprNode);
+};
+
+/*!
+ * \brief Managed reference to IterMapExprNode.
+ * \sa IterMapExprNode
+ */
+class IterMapExpr : public PrimExpr {
+ public:
+ TVM_DEFINE_OBJECT_REF_METHODS(IterMapExpr, PrimExpr, IterMapExprNode);
+};
+
+/*!
+ * \brief Mark the source as an iterator in [0, extent).
+ *
+ * IterMark is used to mark source expression as a valid
+ * iterator to make future analysis easy.
+ */
+class IterMarkNode : public Object {
+ public:
+ /*!
+ * \brief The source expression, can either be
+ * a IterSumExpr or a Var.
+ */
+ PrimExpr source;
+ /*!
+ * \brief The extent of the iteration.
+ */
+ PrimExpr extent;
+
+ // overrides
+ void VisitAttrs(tvm::AttrVisitor* v) {
+ v->Visit("source", &source);
+ v->Visit("extent", &extent);
+ }
+
+ bool SEqualReduce(const IterMarkNode* other, SEqualReducer equal) const {
+ equal->MarkGraphNode();
+ return equal(source, other->source) && equal(extent, other->extent);
+ }
+
+ void SHashReduce(SHashReducer hash_reduce) const {
+ hash_reduce->MarkGraphNode();
+ hash_reduce(source);
+ hash_reduce(extent);
+ }
+
+ static constexpr const bool _type_has_method_sequal_reduce = true;
+ static constexpr const bool _type_has_method_shash_reduce = true;
+ static constexpr const char* _type_key = "arith.IterMark";
+ TVM_DECLARE_FINAL_OBJECT_INFO(IterMarkNode, Object);
+};
+
+/*!
+ * \brief Managed reference to IterMarkNode.
+ * \sa IterMarkNode
+ */
+class IterMark : public ObjectRef {
+ public:
+ /*!
+ * \brief constructor.
+ * \param source The source expression.
+ * \param extent The extent of the iterator.
+ */
+ TVM_DLL IterMark(PrimExpr source, PrimExpr extent);
+
+ TVM_DEFINE_OBJECT_REF_METHODS(IterMark, ObjectRef, IterMarkNode);
+};
+
+/*!
+ * \brief Split of an iterator.
+ *
+ * result = floormod(floordiv(source, lower_factor), extent) * scale
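+ *
+ * For example, with domain(source) = [0, 24), lower_factor = 3, extent = 4
+ * and scale = 1, result = floormod(floordiv(x, 3), 4), i.e. the middle
+ * component y1 of split(x, 3, 12) in the file header above.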
+ */
+class IterSplitExprNode : public IterMapExprNode {
+ public:
+ /*! \brief The source marked iterator. */
+ IterMark source;
+ /*! \brief The lower factor to split the source. */
+ PrimExpr lower_factor;
+ /*! \brief The extent of the split. */
+ PrimExpr extent;
+ /*! \brief Additional scale. */
+ PrimExpr scale;
+
+ // overrides
+ void VisitAttrs(tvm::AttrVisitor* v) {
+ v->Visit("source", &source);
+ v->Visit("lower_factor", &lower_factor);
+ v->Visit("extent", &extent);
+ v->Visit("scale", &scale);
+ }
+
+ bool SEqualReduce(const IterSplitExprNode* other, SEqualReducer equal) const {
+ return equal(source, other->source) && equal(lower_factor, other->lower_factor) &&
+ equal(extent, other->extent) && equal(scale, other->scale);
+ }
+
+ void SHashReduce(SHashReducer hash_reduce) const {
+ hash_reduce(source);
+ hash_reduce(lower_factor);
+ hash_reduce(extent);
+ hash_reduce(scale);
+ }
+
+ static constexpr const char* _type_key = "arith.IterSplitExpr";
+ TVM_DECLARE_FINAL_OBJECT_INFO(IterSplitExprNode, IterMapExprNode);
+};
+
+/*!
+ * \brief Managed reference to IterSplitExprNode.
+ * \sa IterSplitExprNode
+ */
+class IterSplitExpr : public IterMapExpr {
+ public:
+ /*!
+ * \brief constructor from just source.
+ * \param source The source expression.
+ */
+ TVM_DLL explicit IterSplitExpr(IterMark source);
+ /*!
+ * \brief constructor from source and additional scale.
+ * \param source The source expression.
+ * \param scale The additional scaling factor.
+ */
+ TVM_DLL explicit IterSplitExpr(IterMark source, PrimExpr scale);
+ /*!
+ * \brief constructor
+ * \param source The source expression.
+ * \param lower_factor The lower factor to split the source.
+ * \param extent The extent of the split.
+ * \param scale The additional scaling factor.
+ */
+ TVM_DLL explicit IterSplitExpr(IterMark source, PrimExpr lower_factor, PrimExpr extent,
+ PrimExpr scale);
+
+ TVM_DEFINE_OBJECT_REF_METHODS(IterSplitExpr, IterMapExpr, IterSplitExprNode);
+ TVM_DEFINE_OBJECT_REF_COW_METHOD(IterSplitExprNode);
+};
+
+/*!
+ * \brief Fuse multiple iterators by summing them with scaling.
+ *
+ * result = sum(args) + base
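+ *
+ * For example, fuse(x0, x1, x2) from the file header, y = x2 * 12 + x1 * 4 + x0,
+ * is represented as an IterSumExpr whose args are the three iterators scaled
+ * by 1, 4 and 12 respectively, with base = 0.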
+ */
+class IterSumExprNode : public IterMapExprNode {
+ public:
+ /*! \brief The args to the sum. */
+  Array<IterSplitExpr> args;
+ /*! \brief The base offset. */
+ PrimExpr base;
+
+ // overrides
+ void VisitAttrs(tvm::AttrVisitor* v) {
+ v->Visit("args", &args);
+ v->Visit("base", &base);
+ }
+
+ bool SEqualReduce(const IterSumExprNode* other, SEqualReducer equal) const {
+ return equal(args, other->args) && equal(base, other->base);
+ }
+
+ void SHashReduce(SHashReducer hash_reduce) const {
+ hash_reduce(args);
+ hash_reduce(base);
+ }
+
+ static constexpr const char* _type_key = "arith.IterSumExpr";
+ TVM_DECLARE_FINAL_OBJECT_INFO(IterSumExprNode, IterMapExprNode);
+};
+
+/*!
+ * \brief Managed reference to IterSumExprNode.
+ * \sa IterSumExprNode
+ */
+class IterSumExpr : public IterMapExpr {
+ public:
+ /*!
+ * \brief constructor.
+ * \param args The args to the sum.
+ * \param base The base offset.
+ */
+  TVM_DLL IterSumExpr(Array<IterSplitExpr> args, PrimExpr base);
+
+ TVM_DEFINE_OBJECT_REF_METHODS(IterSumExpr, IterMapExpr, IterSumExprNode);
+ TVM_DEFINE_OBJECT_REF_COW_METHOD(IterSumExprNode);
+};
+
+/*!
+ * \brief Detect if indices can be written as
+ *
+ * [y_0 + c_0, y_1 + c_1, ..., y_n + c_n]
+ *
+ * Here y = some-quasi-affine-iter-map(input_iters)
+ * and c are symbolic constants.
+ *
+ * We also require y_i and y_j to be independent for i != j.
+ *
+ * For returned value rv, the following is always true:
+ * - rv[i]->args.size() <= 1: only one iterator per element.
+ *
+ * \param indices The indices to detect pattern for.
+ * \param input_iters Map from variable to iterator's range.
+ * \param analyzer Analyzer used to get context information.
+ *
+ * \return The detected pattern if a match exists,
+ * otherwise return an empty array.
+ */
+Array<IterSumExpr> DetectIterMap(const Array<PrimExpr>& indices, const Map<Var, Range>& input_iters,
+                                 arith::Analyzer* analyzer);
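+
+// Illustrative usage sketch (hypothetical names; not part of this header):
+//
+//   arith::Analyzer analyzer;
+//   // x iterates over [0, 24); indices = {floormod(x, 3), floordiv(x, 12)}
+//   Map<Var, Range> input_iters = {{x, Range::FromMinExtent(0, 24)}};
+//   Array<IterSumExpr> rv = DetectIterMap(indices, input_iters, &analyzer);
+//   // rv is empty when no quasi-affine pattern is matched.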
+
+} // namespace arith
+} // namespace tvm
+#endif // TVM_ARITH_ITER_AFFINE_MAP_H_
diff --git a/include/tvm/auto_scheduler/compute_dag.h b/include/tvm/auto_scheduler/compute_dag.h
index 553008a7fcbf..b9306c64b0b5 100755
--- a/include/tvm/auto_scheduler/compute_dag.h
+++ b/include/tvm/auto_scheduler/compute_dag.h
@@ -194,23 +194,48 @@ class ComputeDAGNode : public Object {
TVM_DECLARE_FINAL_OBJECT_INFO(ComputeDAGNode, Object);
};
+/*!
+ * \brief Options for applying layout rewrite.
+ * This is an optimization to rewrite the layout of input tensors according to the schedule we get.
+ */
+enum class LayoutRewriteOption : int {
+ /*! \brief Do not perform layout rewrite. */
+ NoRewrite = 0,
+ /*! \brief Insert layout transformation stages for input placeholders in the compute DAG */
+ InsertTransformStage = 1,
+ /*!
+ * \brief Do not insert layout transformation stages and assume the input placeholders
+ * are pre-transformed.
+ * \note The lowered function with this option does not accept the original input shapes,
+ * so this option must be used together with the `AutoSchedulerLayoutRewrite` pass in Relay.
+ */
+ RewriteForPreTransformed = 2,
+};
+
/*!
* \brief Managed reference to ComputeDAGNode.
* \sa ComputeDAGNode
*/
class ComputeDAG : public ObjectRef {
public:
- /*! \brief The constructor.
+ /*! \brief Construct a DAG from a list of output tensors.
* \param tensors `te::Tensor`s for a compute declaration.
*/
  TVM_DLL explicit ComputeDAG(Array<te::Tensor> tensors);
+ /*! \brief Construct a DAG based on a schedule.
+ * \param sch `te::Schedule`s for a compute declaration.
+ */
+ TVM_DLL explicit ComputeDAG(const te::Schedule& sch);
+
/*!
* \brief Rewrite the layout of placeholder specified by attr `layout_free_placeholders`
* according to the loop nest derived with `transform_steps`.
* \param transform_steps Transform steps of a state.
+ * \param layout_rewrite Different options in layout rewrite.
+ * \return The updated ComputeDAG after layout rewrite.
*/
-  void RewriteLayout(const Array<Step>& transform_steps);
+  ComputeDAG RewriteLayout(Array<Step>* transform_steps, LayoutRewriteOption layout_rewrite) const;
/*!
* \brief Apply the history transform steps to get a TVM schedule.
@@ -220,14 +245,14 @@ class ComputeDAG : public ObjectRef {
* \param stage_to_axes The map that stores all axes for one stage.
* Pass a valid pointer if this information needs to be used outside this function.
* \param layout_rewrite Rewrite the layout of placeholders specified by
- * attr `layout_free_placeholders`
+ * attr `layout_free_placeholders`.
 * \return A `te.schedule` and an Array of `te.Tensor` to be used in `tvm.lower`
* or `tvm.build`.
*/
-  std::pair<te::Schedule, Array<te::Tensor>> ApplySteps(const Array<Step>& transform_steps,
-                                                        Array<te::Stage>* stages = nullptr,
-                                                        StageToAxesMap* stage_to_axes = nullptr,
-                                                        bool layout_rewrite = false) const;
+  std::pair<te::Schedule, Array<te::Tensor>> ApplySteps(
+      const Array<Step>& transform_steps, Array<te::Stage>* stages = nullptr,
+      StageToAxesMap* stage_to_axes = nullptr,
+      LayoutRewriteOption layout_rewrite = LayoutRewriteOption::NoRewrite) const;
/*!
* \brief Print transform steps as equivalent python schedule API.
diff --git a/include/tvm/auto_scheduler/measure.h b/include/tvm/auto_scheduler/measure.h
index 349f4f8c7d51..e8c01e84f289 100755
--- a/include/tvm/auto_scheduler/measure.h
+++ b/include/tvm/auto_scheduler/measure.h
@@ -43,6 +43,7 @@
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include <vector>
namespace tvm {
@@ -423,7 +424,7 @@ class RPCRunner : public ProgramRunner {
/*!
* \brief Measurer that measures the time costs of tvm programs
- * This class combines ProgramBuilder and ProgramRunner and provides a simpler API */
+ * This class combines ProgramBuilder and ProgramRunner, and provides a simpler API */
class ProgramMeasurerNode : public Object {
public:
/*! \brief Measured programs counter. */
@@ -436,6 +437,8 @@ class ProgramMeasurerNode : public Object {
std::unordered_map<std::string, State> best_state;
/*! \brief Workload key to best state's count index map. */
std::unordered_map<std::string, int> best_ct;
+  /*! \brief The set of workloads that have at least one valid schedule */
+  std::unordered_set<std::string> has_valid;
/*! \brief The ProgramBuilder to build each program. */
ProgramBuilder builder;
/*! \brief The ProgramRunner to measure each program. */
@@ -444,7 +447,7 @@ class ProgramMeasurerNode : public Object {
Optional<Array<MeasureCallback>> callbacks;
/*! \brief Verbosity level. 0 for silent, 1 to output information during program measuring. */
int verbose;
- /*! \brief The number of max continuous error. */
+  /*! \brief The maximum number of allowed continuous errors before forcibly stopping the tuning */
int max_continuous_error;
/*! \brief Reset book keeping variables */
@@ -454,13 +457,12 @@ class ProgramMeasurerNode : public Object {
* \brief Do measurement.
* \param task The current SearchTask.
* \param policy The current SearchPolicy.
- * \param inputs The MeasureInputs.
- * \param results A pointer to a MeasureResult Array, this is used as output.
+ * \param inputs The inputs of measurement.
* \param batch_size Number of programs to be measured in one batch.
+   * \return The results of the measurement.
*/
-  void Measure(const SearchTask& task, const SearchPolicy& policy,
-               const Array<MeasureInput>& inputs, Array<MeasureResult>* results,
-               int batch_size = -1);
+  Array<MeasureResult> Measure(const SearchTask& task, const SearchPolicy& policy,
+                               const Array<MeasureInput>& inputs, int batch_size = -1);
/*!
* \brief Do measurement silently.
* This API will not print the measure results to screen.
@@ -486,12 +488,13 @@ class ProgramMeasurer : public ObjectRef {
public:
/*!
* \brief The constructor.
- * \param builder The ProgramBuilder to build each program.
- * \param runner The ProgramRunner to measure each program.
- * \param callbacks MeasureCallback to be called after each measure batch.
+ * \param builder The ProgramBuilder to build programs.
+ * \param runner The ProgramRunner to measure programs.
+ * \param callbacks MeasureCallback to be called after each measurement batch.
* \param verbose Verbosity level. 0 for silent, 1 to output information during program
* measuring.
- * \param max_continuous_error The number of allowed maximum continuous error.
+   * \param max_continuous_error The maximum number of allowed continuous errors before
+   *        forcibly stopping the tuning.
*/
  ProgramMeasurer(ProgramBuilder builder, ProgramRunner runner,
                  Optional<Array<MeasureCallback>> callbacks, int verbose,
diff --git a/include/tvm/auto_scheduler/search_policy.h b/include/tvm/auto_scheduler/search_policy.h
index ddb0dd284875..e433799b7fa5 100755
--- a/include/tvm/auto_scheduler/search_policy.h
+++ b/include/tvm/auto_scheduler/search_policy.h
@@ -22,26 +22,6 @@
* \brief The base class of search policies, including the abstract definition of search policy and
* other supporting data structures.
*
- * The basic schedule search process for the auto-scheduler is design to be:
- * `Program sampling` -> `Performance Tuning`.
- *
- * In `Program sampling`, we use some predefined precise or heuristic rules to generate several
- * initial schedules. Based on these initial starting points, we perform `Performance Tuning` which
- * uses cost model based evolutionary search to select schedules with the best performance.
- *
- * Candidate schedules are measured against the specific hardware target.
- *
- * We intend to introduce different level of automation on the schedule generation process:
- * - Level 0(the default level): For all kinds of ops/subgraphs, the search policy should be able
- * to generate schedule automatically.
- * - Level 1: For some complicated ops/subgraphs(e.g. conv2d windograd), the default search space
- * of level 0 may be too large to find a high performance schedule efficiently. We provide some
- * op attributes to help reduce the total search space, see `SearchPolicyKey` below for more
- * information.
- * - Level 2: For some further special ops/subgraphs, users may more likely to write their own
- * template(just like AutoTVM). Search policy should be able to provide a flexible approach as
- * well.
- *
* \note How to add a new search policy.
* In design, there's no need for users to implement their own search policy, our formal search
* policy(will be brought later) should be enough to cover most use cases. Meanwhile, a custom rule
@@ -62,11 +42,13 @@
#ifndef TVM_AUTO_SCHEDULER_SEARCH_POLICY_H_
#define TVM_AUTO_SCHEDULER_SEARCH_POLICY_H_
+#include <tvm/auto_scheduler/measure.h>
#include <tvm/auto_scheduler/search_task.h>
#include <tvm/node/node.h>
#include <string>
#include <unordered_set>
+#include <utility>
#include <vector>
namespace tvm {
@@ -171,6 +153,15 @@ class SearchPolicyNode : public Object {
virtual State Search(int num_measure_trials, int early_stopping, int num_measures_per_round,
ProgramMeasurer measurer) = 0;
+ /*!
+ * \brief Continue the search by doing an additional search round.
+ * \param num_measure The number of measurements
+ * \param measurer The measurer to measure programs
+ * \return The measurement records for measurements in this search round
+ */
+  virtual std::pair<Array<MeasureInput>, Array<MeasureResult>> ContinueSearchOneRound(
+      int num_measure, ProgramMeasurer measurer) = 0;
+
/*!
* \brief Preload measured states from a log file to resume the state of the search policy.
* \param log_file The name of the record log file.
diff --git a/include/tvm/auto_scheduler/search_task.h b/include/tvm/auto_scheduler/search_task.h
index 85154b5e406b..6d85835d2e4b 100755
--- a/include/tvm/auto_scheduler/search_task.h
+++ b/include/tvm/auto_scheduler/search_task.h
@@ -44,17 +44,16 @@ class HardwareParamsNode : public Object {
int cache_line_bytes;
// GPU related parameters got from device query API
-
- /*! \brief The max shared memory per block. */
- int max_shared_memory_per_block{INT32_MAX};
- /*! \brief The max register memory per block. */
- int max_registers_per_block{INT32_MAX};
- /*! \brief The max threads per block. */
- int max_threads_per_block{INT32_MAX};
+ /*! \brief The max shared memory per block in bytes. */
+ int max_shared_memory_per_block;
+  /*! \brief The max number of registers per block. */
+ int max_registers_per_block;
+ /*! \brief The max number of threads per block. */
+ int max_threads_per_block;
/*! \brief The max vthread extent. */
- int max_vthread_extent{INT32_MAX};
+ int max_vthread_extent;
/*! \brief The thread numbers of a warp. */
- int warp_size{INT32_MAX};
+ int warp_size;
void VisitAttrs(tvm::AttrVisitor* v) {
v->Visit("num_cores", &num_cores);
@@ -90,8 +89,15 @@ class HardwareParams : public ObjectRef {
* \param num_cores The number of cores.
* \param vector_unit_bytes The width of vector units in bytes.
* \param cache_line_bytes The size of cache line in bytes.
+ * \param max_shared_memory_per_block The max amount of shared memory per block for GPU.
+ * \param max_registers_per_block The max number of registers per block for GPU.
+ * \param max_threads_per_block The max number of threads per block for GPU.
+ * \param max_vthread_extent The max extent of vthread for GPU.
+   * \param warp_size The warp size for GPU.
*/
- HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes);
+ HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes,
+ int max_shared_memory_per_block, int max_registers_per_block,
+ int max_threads_per_block, int max_vthread_extent, int warp_size);
TVM_DEFINE_OBJECT_REF_METHODS(HardwareParams, ObjectRef, HardwareParamsNode);
TVM_DEFINE_OBJECT_REF_COW_METHOD(HardwareParamsNode);
diff --git a/include/tvm/auto_scheduler/transform_step.h b/include/tvm/auto_scheduler/transform_step.h
index 7be3554c7c5d..4cc1551e76fc 100755
--- a/include/tvm/auto_scheduler/transform_step.h
+++ b/include/tvm/auto_scheduler/transform_step.h
@@ -182,7 +182,23 @@ class StepNode : public Object {
*/
class Step : public ObjectRef {
public:
- TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Step, ObjectRef, StepNode);
+ /*!
+ * \brief CopyOnWrite function for Step.
+ * This works almost the same as a normal ObjectRef.CopyOnWrite(), but can dispatch to different
+ * steps.
+   * \return A base StepNode pointer, which needs to be cast to its real StepNode type before doing any
+ * modifications.
+ * \code
+ *
+ * SplitStep ref;
+ * StepNode* mutable_ref = ref.CopyOnWrite();
+   * dynamic_cast<SplitStepNode*>(mutable_ref)->... = ...;
+ *
+ * \endcode
+ */
+ StepNode* CopyOnWrite();
+
+ TVM_DEFINE_OBJECT_REF_METHODS(Step, ObjectRef, StepNode);
};
// Forward declaration
@@ -267,7 +283,7 @@ class AnnotationStepNode : public StepNode {
static constexpr const char* record_prefix_str = "AN";
static constexpr const char* _type_key = "auto_scheduler.AnnotationStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(AnnotationStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(AnnotationStepNode, StepNode);
};
/*!
@@ -330,7 +346,7 @@ class FuseStepNode : public StepNode {
static constexpr const char* record_prefix_str = "FU";
static constexpr const char* _type_key = "auto_scheduler.FuseStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(FuseStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(FuseStepNode, StepNode);
};
/*!
@@ -390,7 +406,7 @@ class PragmaStepNode : public StepNode {
static constexpr const char* record_prefix_str = "PR";
static constexpr const char* _type_key = "auto_scheduler.PragmaStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(PragmaStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(PragmaStepNode, StepNode);
};
/*!
@@ -452,7 +468,7 @@ class ReorderStepNode : public StepNode {
static constexpr const char* record_prefix_str = "RE";
static constexpr const char* _type_key = "auto_scheduler.ReorderStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(ReorderStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(ReorderStepNode, StepNode);
};
/*!
@@ -527,7 +543,7 @@ class SplitStepNode : public StepNode {
static constexpr const char* record_prefix_str = "SP";
static constexpr const char* _type_key = "auto_scheduler.SplitStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(SplitStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(SplitStepNode, StepNode);
};
/*!
@@ -607,7 +623,7 @@ class FollowSplitStepNode : public StepNode {
static constexpr const char* record_prefix_str = "FSP";
static constexpr const char* _type_key = "auto_scheduler.FollowSplitStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(FollowSplitStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(FollowSplitStepNode, StepNode);
};
/*!
@@ -688,7 +704,7 @@ class FollowFusedSplitStepNode : public StepNode {
static constexpr const char* record_prefix_str = "FFSP";
static constexpr const char* _type_key = "auto_scheduler.FollowFusedSplitStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(FollowFusedSplitStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(FollowFusedSplitStepNode, StepNode);
};
/*!
@@ -754,7 +770,7 @@ class StorageAlignStepNode : public StepNode {
static constexpr const char* record_prefix_str = "SA";
static constexpr const char* _type_key = "auto_scheduler.StorageAlignStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(StorageAlignStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(StorageAlignStepNode, StepNode);
};
/*!
@@ -822,7 +838,7 @@ class ComputeAtStepNode : public StepNode {
static constexpr const char* record_prefix_str = "CA";
static constexpr const char* _type_key = "auto_scheduler.ComputeAtStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(ComputeAtStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(ComputeAtStepNode, StepNode);
};
/*!
@@ -879,7 +895,7 @@ class ComputeInlineStepNode : public StepNode {
static constexpr const char* record_prefix_str = "CI";
static constexpr const char* _type_key = "auto_scheduler.ComputeInlineStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(ComputeInlineStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(ComputeInlineStepNode, StepNode);
};
/*!
@@ -938,7 +954,7 @@ class ComputeRootStepNode : public StepNode {
static constexpr const char* record_prefix_str = "CR";
static constexpr const char* _type_key = "auto_scheduler.ComputeRootStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(ComputeRootStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(ComputeRootStepNode, StepNode);
};
/*!
@@ -1010,7 +1026,7 @@ class CacheReadStepNode : public StepNode {
static constexpr const char* record_prefix_str = "CHR";
static constexpr const char* _type_key = "auto_scheduler.CacheReadStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(CacheReadStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(CacheReadStepNode, StepNode);
};
/*!
@@ -1081,7 +1097,7 @@ class CacheWriteStepNode : public StepNode {
static constexpr const char* record_prefix_str = "CHW";
static constexpr const char* _type_key = "auto_scheduler.CacheWriteStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(CacheWriteStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(CacheWriteStepNode, StepNode);
};
/*!
@@ -1148,7 +1164,7 @@ class RfactorStepNode : public StepNode {
static constexpr const char* record_prefix_str = "RF";
static constexpr const char* _type_key = "auto_scheduler.RfactorStep";
- TVM_DECLARE_FINAL_OBJECT_INFO(RfactorStepNode, Object);
+ TVM_DECLARE_FINAL_OBJECT_INFO(RfactorStepNode, StepNode);
};
/*!
diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h
index e92baf12b05f..13bfd715cdfb 100644
--- a/include/tvm/ir/attrs.h
+++ b/include/tvm/ir/attrs.h
@@ -413,6 +413,12 @@ inline void SetIntValue(T* ptr, const TVMArgValue& val) {
}
}
+// Workaround for GCC8.1 / GCC8.2
+template <>
+inline void SetValue<DataType>(DataType* ptr, const TVMArgValue& val) {
+ *ptr = val.operator DataType();
+}
+
template <>
inline void SetValue<std::string>(std::string* ptr, const TVMArgValue& val) {
if (String::CanConvertFrom(val)) {
@@ -428,7 +434,7 @@ inline void SetValue<double>(double* ptr, const TVMArgValue& val) {
*ptr = val.operator double();
} else {
ObjectRef expr = val;
- CHECK(expr.defined());
+ ICHECK(expr.defined());
if (const IntImmNode* op = expr.as<IntImmNode>()) {
  *ptr = static_cast<double>(op->value);
} else if (const FloatImmNode* op = expr.as<FloatImmNode>()) {
@@ -664,7 +670,7 @@ class AttrsNode : public BaseAttrsNode {
}
void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final {
- CHECK_EQ(args.size() % 2, 0);
+ ICHECK_EQ(args.size() % 2, 0);
const int kLinearSearchBound = 16;
int hit_count = 0;
// applies two strategies to lookup
@@ -672,7 +678,7 @@ class AttrsNode : public BaseAttrsNode {
// linear search.
auto ffind = [&args](const char* key, runtime::TVMArgValue* val) {
for (int i = 0; i < args.size(); i += 2) {
- CHECK_EQ(args.type_codes[i], kTVMStr);
+ ICHECK_EQ(args.type_codes[i], kTVMStr);
if (!std::strcmp(key, args.values[i].v_str)) {
*val = args[i + 1];
return true;
@@ -687,7 +693,7 @@ class AttrsNode : public BaseAttrsNode {
// construct a map then do lookup.
std::unordered_map<std::string, runtime::TVMArgValue> kwargs;
for (int i = 0; i < args.size(); i += 2) {
- CHECK_EQ(args.type_codes[i], kTVMStr);
+ ICHECK_EQ(args.type_codes[i], kTVMStr);
kwargs[args[i].operator std::string()] = args[i + 1];
}
auto ffind = [&kwargs](const char* key, runtime::TVMArgValue* val) {
diff --git a/include/tvm/ir/diagnostic.h b/include/tvm/ir/diagnostic.h
index 6b9807487bae..2053a295a3b8 100644
--- a/include/tvm/ir/diagnostic.h
+++ b/include/tvm/ir/diagnostic.h
@@ -21,68 +21,22 @@
* \file diagnostic.h
* \brief A new diagnostic interface for TVM error reporting.
*
- * A prototype of the new diagnostic reporting interface for TVM.
- *
- * Eventually we hope to promote this file to the top-level and
- * replace the existing errors.h.
*/
#ifndef TVM_IR_DIAGNOSTIC_H_
#define TVM_IR_DIAGNOSTIC_H_
#include
-#include
#include
-#include
-#include
-#include
-#include
+#include
#include
-#include
-#include
namespace tvm {
using tvm::parser::SourceMap;
using tvm::runtime::TypedPackedFunc;
-extern const char* kTVM_INTERNAL_ERROR_MESSAGE;
-
-#define ICHECK_INDENT " "
-
-#define ICHECK_BINARY_OP(name, op, x, y) \
- if (dmlc::LogCheckError _check_err = dmlc::LogCheck##name(x, y)) \
- dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \
- << kTVM_INTERNAL_ERROR_MESSAGE << std::endl \
- << ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *(_check_err.str) << ": "
-
-#define ICHECK(x) \
- if (!(x)) \
- dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \
- << kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT << "Check failed: " #x << " == false: "
-
-#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y)
-#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y)
-#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y)
-#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y)
-#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y)
-#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y)
-#define ICHECK_NOTNULL(x) \
- ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \
- << kTVM_INTERNAL_ERROR_MESSAGE << __INDENT << "Check not null: " #x \
- << ' ', \
- (x) : (x)) // NOLINT(*)
-
-/*! \brief The diagnostic level, controls the printing of the message. */
-enum class DiagnosticLevel : int {
- kBug = 10,
- kError = 20,
- kWarning = 30,
- kNote = 40,
- kHelp = 50,
-};
-
class DiagnosticBuilder;
/*! \brief A compiler diagnostic. */
@@ -195,7 +149,7 @@ class DiagnosticRenderer : public ObjectRef {
void Render(const DiagnosticContext& ctx);
DiagnosticRendererNode* operator->() {
- CHECK(get() != nullptr);
+ ICHECK(get() != nullptr);
return static_cast<DiagnosticRendererNode*>(get_mutable());
}
@@ -249,7 +203,7 @@ class DiagnosticContext : public ObjectRef {
void Render();
DiagnosticContextNode* operator->() {
- CHECK(get() != nullptr);
+ ICHECK(get() != nullptr);
return static_cast<DiagnosticContextNode*>(get_mutable());
}
diff --git a/include/tvm/ir/env_func.h b/include/tvm/ir/env_func.h
index 65653b75562d..386666a2c50c 100644
--- a/include/tvm/ir/env_func.h
+++ b/include/tvm/ir/env_func.h
@@ -83,7 +83,7 @@ class EnvFunc : public ObjectRef {
template <typename... Args>
runtime::TVMRetValue operator()(Args&&... args) const {
const EnvFuncNode* n = operator->();
- CHECK(n != nullptr);
+ ICHECK(n != nullptr);
return n->func(std::forward<Args>(args)...);
}
/*!
@@ -137,7 +137,7 @@ class TypedEnvFunc<R(Args...)> : public ObjectRef {
*/
R operator()(Args... args) const {
const EnvFuncNode* n = operator->();
- CHECK(n != nullptr);
+ ICHECK(n != nullptr);
return runtime::detail::typed_packed_call_dispatcher<R>::run(n->func,
                                                              std::forward<Args>(args)...);
}
diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h
index d6cfc5a64121..1c470fae51ee 100644
--- a/include/tvm/ir/expr.h
+++ b/include/tvm/ir/expr.h
@@ -45,10 +45,16 @@ using tvm::runtime::String;
*/
class BaseExprNode : public Object {
public:
+ /*!
+ * \brief Span that points to the original source code.
+ * Reserved debug information.
+ */
+ mutable Span span;
+
static constexpr const char* _type_key = "BaseExpr";
static constexpr const bool _type_has_method_sequal_reduce = true;
static constexpr const bool _type_has_method_shash_reduce = true;
- static constexpr const uint32_t _type_child_slots = 58;
+ static constexpr const uint32_t _type_child_slots = 62;
TVM_DECLARE_BASE_OBJECT_INFO(BaseExprNode, Object);
};
@@ -92,7 +98,7 @@ class PrimExprNode : public BaseExprNode {
DataType dtype;
static constexpr const char* _type_key = "PrimExpr";
- static constexpr const uint32_t _type_child_slots = 34;
+ static constexpr const uint32_t _type_child_slots = 38;
TVM_DECLARE_BASE_OBJECT_INFO(PrimExprNode, BaseExprNode);
};
@@ -135,11 +141,6 @@ class PrimExpr : public BaseExpr {
*/
class RelayExprNode : public BaseExprNode {
public:
- /*!
- * \brief Span that points to the original source code.
- * Reserved debug information.
- */
- mutable Span span;
/*!
* \brief Stores the result of type inference(type checking).
*
@@ -263,8 +264,9 @@ class IntImm : public PrimExpr {
* \brief Constructor.
* \param dtype The data type of the value.
* \param value The internal value.
+ * \param span The location of this object in the source code.
*/
- TVM_DLL IntImm(DataType dtype, int64_t value);
+ TVM_DLL IntImm(DataType dtype, int64_t value, Span span = Span());
TVM_DEFINE_OBJECT_REF_METHODS(IntImm, PrimExpr, IntImmNode);
};
@@ -307,8 +309,9 @@ class FloatImm : public PrimExpr {
* \brief Constructor.
* \param dtype The data type of the value.
* \param value The internal value.
+ * \param span The location in the source code.
*/
- TVM_DLL FloatImm(DataType dtype, double value);
+ TVM_DLL FloatImm(DataType dtype, double value, Span span = Span());
TVM_DEFINE_OBJECT_REF_METHODS(FloatImm, PrimExpr, FloatImmNode);
};
@@ -321,7 +324,7 @@ class FloatImm : public PrimExpr {
*/
class Bool : public IntImm {
public:
- explicit Bool(bool value) : IntImm(DataType::Bool(), value) {}
+ explicit Bool(bool value, Span span = Span()) : IntImm(DataType::Bool(), value, span) {}
Bool operator!() const { return Bool((*this)->value == 0); }
operator bool() const { return (*this)->value != 0; }
@@ -358,7 +361,7 @@ class Integer : public IntImm {
/*!
* \brief Construct integer from int value.
*/
- Integer(int value) : IntImm(DataType::Int(32), value) {} // NOLINT(*)
+ Integer(int value, Span span = Span()) : IntImm(DataType::Int(32), value, span) {} // NOLINT(*)
/*!
* \brief Construct integer from int imm.
* \param other The other value.
@@ -386,7 +389,7 @@ class Integer : public IntImm {
* \brief convert to int64_t
*/
operator int64_t() const {
- CHECK(data_ != nullptr) << " Trying to reference a null Integer";
+ ICHECK(data_ != nullptr) << " Trying to reference a null Integer";
return (*this)->value;
}
// comparators
@@ -461,9 +464,9 @@ class Range : public ObjectRef {
// implementataions
inline const Type& RelayExprNode::checked_type() const {
-  CHECK(checked_type_.defined()) << "internal error: the type checker has "
-                                 << "not populated the checked_type "
-                                 << "field for " << GetRef<RelayExpr>(this);
+  ICHECK(checked_type_.defined()) << "internal error: the type checker has "
+                                  << "not populated the checked_type "
+                                  << "field for " << GetRef<RelayExpr>(this);
return this->checked_type_;
}
@@ -471,11 +474,11 @@ template <typename TTypeNode>
inline const TTypeNode* RelayExprNode::type_as() const {
static_assert(std::is_base_of<TypeNode, TTypeNode>::value,
"TType must be a special case of type");
- CHECK(checked_type_.defined())
+ ICHECK(checked_type_.defined())
<< "Type inference for this Expr has not completed. Try to call infer_type pass.";
const TTypeNode* node = checked_type_.as<TTypeNode>();
- CHECK(node != nullptr) << "Expected type to be " << TTypeNode::_type_key << ", but get "
- << checked_type_->GetTypeKey();
+ ICHECK(node != nullptr) << "Expected type to be " << TTypeNode::_type_key << ", but get "
+ << checked_type_->GetTypeKey();
return node;
}
@@ -522,7 +525,7 @@ struct PackedFuncValueConverter<tvm::Bool> {
}
if (val.type_code() == kTVMArgInt) {
int v = val.operator int();
- CHECK(v == 0 || v == 1) << "ValueError: boolean value can only be 0 or 1, but get " << v;
+ ICHECK(v == 0 || v == 1) << "ValueError: boolean value can only be 0 or 1, but get " << v;
return Bool(static_cast<bool>(v));
}
return val.AsObjectRef<tvm::Bool>();
diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h
index b3f8438f6ec9..d6fb6a20b58a 100644
--- a/include/tvm/ir/module.h
+++ b/include/tvm/ir/module.h
@@ -300,7 +300,7 @@ class IRModule : public ObjectRef {
/*! \return mutable pointers to the node. */
IRModuleNode* operator->() const {
auto* ptr = get_mutable();
- CHECK(ptr != nullptr);
+ ICHECK(ptr != nullptr);
return static_cast<IRModuleNode*>(ptr);
}
diff --git a/include/tvm/ir/op.h b/include/tvm/ir/op.h
index e7b35778d500..c73be3c1e564 100644
--- a/include/tvm/ir/op.h
+++ b/include/tvm/ir/op.h
@@ -146,7 +146,7 @@ class OpNode : public RelayExprNode {
// Internal function to compute if it is primitive op
bool IsPrimitiveOp_() const {
const auto& fn_ty = this->op_type;
- CHECK(fn_ty.get() != nullptr);
+ ICHECK(fn_ty.get() != nullptr);
if (fn_ty->type_constraints.size() != 1) return false;
const TypeRelationNode* rel = fn_ty->type_constraints[0].as<TypeRelationNode>();
if (rel == nullptr) return false;
@@ -462,7 +462,7 @@ inline OpRegEntry& OpRegEntry::set_support_level(int32_t n) { // NOLINT(*)
template <typename ValueType>
inline OpRegEntry& OpRegEntry::set_attr( // NOLINT(*)
const std::string& attr_name, const ValueType& value, int plevel) {
- CHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0";
+ ICHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0";
runtime::TVMRetValue rv;
rv = value;
UpdateAttr(attr_name, rv, plevel);
@@ -473,7 +473,7 @@ inline OpRegEntry& OpRegEntry::set_attr( // NOLINT(*)
template <typename ValueType>
inline ValueType OpAttrMap<ValueType>::get(const RelayExpr& expr, ValueType def_value) const {
- CHECK(expr.defined());
+ ICHECK(expr.defined());
if (const OpNode* op = expr.as<OpNode>()) {
  return this->map_.get(GetRef<Op>(op), def_value);
} else {
diff --git a/include/tvm/ir/transform.h b/include/tvm/ir/transform.h
index 2bbf28311b30..56905ded5201 100644
--- a/include/tvm/ir/transform.h
+++ b/include/tvm/ir/transform.h
@@ -166,7 +166,7 @@ class PassContext : public ObjectRef {
* \return const access pointer.
*/
const PassContextNode* operator->() const {
- CHECK(get() != nullptr);
+ ICHECK(get() != nullptr);
return static_cast<const PassContextNode*>(get());
}
/*!
@@ -174,7 +174,7 @@ class PassContext : public ObjectRef {
* \return mutable access pointer.
*/
PassContextNode* operator->() {
- CHECK(get() != nullptr);
+ ICHECK(get() != nullptr);
return static_cast<PassContextNode*>(get_mutable());
}
@@ -197,6 +197,13 @@ class PassContext : public ObjectRef {
*/
TVM_DLL void Trace(const IRModule& module, const PassInfo& info, bool is_before) const;
+ /*!
+ * \brief Check whether a pass is enabled.
+ * \param info The pass information.
+ * \return true if the pass is enabled. Otherwise, false.
+ */
+ TVM_DLL bool PassEnabled(const PassInfo& info) const;
+
/*!
* \brief Register a valid configuration option and its ValueType for validation.
*
@@ -344,7 +351,7 @@ class Pass : public ObjectRef {
*/
IRModule operator()(IRModule mod) const {
const PassNode* node = operator->();
- CHECK(node != nullptr);
+ ICHECK(node != nullptr);
return node->operator()(std::move(mod));
}
/*!
@@ -357,7 +364,7 @@ class Pass : public ObjectRef {
*/
IRModule operator()(IRModule mod, const PassContext& pass_ctx) const {
const PassNode* node = operator->();
- CHECK(node != nullptr);
+ ICHECK(node != nullptr);
return node->operator()(std::move(mod), pass_ctx);
}
diff --git a/include/tvm/ir/type_functor.h b/include/tvm/ir/type_functor.h
index 2a6314cf7644..11bf7d4740d0 100644
--- a/include/tvm/ir/type_functor.h
+++ b/include/tvm/ir/type_functor.h
@@ -71,7 +71,7 @@ class TypeFunctor<R(const Type& n, Args...)> {
* \return The result of the call
*/
virtual R VisitType(const Type& n, Args... args) {
- CHECK(n.defined());
+ ICHECK(n.defined());
static FType vtable = InitVTable();
return vtable(n, this, std::forward<Args>(args)...);
}
diff --git a/include/tvm/ir/type_relation.h b/include/tvm/ir/type_relation.h
index 83323b01e419..462588006c9b 100644
--- a/include/tvm/ir/type_relation.h
+++ b/include/tvm/ir/type_relation.h
@@ -29,6 +29,7 @@
#include
#include
#include
+#include
namespace tvm {
diff --git a/include/tvm/node/attr_registry_map.h b/include/tvm/node/attr_registry_map.h
index 9c554af9bc21..552aa7114657 100644
--- a/include/tvm/node/attr_registry_map.h
+++ b/include/tvm/node/attr_registry_map.h
@@ -56,9 +56,9 @@ class AttrRegistryMapContainerMap {
* \return the const reference to the content value.
*/
const runtime::TVMRetValue& operator[](const KeyType& key) const {
- CHECK(key.defined());
+ ICHECK(key.defined());
const uint32_t idx = key->AttrRegistryIndex();
- CHECK(idx < data_.size() && data_[idx].second != 0)
+ ICHECK(idx < data_.size() && data_[idx].second != 0)
<< "Attribute " << attr_name_ << " has not been registered for " << key->name;
return data_[idx].first;
}
@@ -71,7 +71,7 @@ class AttrRegistryMapContainerMap {
*/
template <typename ValueType>
ValueType get(const KeyType& key, ValueType def_value) const {
- CHECK(key.defined());
+ ICHECK(key.defined());
const uint32_t idx = key->AttrRegistryIndex();
if (idx < data_.size() && data_[idx].second != 0) {
return data_[idx].first;
diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
index 74dabc168924..209bb9e72f33 100644
--- a/include/tvm/node/container.h
+++ b/include/tvm/node/container.h
@@ -351,7 +351,7 @@ class SmallMapNode : public MapNode,
*/
const mapped_type& at(const key_type& key) const {
iterator itr = find(key);
- CHECK(itr.index < size_) << "IndexError: key is not in Map";
+ ICHECK(itr.index < size_) << "IndexError: key is not in Map";
return itr->second;
}
/*!
@@ -361,7 +361,7 @@ class SmallMapNode : public MapNode,
*/
mapped_type& at(const key_type& key) {
iterator itr = find(key);
- CHECK(itr.index < size_) << "IndexError: key is not in Map";
+ ICHECK(itr.index < size_) << "IndexError: key is not in Map";
return itr->second;
}
/*! \return begin iterator */
@@ -466,7 +466,7 @@ class SmallMapNode : public MapNode,
}
uint64_t next_size = std::max(map_node->slots_ * 2, uint64_t(kInitSize));
next_size = std::min(next_size, uint64_t(kMaxSize));
- CHECK_GT(next_size, map_node->slots_);
+ ICHECK_GT(next_size, map_node->slots_);
ObjectPtr