From 99cf26d8b500001c04f7d88429190a3d52550461 Mon Sep 17 00:00:00 2001
From: Trevor Morris
Date: Wed, 28 Oct 2020 21:14:30 +0000
Subject: [PATCH 001/258] Remove TRT integration

---
 cmake/modules/contrib/TensorRT.cmake          |   53 -
 python/tvm/relay/tensorrt.py                  | 1014 ------------
 src/relay/backend/contrib/tensorrt/README     |   61 -
 .../contrib/tensorrt/codegen_tensorrt.cc      |  125 --
 .../contrib/tensorrt/tensorrt_builder.cc      |  517 -------
 .../contrib/tensorrt/tensorrt_builder.h       |  238 ---
 .../contrib/tensorrt/tensorrt_logger.h        |   73 -
 .../contrib/tensorrt/tensorrt_module.cc       |  389 -----
 .../contrib/tensorrt/tensorrt_module.h        |   47 -
 src/runtime/contrib/tensorrt/tensorrt_ops.h   | 1353 -----------------
 src/runtime/contrib/tensorrt/utils.h          |   97 --
 tests/python/relay/test_tensorrt.py           |  914 ----------
 12 files changed, 4881 deletions(-)
 delete mode 100644 cmake/modules/contrib/TensorRT.cmake
 delete mode 100644 python/tvm/relay/tensorrt.py
 delete mode 100644 src/relay/backend/contrib/tensorrt/README
 delete mode 100644 src/relay/backend/contrib/tensorrt/codegen_tensorrt.cc
 delete mode 100644 src/runtime/contrib/tensorrt/tensorrt_builder.cc
 delete mode 100644 src/runtime/contrib/tensorrt/tensorrt_builder.h
 delete mode 100644 src/runtime/contrib/tensorrt/tensorrt_logger.h
 delete mode 100644 src/runtime/contrib/tensorrt/tensorrt_module.cc
 delete mode 100644 src/runtime/contrib/tensorrt/tensorrt_module.h
 delete mode 100644 src/runtime/contrib/tensorrt/tensorrt_ops.h
 delete mode 100644 src/runtime/contrib/tensorrt/utils.h
 delete mode 100644 tests/python/relay/test_tensorrt.py

diff --git a/cmake/modules/contrib/TensorRT.cmake b/cmake/modules/contrib/TensorRT.cmake
deleted file mode 100644
index 2615f1fe31e1..000000000000
--- a/cmake/modules/contrib/TensorRT.cmake
+++ /dev/null
@@ -1,53 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
- -# TensorRT Runtime -if(USE_TENSORRT) - # Enable codegen as well - SET(USE_TENSORRT_CODEGEN ON) - if(IS_DIRECTORY ${USE_TENSORRT}) - set(TENSORRT_ROOT_DIR ${USE_TENSORRT}) - message(STATUS "Custom TensorRT path: " ${TENSORRT_ROOT_DIR}) - endif() - find_path(TENSORRT_INCLUDE_DIR NvInfer.h HINTS ${TENSORRT_ROOT_DIR} PATH_SUFFIXES include) - find_library(TENSORRT_LIB_DIR nvinfer HINTS ${TENSORRT_ROOT_DIR} PATH_SUFFIXES lib) - find_package_handle_standard_args(TENSORRT DEFAULT_MSG TENSORRT_INCLUDE_DIR TENSORRT_LIB_DIR) - if(NOT TENSORRT_FOUND) - message(ERROR "Could not find TensorRT.") - endif() - message(STATUS "TENSORRT_LIB_DIR: " ${TENSORRT_LIB_DIR}) - include_directories(${TENSORRT_INCLUDE_DIR}) - list(APPEND TVM_RUNTIME_LINKER_LIBS ${TENSORRT_LIB_DIR}) - - # Relay TRT runtime sources - file(GLOB TENSORRT_RELAY_CONTRIB_SRC src/runtime/contrib/tensorrt/*.cc) - list(APPEND RUNTIME_SRCS ${TENSORRT_RELAY_CONTRIB_SRC}) - - # Set defines - add_definitions(-DTVM_GRAPH_RUNTIME_TENSORRT) -endif() -# TensorRT Codegen only. This can be enabled independently of USE_TENSORRT to -# enable compilation of TensorRT modules without requiring TensorRT to be -# installed. The compiled modules will only be able to be executed using a TVM -# built with USE_TENSORRT=ON. -if(USE_TENSORRT_CODEGEN) - message(STATUS "Build with TensorRT codegen") - # Relay TRT codegen sources - file(GLOB TENSORRT_RELAY_CONTRIB_SRC src/relay/backend/contrib/tensorrt/*.cc) - list(APPEND COMPILER_SRCS ${TENSORRT_RELAY_CONTRIB_SRC}) - list(APPEND COMPILER_SRCS src/runtime/contrib/tensorrt/tensorrt_module.cc) -endif() diff --git a/python/tvm/relay/tensorrt.py b/python/tvm/relay/tensorrt.py deleted file mode 100644 index c04a679b1049..000000000000 --- a/python/tvm/relay/tensorrt.py +++ /dev/null @@ -1,1014 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name,arguments-differ,no-else-return,unused-argument,missing-docstring -""" -Relay TensorRT codegen. -""" -import os -import numpy as np -import tvm -import tvm.ir -import tvm.relay.transform as transform -from tvm import relay -from tvm.relay.expr import Call, Constant, Tuple, GlobalVar, Var, TupleGetItem -from tvm.relay.build_module import bind_params_by_name -from tvm.relay.transform import _ffi_api -from tvm.relay.expr_functor import ExprMutator, ExprVisitor - - -class LegalizeLayoutTranform(ExprMutator): - """ - Legalize Relay layout transforms to transpose ops to simplify TensorRT conversion. 
- """ - - def visit_call(self, expr): - visit = super().visit_call(expr) - if expr.op == tvm.relay.op.get("layout_transform"): - src_layout = expr.attrs["src_layout"] - dst_layout = expr.attrs["dst_layout"] - if src_layout == "NCHW" and dst_layout == "NHWC": - return relay.transpose(visit.args[0], axes=[0, 2, 3, 1]) - elif src_layout == "NHWC" and dst_layout == "NCHW": - return relay.transpose(visit.args[0], axes=[0, 3, 1, 2]) - elif src_layout == "NDHWC" and dst_layout == "NCDHW": - return relay.transpose(visit.args[0], axes=[0, 4, 1, 2, 3]) - elif src_layout == "NCDHW" and dst_layout == "NDHWC": - return relay.transpose(visit.args[0], axes=[0, 2, 3, 4, 1]) - elif src_layout == "HWIO" and dst_layout == "OIHW": - return relay.transpose(visit.args[0], axes=[3, 2, 0, 1]) - elif src_layout == "HWOI" and dst_layout == "OIHW": - return relay.transpose(visit.args[0], axes=[2, 3, 0, 1]) - elif src_layout == "HWIO" and dst_layout == "IOHW": - return relay.transpose(visit.args[0], axes=[2, 3, 0, 1]) - return visit - - -class RemoveDropout(ExprMutator): - """ - Removes all nn.dropout from an expr. - """ - - def visit_tuple_getitem(self, expr): - visit = super().visit_tuple_getitem(expr) - if visit.index != 0: - return visit - elif isinstance(visit.tuple_value, Call) and visit.tuple_value.op.name == "nn.dropout": - return visit.tuple_value.args[0] - return visit - - -@transform.function_pass(opt_level=0) -class LegalizeLayoutTranformPass: - def transform_function(self, func, mod, _): - return LegalizeLayoutTranform().visit(func) - - -@transform.function_pass(opt_level=0) -class RemoveDropoutPass: - def transform_function(self, func, mod, _): - return RemoveDropout().visit(func) - - -def GetTrtVersion(): - """Gets the version of TensorRT that TVM is built against. - - Returns - ------- - ret: Tuple[int] - TensorRT version as a tuple of major, minor, and patch number. If TVM - is not built with TensorRT, an empty tuple is returned instead. - """ - return tuple(map(int, _ffi_api.GetTrtVersion())) - - -def IsTrtRuntimeAvailable(): - if not tvm.get_global_func("relay._transform.GetTrtVersion", True): - return False - return GetTrtVersion() != () - - -def check_dynamism(args, op_name): - """ - This function checks for dynamism inside any of the args in the op. - Can be used to offload dynamic ops that are not supported by TRT to - be offloaded to relay VM. - - Raises a NotImplementedError if the type of the arg is not of types - Call, Var, Constant, or TupleGetItem. 
- - Parameters - ---------- - args: a TRT array of the arguments of the op - op_name: name of the op for debugging purposes only - - Returns - ---------- - True if dynamism is present, False otherwise - """ - for arg in args: - if isinstance(arg, (Call, Var, Constant, TupleGetItem)): - for dim_shape in arg.checked_type.shape[1:]: - if isinstance(dim_shape, tvm.tir.expr.Any): - print( - "Dynamic inputs are not supported for TensorRT for ", - op_name, - arg.checked_type.shape, - ) - return True - elif isinstance(arg, Tuple): - return check_dynamism(arg.fields, op_name) - else: - print( - "Arg not supported in TensorRT for ", - op_name, - type(arg), - ) - return True - return False - - -def _register_external_op_helper(op_name, supported=True): - @tvm.ir.register_op_attr(op_name, "target.tensorrt") - def _func_wrapper(attrs, args): - if check_dynamism(args, op_name): - return False - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - # TODO (codeislife99): Here we are excluding multiply calculations which get "batched" in - # implicit batch mode. This leads to wrong or invalid multiply calculations. - # Since the Neo-service uses implicit batch mode=True, this is a temporary workaround. - # A more generalizable workaround is in the works for a future update. - if op_name == "multiply": - shapes = [ - [ - int(x) if not isinstance(x, tvm.tir.expr.Any) else -1 - for x in arg.checked_type.shape - ] - for arg in args - ] - if all( - [list(map(int, shape)) in [[300, 64, 7, 7], [300, 1, 1, 1]] for shape in shapes] - ): - return False - - return supported - - return _func_wrapper - - -def _register_external_op_helper_func(op_name, func, trt_version): - @tvm.ir.register_op_attr(op_name, "target.tensorrt") - def _func_wrapper(attrs, args): - if check_dynamism(args, op_name): - return False - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - return func(attrs, args, op_name, trt_version) - - return _func_wrapper - - -def _register_external_dynamic_check_func(op_name, func): - @tvm.ir.register_op_attr(op_name, "target.tensorrt") - def _func_wrapper(attrs, args): - if check_dynamism(args, op_name): - return False - return func(attrs, args) - - return _func_wrapper - - -def register_tensorrt_annotations(trt_version, use_implicit_batch=True): - if hasattr(register_tensorrt_annotations, "registered"): - # Can't register annotations more than once. - return - register_tensorrt_annotations.registered = True - if not use_implicit_batch and trt_version < (6, 0, 1): - print("Explicit batch mode only available for TRT 6+") - use_implicit_batch = True - # Ops which are always supported - _register_external_op_helper("nn.relu") - _register_external_op_helper("sigmoid") - _register_external_op_helper("tanh") - _register_external_op_helper("subtract") - _register_external_op_helper("multiply") - _register_external_op_helper("divide") - _register_external_op_helper("power") - _register_external_op_helper("maximum") - _register_external_op_helper("minimum") - _register_external_op_helper("exp") - _register_external_op_helper("log") - _register_external_op_helper("sqrt") - _register_external_op_helper("abs") - _register_external_op_helper("negative") - _register_external_op_helper("nn.batch_flatten") - _register_external_op_helper("clip") - # TODO(trevmorr): Temporarily disable split due to TRT bug on xavier. 
- # _register_external_op_helper("split") - # _register_external_op_helper("slice_like") - - def add_whitelist_fn(attrs, args): # pylint: disable=unused-variable - shapes = [ - [int(x) if not isinstance(x, tvm.tir.expr.Any) else -1 for x in arg.checked_type.shape] - for arg in args - ] - - for shape in shapes: - if len(shape) < 1: - return False - - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - - if ( - (isinstance(args[0], Constant) or isinstance(args[1], Constant)) - and shapes[0][0] == shapes[1][0] - and shapes[0][0] != 1 - and (len(shapes[0]) > 3 or len(shapes[1]) > 3) - ): - print("add: bug in TRT with adding batched constants.") - return False - - # Skip this add op in TRT to avoid accuracy mismatch - if all([list(map(int, shape)) == [1, 546, 1, 1] for shape in shapes]): - print("add: bug in TRT with add of shape (1, 546, 1, 1).") - return False - - return True - - def batch_norm_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if int(attrs.axis) != 1 and int(attrs.axis) != 3: - print("nn.batch_norm: axis is {} but must be 1 or 3.".format(int(attrs.axis))) - return False - return True - - def softmax_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if use_implicit_batch and int(attrs.axis) == 0: - print("nn.softmax: can't modify batch dimension.") - return False - return True - - def conv2d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if attrs.data_layout != "NCHW": - print("nn.conv2d: data_layout is {} but must be NCHW.".format(attrs.data_layout)) - return False - if attrs.kernel_layout != "OIHW": - print("nn.conv2d: kernel_layout is {} but must be OIHW.".format(attrs.kernel_layout)) - return False - if attrs.out_layout and attrs.out_layout != "NCHW": - print("nn.conv2d: out_layout is {} but must be NCHW.".format(attrs.out_layout)) - return False - return True - - def dense_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - input_rank = len(args[0].checked_type.shape) - weight_rank = len(args[1].checked_type.shape) - if input_rank < 2 or input_rank > 4: - print("nn.dense: input has rank {} but must be 2, 3 or 4.".format(input_rank)) - return False - if weight_rank != 2: - print("nn.dense: weight has rank {} but must be 2.".format(weight_rank)) - return False - return True - - def bias_add_whitelist_fn(attrs, args): # pylint: disable=unused-variable - # TODO(trevmorr): BiasAddSimplifier creates a pattern which cannot be - # converted to TRT without binding params and constant folding. 
- # if trt_version < (6, 0, 1): - # return False - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - input_rank = len(args[0].checked_type.shape) - if input_rank < 2 or input_rank > 4: - print("nn.bias_add: input rank is {} but must be 2, 3 or 4.".format(input_rank)) - return False - return True - - def max_pool_2d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if attrs.layout != "NCHW": - print("nn.max_pool2d: layout is {} but must be NCHW.".format(attrs.layout)) - return False - if attrs.ceil_mode and trt_version < (5, 1, 5): - print("nn.avg_pool2d: ceil_mode=True requires TensorRT 5.1.5 or greater.") - return False - return True - - def avg_pool_2d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if attrs.layout != "NCHW": - print("nn.avg_pool2d: layout is {} but must be NCHW.".format(attrs.layout)) - return False - if ( - attrs.count_include_pad - and len(attrs.padding) == 4 - and ( - int(attrs.padding[0]) != int(attrs.padding[2]) - or int(attrs.padding[1]) != int(attrs.padding[3]) - ) - ): - print( - "nn.avg_pool2d: inclusive-counted blended or average " - "pooling is not supported in combination with asymmetric padding" - ) - return False - if attrs.ceil_mode and trt_version < (5, 1, 5): - print("nn.avg_pool2d: ceil_mode=True requires TensorRT 5.1.5 or greater.") - return False - return True - - def global_max_pool_2d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if attrs.layout != "NCHW": - print("nn.global_max_pool2d: layout is {} but must be NCHW.".format(attrs.layout)) - return False - return True - - def global_avg_pool_2d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if attrs.layout != "NCHW": - print("nn.global_avg_pool2d: layout is {} but must be NCHW.".format(attrs.layout)) - return False - return True - - def expand_dims_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if use_implicit_batch and int(attrs.axis) == 0: - print("expand_dims: can't modify batch dimension.") - return False - return True - - def squeeze_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if not attrs.axis: - print("squeeze: must explicitly set axis.") - return False - if use_implicit_batch and any([axis == 0 for axis in map(int, attrs.axis)]): - print("squeeze: can't modify batch dimension.") - return False - return True - - def concatenate_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.dtype != "float32" for x in args[0].checked_type.fields]): - print("Only float32 inputs are supported for TensorRT.") - return False - if not use_implicit_batch: - return True - if int(attrs.axis) == 0: - print("concatenate: 
can't modify batch dimension.") - return False - if isinstance(args[0], Tuple): - for tuple_input in args[0].fields: - if isinstance(tuple_input, Constant): - print("concatenate: can't concatenate tensors with constants.") - return False - return True - - def conv2d_transpose_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if attrs.data_layout != "NCHW": - print( - "nn.conv2d_transpose: data_layout is {} but must be NCHW.".format(attrs.data_layout) - ) - return False - if attrs.kernel_layout != "OIHW": - print( - "nn.conv2d_transpose: kernel_layout is {} but must be OIHW.".format( - attrs.kernel_layout - ) - ) - return False - if attrs.out_layout and attrs.out_layout != "NCHW": - print( - "nn.conv2d_transpose: out_layout is {} but must be NCHW.".format(attrs.out_layout) - ) - return False - if attrs.dilation and any([rate != 1 for rate in map(int, attrs.dilation)]): - print("nn.conv2d_transpose: dilation rate must be 1.") - return False - return True - - def transpose_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if use_implicit_batch and int(attrs.axes[0]) != 0: - print("transpose: can't modify batch dimension.") - return False - return True - - def reshape_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if args[0].checked_type.dtype != "float32": - print("Only float32 inputs are supported for TensorRT.") - return False - if any([x < -1 for x in map(int, attrs.newshape)]): - print("reshape: new shape dims must be explicit.") - return False - if use_implicit_batch: - shape = args[0].checked_type.shape - new_shape = attrs.newshape - if len(new_shape) == 0 or len(shape) == 0: - print("reshape: Can't reshape to or from scalar.") - return False - - dynamic_reshape = any([isinstance(x, tvm.tir.expr.Any) for x in shape]) - - if dynamic_reshape: - # Make sure that the batch dim is unmodified. - if int(new_shape[0]) < 0: - for shape_val, new_shape_val in enumerate(shape[1:], new_shape[1:]): - if not ( - isinstance(shape_val, int) - and isinstance(new_shape_val, int) - and int(shape_val) == int(new_shape_val) - ): - return False - elif int(new_shape[0]) > 0: - if not ( - isinstance(shape[0], int) - and isinstance(new_shape[0], int) - and int(shape[0]) == int(new_shape[0]) - ): - return False - return True - else: - shape = list(map(int, shape)) - new_shape = list(map(int, new_shape)) - - # TRT cannot modify batch dimension. - original_volume = np.prod(shape) - # First, resolve 0. - for i, value in enumerate(new_shape): - if value == 0: - new_shape[i] = shape[i] - # Resolve -1. 
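            # e.g. shape=[2, 3, 4], newshape=[0, -1] (values illustrative):
            #   0 copies the input dim                        -> [2, -1]
            #   -1 absorbs the remaining volume: 24 // 2 = 12 -> [2, 12]
            # The batch dim stays 2, so the comparison below succeeds.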
- for i, value in enumerate(new_shape): - if value == -1: - new_shape[i] = original_volume // np.prod([x for x in new_shape if x != -1]) - # Remove batch dimension and see if volumes match - if shape[0] != new_shape[0]: - print("reshape: can't modify batch dimension.") - return False - return True - - def pad_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if attrs.pad_mode != "constant": - print("nn.pad: pad mode is {} but must be constant.".format(attrs.pad_mode)) - return False - if float(attrs.pad_value) != 0.0: - print("nn.pad: pad value is {} but must be 0.0.".format(float(attrs.pad_value))) - return False - return True - - def reduce_whitelist_fn(attrs, args, op_name, trt_version): - if not attrs.axis or len(attrs.axis) == 0: - print("{}: cannot reduce to scalar.".format(op_name)) - return False - if attrs.exclude: - print("{}: exclude not supported.".format(op_name)) - return False - if use_implicit_batch and any([x == 0 for x in map(int, attrs.axis)]): - print("{}: can't modify batch dimension.".format(op_name)) - return False - return True - - def trt_5_1_5_whitelist_fn(attrs, args, op_name, trt_version): - if trt_version < (5, 1, 5): - print("{}: requires TensorRT version 5.1.5 or higher.".format(op_name)) - return False - return True - - _register_external_op_helper_func("sum", reduce_whitelist_fn, trt_version) - _register_external_op_helper_func("prod", reduce_whitelist_fn, trt_version) - _register_external_op_helper_func("max", reduce_whitelist_fn, trt_version) - _register_external_op_helper_func("min", reduce_whitelist_fn, trt_version) - _register_external_op_helper_func("mean", reduce_whitelist_fn, trt_version) - _register_external_op_helper_func("nn.leaky_relu", trt_5_1_5_whitelist_fn, trt_version) - _register_external_op_helper_func("sin", trt_5_1_5_whitelist_fn, trt_version) - _register_external_op_helper_func("cos", trt_5_1_5_whitelist_fn, trt_version) - _register_external_op_helper_func("atan", trt_5_1_5_whitelist_fn, trt_version) - _register_external_op_helper_func("ceil", trt_5_1_5_whitelist_fn, trt_version) - - def strided_slice_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if trt_version < (5, 1, 5): - print("strided_slice: requires TensorRT version 5.1.5 or higher.") - return False - if args[0].checked_type.dtype != "float32": - print("strided_slice: only fp32 inputs are supported.") - return False - if use_implicit_batch: - batch_dim_begin_modified = attrs.begin[0] is not None and int(attrs.begin[0]) != 0 - batch_dim_end_modified = ( - attrs.end[0] is not None - and int(attrs.end[0]) != -1 - and int(attrs.end[0]) != int(args[0].checked_type.shape[0]) - ) - if batch_dim_begin_modified or batch_dim_end_modified: - print("strided_slice: can't modify batch dimension.") - return False - if any([x is not None and x <= 0 for x in attrs.strides]): - print("strided_slice: stride must be positive") - return False - for i in range(0, len(args[0].checked_type.shape)): - begin = int(attrs.begin[i]) - end = ( - int(attrs.end[i]) - if attrs.end[i] is not None and int(attrs.end[i]) != -1 - else args[0].checked_type.shape[i] - ) - if int(end) - int(begin) < 1: - print("strided_slice: size of slice must be at least 1") - return False - return True - - def resize_whitelist_fn(attrs, 
args): # pylint: disable=unused-variable - # TODO(trevmorr): Output does not match TVM. Disable. - return False - - def adapative_max_pool2d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]): - print("nn.adaptive_max_pool2d: output size must be (1, 1).") - return False - return True - - def adapative_avg_pool2d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]): - print("nn.adaptive_avg_pool2d: output size must be (1, 1).") - return False - return True - - def upsampling_whitelist_fn(attrs, args): # pylint: disable=unused-variable - # TODO(trevmorr): Output does not match TVM. Disable. - return False - - def conv3d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if trt_version < (6, 0, 1): - print("nn.conv3d: requires TensorRT version 6.0.1 or higher.") - return False - if attrs.data_layout != "NCDHW": - print("nn.conv3d: data_layout is {} but must be NCDHW.".format(attrs.data_layout)) - return False - if attrs.kernel_layout != "OIDHW": - print("nn.conv3d: kernel_layout is {} but must be OIDHW.".format(attrs.kernel_layout)) - return False - if attrs.out_layout and attrs.out_layout != "NCDHW": - print("nn.conv3d: out_layout is {} but must be NCDHW.".format(attrs.out_layout)) - return False - return True - - def max_pool_3d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if trt_version < (6, 0, 1): - print("nn.max_pool3d: requires TensorRT version 6.0.1 or higher.") - return False - if attrs.layout != "NCDHW": - print("nn.max_pool3d: layout is {} but must be NCDHW.".format(attrs.layout)) - return False - return True - - def avg_pool_3d_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if trt_version < (6, 0, 1): - print("nn.avg_pool3d: requires TensorRT version 6.0.1 or higher.") - return False - if attrs.layout != "NCDHW": - print("nn.avg_pool3d: layout is {} but must be NCDHW.".format(attrs.layout)) - return False - return True - - def conv3d_transpose_whitelist_fn(attrs, args): # pylint: disable=unused-variable - if any([x.checked_type.dtype != "float32" for x in args]): - print("Only float32 inputs are supported for TensorRT.") - return False - if trt_version < (6, 0, 1): - print("nn.conv3d_transpose: requires TensorRT version 6.0.1 or higher.") - return False - if attrs.data_layout != "NCDHW": - print( - "nn.conv3d_transpose: data_layout is {} but must be NCDHW.".format( - attrs.data_layout - ) - ) - return False - if attrs.kernel_layout != "OIDHW": - print( - "nn.conv3d_transpose: kernel_layout is {} but must be OIDHW.".format( - attrs.kernel_layout - ) - ) - return False - if attrs.out_layout and attrs.out_layout != "NCDHW": - print( - "nn.conv3d_transpose: out_layout is {} but 
must be NCDHW.".format(attrs.out_layout) - ) - return False - if attrs.dilation and any([rate != 1 for rate in map(int, attrs.dilation)]): - print("nn.conv3d_transpose: dilation rate must be 1.") - return False - if attrs.output_padding and any([x != 0 for x in map(int, attrs.output_padding)]): - print("nn.conv3d_transpose: output padding is not supported.") - return False - return True - - _register_external_dynamic_check_func("add", add_whitelist_fn) - _register_external_dynamic_check_func("nn.batch_norm", batch_norm_whitelist_fn) - _register_external_dynamic_check_func("nn.softmax", softmax_whitelist_fn) - _register_external_dynamic_check_func("nn.conv2d", conv2d_whitelist_fn) - _register_external_dynamic_check_func("nn.dense", dense_whitelist_fn) - _register_external_dynamic_check_func("nn.bias_add", bias_add_whitelist_fn) - _register_external_dynamic_check_func("nn.max_pool2d", max_pool_2d_whitelist_fn) - _register_external_dynamic_check_func("nn.avg_pool2d", avg_pool_2d_whitelist_fn) - _register_external_dynamic_check_func("nn.global_max_pool2d", global_max_pool_2d_whitelist_fn) - _register_external_dynamic_check_func("nn.global_avg_pool2d", global_avg_pool_2d_whitelist_fn) - _register_external_dynamic_check_func("expand_dims", expand_dims_whitelist_fn) - _register_external_dynamic_check_func("squeeze", squeeze_whitelist_fn) - _register_external_dynamic_check_func("concatenate", concatenate_whitelist_fn) - _register_external_dynamic_check_func("nn.conv2d_transpose", conv2d_transpose_whitelist_fn) - _register_external_dynamic_check_func("transpose", transpose_whitelist_fn) - _register_external_dynamic_check_func("reshape", reshape_whitelist_fn) - _register_external_dynamic_check_func("nn.pad", pad_whitelist_fn) - _register_external_dynamic_check_func("strided_slice", strided_slice_whitelist_fn) - _register_external_dynamic_check_func("image.resize", resize_whitelist_fn) - _register_external_dynamic_check_func( - "nn.adaptive_max_pool2d", adapative_max_pool2d_whitelist_fn - ) - _register_external_dynamic_check_func( - "nn.adaptive_avg_pool2d", adapative_avg_pool2d_whitelist_fn - ) - _register_external_dynamic_check_func("nn.upsampling", upsampling_whitelist_fn) - _register_external_dynamic_check_func("nn.conv3d", conv3d_whitelist_fn) - _register_external_dynamic_check_func("nn.max_pool3d", max_pool_3d_whitelist_fn) - _register_external_dynamic_check_func("nn.avg_pool3d", avg_pool_3d_whitelist_fn) - _register_external_dynamic_check_func("nn.conv3d_transpose", conv3d_transpose_whitelist_fn) - - -class VarReplacer(ExprMutator): - """ - Visit an expression while replacing vars according to var_map. Used by - SubgraphRemover/PruneSubgraphs to return a subgraph originally partitioned to TRT back to TVM. - """ - - def __init__(self, var_map): - ExprMutator.__init__(self) - self.var_map = var_map - - def visit_var(self, var): - if var in self.var_map: - return self.var_map[var] - return super().visit_var(var) - - -class SubgraphRemover(ExprMutator): - """ - Reverts subgraphs in subgraphs_to_remove back to TVM instead of using an external codegen. - """ - - def __init__(self, subgraphs_to_remove, mod, new_mod): - ExprMutator.__init__(self) - self.subgraphs_to_remove = subgraphs_to_remove - self.mod = mod - self.new_mod = new_mod - - def visit_call(self, call): - if isinstance(call.op, GlobalVar): - name = call.op.name_hint - if name in self.subgraphs_to_remove: - # "Inline" the subgraph back into new main function. 
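                # That is: fetch the partitioned function, bind each of its
                # parameters to the corresponding (already visited) call
                # argument, and splice the rewritten body into the new main
                # in place of the call.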
- func = self.mod[name] - var_map = {} - for arg, param in zip(call.args, func.params): - var_map[param] = super().visit(arg) - new_body = VarReplacer(var_map).visit(func.body) - return new_body - elif name != "main": - # Copy the GlobalVar (subgraph function) to the new module and call. - args = [] - for arg in call.args: - args.append(super().visit(arg)) - return call.op(*args) - - return super().visit_call(call) - - -class IsComputeIntensiveGraph(ExprVisitor): - """ - Visits the Graph recursively and checks if it contains compute heavy ops like convolutions and - its transpose, dense and batch mat-mul. - """ - - def __init__(self): - ExprVisitor.__init__(self) - self.is_compute_intensive = False - - def visit_call(self, call): - heavy_ops = set( - [ - "nn.conv2d", - "nn.conv2d_transpose", - "nn.conv3d", - "nn.conv3d_transpose", - "nn.dense", - "nn.batch_matmul", - ] - ) - if isinstance(call.op, tvm.tir.op.Op): - if str(call.op) in heavy_ops: - self.is_compute_intensive = True - - return super().visit_call(call) - - def is_graph_compute_intensive(self, subgraph): - self.visit(subgraph) - return self.is_compute_intensive - - -def PruneSubgraphs(mod, compiler="tensorrt", use_implicit_batch=True, prune_no_macs=False): - """ - If use_implicit_batch is True, removes subgraphs which were originally partitioned for TRT - that are incompatible with implicit batch mode. - If prune_no_macs is True, also remove subgraph if the number of multiply-accumulates is 0. - This is a heuristic which can improve performance by around 5% because TVM provides better - optimization for certain ops. - - Parameters - ---------- - mod: Module - The module which has been partitioned for tensorrt compiler. - - compiler : str - Compiler string, should be "tensorrt". - - use_implicit_batch : bool - Which mode we plan to use for TensorRT. Will be used to determine which subgraphs are - valid. In implicit batch mode, all inputs to a subgraph must have the same batch size. - - prune_no_macs : bool - Whether to also remove subgraphs which have no multiple-accumulate operations. - - Returns - ------- - mod: Module - The modified module which has pruned subgraphs reverted back to TVM. - """ - subgraphs_to_remove = [] - - def is_valid_subgraph(func): - """Whether a subgraph is valid in TRT. - - Returns - ------- - compatible : bool - True if the subgraph is compatible with TRT. 
- """ - if not use_implicit_batch: - return True - input_batch_sizes = [] - for var in func.params: - # In implicit batch mode, all inputs must have same batch size - if isinstance(var.checked_type, relay.TupleType): - for tupe_type in var.checked_type.fields: - # Scalar inputs not allowed - if len(tupe_type.shape) == 0: - return False - if not isinstance(tupe_type.shape[0], tvm.tir.expr.Any): - input_batch_sizes.append(int(tupe_type.shape[0])) - else: - # Scalar inputs not allowed - if len(var.checked_type.shape) == 0: - return False - if not isinstance(var.checked_type.shape[0], tvm.tir.expr.Any): - input_batch_sizes.append(int(var.checked_type.shape[0])) - if len(input_batch_sizes) > 1 and any( - [x != input_batch_sizes[0] for x in input_batch_sizes[1:]] - ): - return False - return True - - # Remove invalid subgraphs - for subgraph in mod.get_global_vars(): - name = subgraph.name_hint - if ( - mod[name].attrs - and hasattr(mod[name].attrs, "SkipOptimization") - and mod[name].attrs["SkipOptimization"] == 1 - ): - continue - if not mod[name].attrs or mod[name].attrs["Compiler"] != compiler: - continue - if not is_valid_subgraph(mod[name]): - subgraphs_to_remove.append(name) - - # Remove subgraphs with no multiply-accumulates - if prune_no_macs: - subgraph_with_compute_intensive_filter = [] - for subgraph in mod.get_global_vars(): - name = subgraph.name_hint - if ( - mod[name].attrs - and hasattr(mod[name].attrs, "SkipOptimization") - and mod[name].attrs["SkipOptimization"] == 1 - ): - continue - if not mod[name].attrs or mod[name].attrs["Compiler"] != compiler: - continue - is_compute_intensive = IsComputeIntensiveGraph().is_graph_compute_intensive(mod[name]) - subgraph_with_compute_intensive_filter.append([name, is_compute_intensive]) - print("Subgraphs with compute heavy filter", subgraph_with_compute_intensive_filter) - subgraphs_to_remove.extend( - [ - name - for name, is_compute_intensive in subgraph_with_compute_intensive_filter - if not is_compute_intensive - ] - ) - if len(subgraphs_to_remove) == 0: - return mod - print("Will remove these subgraphs:", subgraphs_to_remove) - - # Create new pruned module with functions and type defns from mod - new_mod = tvm.IRModule(mod.functions, mod.type_definitions) - new_mod["main"] = SubgraphRemover(subgraphs_to_remove, mod, new_mod).visit(mod["main"]) - return new_mod - - -def EnableTrt( - mod, - params=None, - trt_version=None, - use_implicit_batch=True, - max_workspace_size=1 << 30, - prune_subgraphs=False, -): - """Converts the "main" function in the module into one that can be executed using - TensorRT. If any of the operators are not supported by the TensorRT - conversion, the unmodified program will be returned instead. - - Parameters - ---------- - mod: Module - The original module. - - params : dict of str to NDArray - Input parameters to the graph that do not change - during inference time. Used for constant folding. - - trt_version : Optional[Tuple[int]] - Which version of TensorRT to target for partitioning as a tuple of - (major, minor, patch). If not specified, will attempt to get using - GetTrtVersion. - - use_implicit_batch : bool - If false, will use explicit batch mode. Explicit batch mode is - available in TRT 6+. It increases operator coverage but comes at a - performance penalty. - - max_workspace_size : int - Number of bytes for TensorRT workspace size. - - prune_subgraphs : bool - If true, will prune subgraphs with 0 MACS and run them with TVM instead. 
-
-    Returns
-    -------
-    mod: Module
-        The modified module which will use the TensorRT runtime if compatible.
-    """
-    if not trt_version:
-        trt_version = GetTrtVersion()
-        # If TVM wasn't built against TRT, default to target TRT 6. Since the
-        # actual conversion to TRT is done at runtime, building against TRT is
-        # not required for compilation.
-        if not trt_version:
-            trt_version = (6, 0, 1)
-    assert isinstance(trt_version, (list, tuple))
-    assert len(trt_version) == 3
-
-    register_tensorrt_annotations(trt_version, use_implicit_batch=use_implicit_batch)
-
-    def _set_optimization_attr(mod, skip_optimization=1):
-        """
-        Prepare the mod such that all functions except main are tagged for SkipOptimization.
-        :param mod: input TRT mod
-        :param skip_optimization: flag to set SkipOptimization for all functions except main
-        :return: updated mod
-        """
-        gvs = mod.get_global_vars()
-        for gv in gvs:
-            func = mod[gv]
-            name = gv.name_hint
-            if name != "main":
-                new_func = func.with_attr(
-                    "SkipOptimization", tvm.tir.IntImm("int32", skip_optimization)
-                )
-                mod.update_func(gv, new_func)
-        return mod
-
-    if params:
-        # Bind params so that we can use FoldConstant.
-        mod["main"] = bind_params_by_name(mod["main"], params)
-    # Apply passes required for TRT.
-    mod = transform.InferType()(mod)
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            RemoveDropoutPass(),
-            transform.RemoveUnusedFunctions(),
-            transform.ConvertLayout(
-                {
-                    "nn.conv2d": ["NCHW", "default"],
-                    "nn.conv2d_transpose": ["NCHW", "default"],
-                    "nn.conv3d": ["NCDHW", "default"],
-                }
-            ),
-            transform.FoldConstant(),
-            LegalizeLayoutTranformPass(),
-            transform.InferType(),
-        ]
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-
-    # Set SkipOptimization for all functions but main for the following passes.
-    mod = _set_optimization_attr(mod, skip_optimization=1)
-    seq = tvm.transform.Sequential(
-        [
-            transform.AnnotateTarget("tensorrt"),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-            transform.InferType(),
-        ]
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        mod = seq(mod)
-    mod = PruneSubgraphs(mod, use_implicit_batch=use_implicit_batch, prune_no_macs=prune_subgraphs)
-
-    # Set SkipOptimization back to 0.
-    mod = _set_optimization_attr(mod, skip_optimization=0)
-
-    # Set environment variables used to communicate with the TensorRT module.
-    os.environ["TVM_TENSORRT_MAX_WORKSPACE_SIZE"] = str(max_workspace_size)
-    os.environ["TVM_TENSORRT_USE_IMPLICIT_BATCH"] = str(int(use_implicit_batch))
-    return mod
diff --git a/src/relay/backend/contrib/tensorrt/README b/src/relay/backend/contrib/tensorrt/README
deleted file mode 100644
index 09d03a157acd..000000000000
--- a/src/relay/backend/contrib/tensorrt/README
+++ /dev/null
@@ -1,61 +0,0 @@
-# Relay TensorRT Integration
-
-Currently, Relay/TRT integration only works when the entire model can be converted to TRT. It is enabled with the `EnableTrt` pass. If any op in the model cannot be converted to TRT, `EnableTrt` will return the original module unmodified.
-
-# How to use
-1. Build TVM with the cmake flag `USE_TENSORRT=ON` or `USE_TENSORRT=/path/to/TensorRT`. USE_CUDA should be enabled as well.
-
-2. Convert the model into TensorRT. This step will determine if every node in the graph can be converted to TensorRT and, if so, will mark the graph to use TensorRT and apply some specific optimization passes.
-```python
-import tvm.relay.tensorrt
-mod = relay.tensorrt.EnableTrt(mod, params)
-```
-
-3. Check if TRT was enabled. If not, it means some op in the graph is not supported by the TensorRT conversion. EnableTrt will output which particular ops are not supported and why.
-```python
-assert mod['main'].attrs and mod['main'].attrs.Compiler == 'tensorrt'
-```
-
-4. Finish compilation.
-```python
-with relay.build_config(opt_level=2, disabled_pass={"SimplifyInference"}):
-    graph, lib, params = relay.build(mod, "cuda", params=params)
-```
-
-5. (Optional) Serialize/deserialize the compiled model. The model will be serialized to three files: `compiled.json`, `compiled.params`, and `compiled.tensorrt`.
-```python
-# Serialize
-with open('compiled.json', 'w') as f_graph_json:
-    f_graph_json.write(graph)
-with open('compiled.params', 'wb') as f_params:
-    f_params.write(relay.save_param_dict(params))
-lib.save('compiled.tensorrt')
-
-# Deserialize
-with open('compiled.json', 'r') as f_graph_json:
-    graph = f_graph_json.read()
-with open('compiled.params', 'rb') as f_params:
-    params = tvm.relay.load_param_dict(f_params.read())
-lib = tvm.runtime.load_module("compiled.tensorrt")
-```
-
-6. Run inference. The first invocation will trigger creation of the TensorRT engine. This could take up to a few minutes.
-```python
-# Create graph runtime
-mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
-mod.set_input(**params)
-
-i_data = np.random.uniform(0, 1, input_shape).astype(dtype)
-# Build TensorRT engine
-mod.run(data=i_data)
-
-# Run inference
-mod.run(data=i_data)
-res = mod.get_output(0)
-```
-
-
-The tests in `tests/python/relay/test_tensorrt.py` provide some deeper examples of how to use this feature.
-
-The NNVM/TRT integration is still present.
\ No newline at end of file
diff --git a/src/relay/backend/contrib/tensorrt/codegen_tensorrt.cc b/src/relay/backend/contrib/tensorrt/codegen_tensorrt.cc
deleted file mode 100644
index a627611fa905..000000000000
--- a/src/relay/backend/contrib/tensorrt/codegen_tensorrt.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/relay/backend/contrib/tensorrt/codegen_tensorrt.cc
- * \brief Implementation of TensorRT codegen APIs.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-
-#include "../../../../runtime/contrib/tensorrt/tensorrt_module.h"
-#include "../../utils.h"
-#include "../codegen_c/codegen_c.h"
-#if TVM_GRAPH_RUNTIME_TENSORRT
-#include "NvInfer.h"
-#endif // TVM_GRAPH_RUNTIME_TENSORRT
-
-namespace tvm {
-namespace relay {
-namespace contrib {
-
-using namespace backend;
-
-/*!
- * \brief Generates a TensorRTModule from a relay expression. This "compilation"
- * does not require TensorRT since the actual conversion using TensorRT APIs is
- * deferred until runtime.
This step simply serializes the relay functions into - * strings. - */ -class TensorRTModuleCodegen : public CSourceModuleCodegenBase { - public: - /*! - * \brief Serializes a function and stores it in serialized_subgraphs_ so that - * it can be included in the TensorRT module. - * \param func A relay function to add to the TensorRT module. - */ - void GenFunc(const Function& func) { - CHECK(func.defined()) << "Input error: expect a Relay function."; - // Record the external symbol for runtime lookup. - auto sid = GetExtSymbol(func); - serialized_subgraphs_[sid] = SaveJSON(func); - } - - /*! - * \brief Creates the TensorRT module from the Relay function or IRModule. - * \param ref An object ref that could be either a Relay function or IRModule. - * \return The TensorRT runtime module. - */ - runtime::Module CreateCSourceModule(const ObjectRef& ref) override { - if (ref->IsInstance()) { - GenFunc(Downcast(ref)); - } else if (ref->IsInstance()) { - IRModule mod = Downcast(ref); - for (const auto& it : mod->functions) { - GenFunc(Downcast(it.second)); - } - } else { - LOG(FATAL) - << "The input ref is expected to be a Relay function or module."; - } - return runtime::TensorRTModuleCreate(serialized_subgraphs_); - } - - private: - /*! \brief Map of external symbol to serialized Relay functions. */ - std::unordered_map serialized_subgraphs_; -}; - -/*! - * \brief The external compiler/codegen tool. It takes a Relay expression/module - * and compiles it into a runtime module. - */ -runtime::Module TrtCompiler(const ObjectRef& ref) { - TensorRTModuleCodegen tensorrt; - return tensorrt.CreateCSourceModule(ref); -} - -TVM_REGISTER_GLOBAL("relay.ext.tensorrt").set_body_typed(TrtCompiler); - -/*! - * \brief Get TensorRT version that TVM was compiled against. - * \return TensorRT version as a list of [major, minor, patch], or an empty list - * if not compiled against TensorRT. - */ -Array GetTrtVersion() { -#if TVM_GRAPH_RUNTIME_TENSORRT - return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), - Integer(NV_TENSORRT_PATCH)}; -#else - return {}; -#endif // TVM_GRAPH_RUNTIME_TENSORRT -} - -TVM_REGISTER_GLOBAL("relay._transform.GetTrtVersion") - .set_body_typed(GetTrtVersion); - -} // namespace contrib -} // namespace relay -} // namespace tvm diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc deleted file mode 100644 index 9180bd7b5ff4..000000000000 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ /dev/null @@ -1,517 +0,0 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file runtime/contrib/tensorrt/tensorrt_builder.cc - * \brief Contains TensorRTBuilder class which can be used to convert a relay - * program into a TRT engine which can be used for inference. - */ - -#include "tensorrt_builder.h" - -#include -#include - -#include "tensorrt_logger.h" -#include "tensorrt_ops.h" -#include "utils.h" - -namespace tvm { -namespace relay { -namespace contrib { - -const std::shared_ptr>> -GetOpConverters() { - static auto map = - std::make_shared>>(); - if (!map->empty()) return map; - map->emplace("nn.relu", std::make_shared()); - map->emplace("sigmoid", std::make_shared()); - map->emplace("tanh", std::make_shared()); - map->emplace("nn.batch_norm", std::make_shared()); - map->emplace("nn.softmax", std::make_shared()); - map->emplace("nn.conv2d", std::make_shared()); - map->emplace("nn.dense", std::make_shared()); - map->emplace("nn.bias_add", std::make_shared()); - map->emplace("add", std::make_shared()); - map->emplace("subtract", std::make_shared()); - map->emplace("multiply", std::make_shared()); - map->emplace("divide", std::make_shared()); - map->emplace("power", std::make_shared()); - map->emplace("maximum", std::make_shared()); - map->emplace("minimum", std::make_shared()); - map->emplace("nn.max_pool2d", std::make_shared()); - map->emplace("nn.avg_pool2d", std::make_shared()); - map->emplace("nn.global_max_pool2d", std::make_shared()); - map->emplace("nn.global_avg_pool2d", std::make_shared()); - map->emplace("exp", std::make_shared()); - map->emplace("log", std::make_shared()); - map->emplace("sqrt", std::make_shared()); - map->emplace("abs", std::make_shared()); - map->emplace("negative", std::make_shared()); - map->emplace("nn.batch_flatten", std::make_shared()); - map->emplace("expand_dims", std::make_shared()); - map->emplace("squeeze", std::make_shared()); - map->emplace("concatenate", std::make_shared()); - map->emplace("nn.conv2d_transpose", std::make_shared()); - map->emplace("transpose", std::make_shared()); - map->emplace("reshape", std::make_shared()); - map->emplace("nn.pad", std::make_shared()); - map->emplace("sum", std::make_shared()); - map->emplace("prod", std::make_shared()); - map->emplace("max", std::make_shared()); - map->emplace("min", std::make_shared()); - map->emplace("mean", std::make_shared()); - map->emplace("nn.adaptive_max_pool2d", std::make_shared()); - map->emplace("nn.adaptive_avg_pool2d", std::make_shared()); -#if TRT_VERSION_GE(5, 1, 5) - map->emplace("clip", std::make_shared()); - map->emplace("nn.leaky_relu", std::make_shared()); - map->emplace("sin", std::make_shared()); - map->emplace("cos", std::make_shared()); - map->emplace("atan", std::make_shared()); - map->emplace("ceil", std::make_shared()); - map->emplace("floor", std::make_shared()); - map->emplace("strided_slice", std::make_shared()); - map->emplace("split", std::make_shared()); -#else - map->emplace("clip", std::make_shared()); -#endif -#if TRT_VERSION_GE(6, 0, 1) - map->emplace("image.resize", std::make_shared()); - map->emplace("nn.upsampling", std::make_shared()); - map->emplace("nn.conv3d", std::make_shared()); - map->emplace("nn.max_pool3d", std::make_shared()); - map->emplace("nn.avg_pool3d", std::make_shared()); - map->emplace("nn.conv3d_transpose", std::make_shared()); -#endif - return map; -} - -TensorRTBuilder::TensorRTBuilder(runtime::TensorRTLogger* logger, - const std::vector& args, size_t max_workspace_size, - bool use_implicit_batch) - : execution_args_(args), - max_workspace_size_(max_workspace_size), - 
use_implicit_batch_(use_implicit_batch) { - // Create TRT builder and network. - builder_ = nvinfer1::createInferBuilder(*logger); -#if TRT_VERSION_GE(6, 0, 1) - // Use INetworkV2. - auto flags = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - if (use_implicit_batch_) { - flags = 0U; - batch_size_ = args[0]->shape[0]; - builder_->setMaxBatchSize(batch_size_); - } - network_ = builder_->createNetworkV2(flags); -#else - // Use INetwork with implicit batch. - batch_size_ = args[0]->shape[0]; - builder_->setMaxBatchSize(batch_size_); - builder_->setMaxWorkspaceSize(max_workspace_size_); - const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false); - builder_->setFp16Mode(use_fp16); - network_ = builder_->createNetwork(); -#endif -} - -void TensorRTBuilder::ProcessInputs(const Function& func) { - // All input names in order. This order matches that of execution args. - for (size_t i = 0; i < func->params.size(); i++) { - network_input_names_.push_back(func->params[i]->name_hint()); - network_input_map_[func->params[i]->name_hint()] = i; - } - // Assume all inputs are real to start. If an input is baked into the TRT - // engine, we will set the entry in this array to true. - network_input_is_baked_.assign(func->params.size(), false); -} - -void TensorRTBuilder::ProcessOutputs(const Expr& expr) { - // Mark outputs. - auto it = node_output_map_.find(expr.operator->()); - CHECK(it != node_output_map_.end()) << "Output was not found."; - auto network_outputs = it->second; - for (size_t i = 0; i < network_outputs.size(); ++i) { - CHECK(network_outputs[i].type == kTensor); - auto out_tensor = network_outputs[i].tensor; - std::string output_name = "tensorrt_output" + std::to_string(i); - // If the network is already marked as an output, make a copy to avoid TRT crash. This shouldn't - // happen since duplicate output issue in partitioning was fixed. - if (out_tensor->isNetworkOutput()) { - LOG(WARNING) << output_name << " is a duplicate output."; - out_tensor = network_->addIdentity(*out_tensor)->getOutput(0); - } else if (out_tensor->isNetworkInput()) { - LOG(WARNING) << output_name << " is also an input."; - out_tensor = network_->addIdentity(*out_tensor)->getOutput(0); - } - out_tensor->setName(output_name.c_str()); - network_output_names_.push_back(output_name); - network_->markOutput(*out_tensor); - DLOG(INFO) << "Added TRT network output: " << out_tensor->getName() << " -> " << output_name; - } -} - -runtime::TrtEngineAndContext TensorRTBuilder::BuildEngine(const Function& func) { - // Process graph to create INetworkDefinition. - ProcessInputs(func); - VisitExpr(func->body); - ProcessOutputs(func->body); -// Build engine. -#if TRT_VERSION_GE(6, 0, 1) - config_ = builder_->createBuilderConfig(); - config_->setMaxWorkspaceSize(max_workspace_size_); - if (dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false)) { - config_->setFlag(nvinfer1::BuilderFlag::kFP16); - } - // Add profiles. 
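  // Note (illustrative): explicit-batch engines require an optimization
  // profile with kMIN/kOPT/kMAX dimensions for every input. Because the
  // engine is built lazily from the concrete shapes of the first
  // invocation's args, the same static dims serve as min, opt, and max.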
- if (!use_implicit_batch_) { - auto profile = builder_->createOptimizationProfile(); - for (int i = 0; i < network_->getNbInputs(); ++i) { - auto name = network_->getInput(i)->getName(); - auto dims = network_->getInput(i)->getDimensions(); - profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, dims); - profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, dims); - profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, dims); - } - config_->addOptimizationProfile(profile); - } - nvinfer1::ICudaEngine* engine = builder_->buildEngineWithConfig(*network_, *config_); -#else - nvinfer1::ICudaEngine* engine = builder_->buildCudaEngine(*network_); -#endif - CleanUp(); - const int num_input_bindings = - std::count(network_input_is_baked_.begin(), network_input_is_baked_.end(), false); - CHECK_EQ(engine->getNbBindings(), num_input_bindings + network_output_names_.size()); - nvinfer1::IExecutionContext* context = engine->createExecutionContext(); - std::vector device_buffers; - device_buffers.resize(engine->getNbBindings()); - - for (size_t i = 0; i < network_input_names_.size(); i++) { - if (network_input_is_baked_[i] || execution_args_[i]->ctx.device_type == kDLGPU) { - continue; - } else { - int binding_index = engine->getBindingIndex(network_input_names_[i].c_str()); - std::vector shape(execution_args_[i]->shape, - execution_args_[i]->shape + execution_args_[i]->ndim); - device_buffers[binding_index] = - runtime::NDArray::Empty(shape, execution_args_[i]->dtype, {kDLGPU, 0}); - } - } - - for (size_t i = 0; i < network_output_names_.size(); i++) { - int index_in_args = execution_args_.size() - network_output_names_.size() + i; - if (execution_args_[index_in_args]->ctx.device_type == kDLGPU) { - continue; - } else { - int binding_index = engine->getBindingIndex(network_output_names_[i].c_str()); - std::vector shape( - execution_args_[index_in_args]->shape, - execution_args_[index_in_args]->shape + execution_args_[index_in_args]->ndim); - device_buffers[binding_index] = - runtime::NDArray::Empty(shape, execution_args_[index_in_args]->dtype, {kDLGPU, 0}); - } - } - return { - engine, context, network_input_names_, network_input_is_baked_, network_output_names_, - device_buffers}; -} - -nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(DLTensor* dptr, DLDeviceType src_device) { - CHECK_EQ(dptr->ctx.device_type, src_device); - CHECK(static_cast(dptr->dtype.code) == kDLFloat || - static_cast(dptr->dtype.code) == kDLInt); - const auto trt_dtype = static_cast(dptr->dtype.code) == kDLFloat - ? nvinfer1::DataType::kFLOAT - : nvinfer1::DataType::kINT32; - const size_t weight_bytes = runtime::GetDataSize(*dptr); - nvinfer1::Weights weight{trt_dtype, nullptr, 0}; - size_t count = 1; - for (tvm_index_t i = 0; i < dptr->ndim; ++i) { - count *= dptr->shape[i]; - } - CHECK_EQ(count * 4, weight_bytes); - weight.count = count; - weight.values = new float[count]; - CHECK_EQ(TVMArrayCopyToBytes(dptr, const_cast(weight.values), weight_bytes), 0) - << TVMGetLastError(); - trt_weights_.push_back(weight); - return weight; -} - -nvinfer1::Weights TensorRTBuilder::GetNdArrayAsWeights(const runtime::NDArray& array, - DLDeviceType src_device) { - DLTensor* dptr = const_cast(array.operator->()); - return GetDLTensorAsWeights(dptr, src_device); -} - -void TensorRTBuilder::GetInputAsWeights(const VarNode* node) { - const int var_node_idx = network_input_map_[node->name_hint()]; - // This input will be baked into TensorRT engine using value from first invocation. 
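  // "Baked" means the value is copied into the network as constant weights
  // instead of being exposed as an engine binding, so this argument's value
  // from later invocations is ignored (hence the binding-count arithmetic in
  // BuildEngine above).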
- network_input_is_baked_[var_node_idx] = true; - nvinfer1::Weights weight = GetDLTensorAsWeights(execution_args_[var_node_idx], kDLGPU); - node_output_map_[node] = {TrtOpInput(weight, GetShape(node->checked_type()))}; -} - -void TensorRTBuilder::GetConstantAsWeights(const ConstantNode* node) { - auto weight = GetNdArrayAsWeights(node->data, kDLCPU); - auto shape_long = node->data.Shape(); - std::vector shape(shape_long.begin(), shape_long.end()); - node_output_map_[node] = {TrtOpInput(weight, shape)}; -} - -void TensorRTBuilder::GetInputAsTransposedWeights(const CallNode* transpose, const VarNode* node) { - GetInputAsWeights(node); - CHECK_EQ(node_output_map_[node].size(), 1); - const nvinfer1::Weights& original_weight = node_output_map_[node][0].weight; - const auto& shape = node_output_map_[node][0].weight_shape; - const float* original_values = static_cast(original_weight.values); - float* values = new float[original_weight.count]; - // Get order and new shape. - const auto* attrs = transpose->attrs.as(); - std::vector order(attrs->axes.size(), 0); - std::vector new_shape(attrs->axes.size(), 0); - for (size_t i = 0; i < attrs->axes.size(); ++i) { - const int axis = attrs->axes[i].as()->value; - order[i] = axis; - new_shape[i] = shape[axis]; - } - // Perform transpose. - if (order.size() == 4 && order[0] == 3 && order[1] == 2 && order[2] == 0 && order[3] == 1) { - const int output_strides[4] = {shape[1], 1, shape[0] * shape[1], - shape[0] * shape[1] * shape[2]}; - TransposeWeights4D(shape, output_strides, original_values, values); - } else if (order.size() == 4 && order[0] == 2 && order[1] == 3 && order[2] == 0 && - order[3] == 1) { - const int output_strides[4] = {shape[1], 1, shape[0] * shape[1] * shape[3], - shape[0] * shape[1]}; - TransposeWeights4D(shape, output_strides, original_values, values); - } else if (order.size() == 2 && order[0] == 1 && order[1] == 0) { - TransposeWeights2D(shape, original_values, values); - } else { - LOG(FATAL) << "Constant transpose " << DebugString(order) << " is not supported."; - } - // Map as output of transpose op. - nvinfer1::Weights transposed_weight{nvinfer1::DataType::kFLOAT, values, original_weight.count}; - trt_weights_.push_back(transposed_weight); - node_output_map_[transpose] = {TrtOpInput(transposed_weight, new_shape)}; -} - -void TensorRTBuilder::VisitExpr_(const TupleGetItemNode* op) { - if (const auto* tuple = op->tuple.as()) { - Expr item = tuple->fields[op->index]; - VisitExpr(item); - node_output_map_[op] = node_output_map_[item.operator->()]; - } else { - VisitExpr(op->tuple); - // Index into tensor outputs from expr. - node_output_map_[op] = {node_output_map_[op->tuple.operator->()][op->index]}; - } -} - -void TensorRTBuilder::VisitExpr_(const TupleNode* op) { - std::vector outputs; - for (auto item : op->fields) { - VisitExpr(item); - auto item_outputs = node_output_map_[item.operator->()]; - outputs.reserve(outputs.size() + item_outputs.size()); - outputs.insert(outputs.end(), item_outputs.begin(), item_outputs.end()); - } - node_output_map_[op] = outputs; -} - -nvinfer1::ITensor* TensorRTBuilder::AddInput(const std::string& tensor_name, const Type& type) { - auto shape = GetShape(type); - // Remove batch dim when not in explicit batch mode. 
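-  // In implicit batch mode, TRT dimensions exclude the batch axis; the batch size is
-  // supplied separately via setMaxBatchSize at build time and again at execute() time.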
- if (use_implicit_batch_ && shape.size() > 1) { - shape.erase(shape.begin()); - } - DLOG(INFO) << "Added TRT network input: " << tensor_name << " " << DebugString(shape); - nvinfer1::Dims dims = VectorToTrtDims(shape); - auto type_node = type.as(); - CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) - << "Only FP32 inputs are supported."; - return network_->addInput(tensor_name.c_str(), nvinfer1::DataType::kFLOAT, dims); -} - -void TensorRTBuilder::VisitExpr_(const VarNode* node) { - if (node->checked_type().as()) { - // Handle TupleTypes by creating multiple TRT inputs from one. - auto* tuple_type = node->type_as(); - std::vector outputs; - const std::string& original_name = node->name_hint(); - std::vector new_names; - for (int i = 0; i < tuple_type->fields.size(); ++i) { - std::string tensor_name = original_name + "_" + std::to_string(i); - new_names.push_back(tensor_name); - outputs.push_back(TrtOpInput(AddInput(tensor_name, tuple_type->fields[i]))); - } - node_output_map_[node] = outputs; - // Update network_input_map_ - const int original_index = network_input_map_[original_name]; - network_input_map_.erase(original_name); - // Push all other inputs back. - for (auto it : network_input_map_) { - if (it.second > original_index) { - network_input_map_[it.first] += new_names.size() - 1; - } - } - for (size_t i = 0; i < new_names.size(); ++i) { - network_input_map_[new_names[i]] = original_index + i; - } - // Update network_input_names_ - network_input_names_.erase(network_input_names_.begin() + original_index); - network_input_names_.insert(network_input_names_.begin() + original_index, new_names.begin(), - new_names.end()); - // Update network_input_is_baked_ - bool is_baked = network_input_is_baked_[original_index]; - network_input_is_baked_.erase(network_input_is_baked_.begin() + original_index); - network_input_is_baked_.insert(network_input_is_baked_.begin() + original_index, - new_names.size(), is_baked); - } else if (node->checked_type().as()) { - // Standard TensorType case. - const std::string& tensor_name = node->name_hint(); - node_output_map_[node] = {TrtOpInput(AddInput(tensor_name, node->checked_type()))}; - } else { - LOG(FATAL) << "VarNode must be Tensor or Tuple type."; - } -} - -void TensorRTBuilder::VisitExpr_(const ConstantNode* node) { - nvinfer1::Weights weight = GetNdArrayAsWeights(node->data, kDLCPU); - auto shape = node->data.Shape(); - // Remove batch dim when not in explicit batch mode. - if (use_implicit_batch_ && shape.size() > 1 && shape[0] == 1) { - shape.erase(shape.begin()); - } - nvinfer1::Dims dims = VectorToTrtDims(shape); - auto const_layer = network_->addConstant(dims, weight); - CHECK(const_layer != nullptr); - node_output_map_[node] = {TrtOpInput(const_layer->getOutput(0))}; -} - -void TensorRTBuilder::VisitExpr_(const CallNode* call) { - AddTrtLayerParams params(network_, call, &trt_weights_); - // Look up converter. - auto it = GetOpConverters()->find(params.op_name); - CHECK(it != GetOpConverters()->end()) - << "Unsupported operator conversion to TRT, op name: " << params.op_name; - const auto converter = it->second; - - // Ensure that nodes are processed in topological order by visiting their inputs first. - for (size_t i = 0; i < call->args.size(); ++i) { - if (converter->variable_input_count || converter->input_types[i] != kWeight) { - VisitExpr(call->args[i]); - continue; - } - // Handle special case where input must be constant array on CPU. 
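-    // The converter declared this input as kWeight, so it is resolved to constant data
-    // here rather than visited as a tensor.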
- if (auto* var = call->args[i].as()) { - GetInputAsWeights(var); - } else if (auto* node = call->args[i].as()) { - GetConstantAsWeights(node); - } else { - // Temporary workaround for transposed weights. Once partitioning is - // available, the transpose will be computed by tvm and the result will be - // a var input. Also not needed when params are bound to constants since - // FoldConstants will remove the transpose for us. - const CallNode* transpose = call->args[i].as(); - const VarNode* weights = nullptr; - if (transpose && transpose->op.as()->name == "transpose" && - (weights = transpose->args[0].as())) { - GetInputAsTransposedWeights(transpose, weights); - } else { - LOG(FATAL) << "TRT requires a constant input here."; - } - } - } - - // Get inputs. - for (size_t i = 0; i < call->args.size(); ++i) { - auto it = node_output_map_.find(call->args[i].operator->()); - CHECK(it != node_output_map_.end()) << "Input was not found."; - for (auto out : it->second) { - params.inputs.push_back(out); - } - } - if (!converter->variable_input_count) { - CHECK_EQ(converter->input_types.size(), params.inputs.size()) - << "Op expected a different number of inputs."; - } - - // Convert op to TRT. - converter->Convert(¶ms); - - // Get outputs. - node_output_map_[call] = {}; - std::vector outputs; - for (auto out : params.outputs) { - node_output_map_[call].push_back(TrtOpInput(out)); - } -} - -void TensorRTBuilder::CleanUp() { - network_->destroy(); -#if TRT_VERSION_GE(6, 0, 1) - config_->destroy(); -#endif - builder_->destroy(); - for (auto weight : trt_weights_) { - if (weight.type == nvinfer1::DataType::kFLOAT) { - delete[] static_cast(weight.values); - } else { - delete[] static_cast(weight.values); - } - } -} - -void TransposeWeights4D(const std::vector& original_shape, const int* output_strides, - const float* input_values, float* output_values) { - const int input_strides[4] = {original_shape[1] * original_shape[2] * original_shape[3], - original_shape[2] * original_shape[3], original_shape[3], 1}; - for (int i = 0; i < original_shape[0]; i++) { - for (int j = 0; j < original_shape[1]; j++) { - for (int k = 0; k < original_shape[2]; k++) { - for (int l = 0; l < original_shape[3]; l++) { - const int input_index = (i * input_strides[0]) + (j * input_strides[1]) + - (k * input_strides[2]) + (l * input_strides[3]); - const int output_index = (i * output_strides[0]) + (j * output_strides[1]) + - (k * output_strides[2]) + (l * output_strides[3]); - output_values[output_index] = input_values[input_index]; - } - } - } - } -} - -void TransposeWeights2D(const std::vector& original_shape, const float* input_values, - float* output_values) { - const int c = original_shape[0]; - const int k = original_shape[1]; - for (int i = 0; i < c; i++) { - for (int j = 0; j < k; j++) { - const int input_index = i * k + j; - const int output_index = j * c + i; - output_values[output_index] = input_values[input_index]; - } - } -} -} // namespace contrib -} // namespace relay -} // namespace tvm diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.h b/src/runtime/contrib/tensorrt/tensorrt_builder.h deleted file mode 100644 index e99873a781ce..000000000000 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.h +++ /dev/null @@ -1,238 +0,0 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! -* \file runtime/contrib/tensorrt/tensorrt_builder.h -* \brief Contains TensorRTBuilder class which can be used to convert a relay -* program into a TRT engine which can be used for inference. -*/ - -#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ -#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ - -#include -#include -#include - -#include -#include -#include -#include "NvInfer.h" - -#define TRT_VERSION_GE(major, minor, patch) \ - ((NV_TENSORRT_MAJOR > major) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ - (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ - NV_TENSORRT_PATCH >= patch)) - -#include "tensorrt_logger.h" - -namespace tvm { -namespace runtime { - -/*! - * \brief The product of TensorRTBuilder which provides everything needed to - * perform inference. - */ -struct TrtEngineAndContext { - nvinfer1::ICudaEngine* engine; - nvinfer1::IExecutionContext* context; - std::vector inputs; - std::vector input_is_baked; - std::vector outputs; - std::vector device_mem_buffers; -}; - -} // namespace runtime - -namespace relay { -namespace contrib { - -/*! - * \brief An input to a op may be either kTensor in the case of nvifner::ITensor - * or kWeight for nvinfer1::Weights. - */ -enum TrtInputType { - kTensor, - kWeight, -}; - -/*! - * \brief An input to a TrtOpConverter. The type of the input is either kTensor - * or kWeight. For kTensor, "tensor" contains the input tensor. For kWeight, - * "weight" contains the input weight and "weight_shape" contains the shape. - */ -struct TrtOpInput { - TrtInputType type; - nvinfer1::ITensor* tensor; - nvinfer1::Weights weight; - std::vector weight_shape; - - explicit TrtOpInput(nvinfer1::ITensor* tensor) - : tensor(tensor), type(kTensor) {} - TrtOpInput(nvinfer1::Weights weight, const std::vector& shape) - : weight(weight), type(kWeight), weight_shape(shape) {} -}; - -/*! - * \brief An ExprVisitor to convert a relay expression into a TensorRT engine - * and execution context. - */ -class TensorRTBuilder : public ExprVisitor { - public: - /*! - * \brief Create TensorRT builder. - * \param args Inputs to this execution. - */ - explicit TensorRTBuilder(runtime::TensorRTLogger* logger, const std::vector& args, - size_t max_workspace_size, bool use_implicit_batch_); - - void VisitExpr_(const VarNode* node) final; - - void VisitExpr_(const ConstantNode* node) final; - - void VisitExpr_(const TupleGetItemNode* op) final; - - void VisitExpr_(const TupleNode* op) final; - - void VisitExpr_(const CallNode* call) final; - - /*! - * \brief Convert Expr into TensorRT. - * \param expr The relay expression. - * \return TRT engine, context, and input/output information. - */ - runtime::TrtEngineAndContext BuildEngine(const Function& func); - - private: - /*! - * \brief Helper function fto convert NDArray to TRT Weights. - * \param array NDArray containing data. - * \param src_device Which device the data is expected to be on. 
- * \return Newly created weights - */ - nvinfer1::Weights GetNdArrayAsWeights(const runtime::NDArray& array, - DLDeviceType src_device); - - /*! - * \brief Helper function fto convert DLTensor to TRT Weights. - * \param dptr Pointer to DLTensor containing data. - * \param src_device Which device the data is expected to be on. - * \return Newly created weights - */ - nvinfer1::Weights GetDLTensorAsWeights(DLTensor* dptr, - DLDeviceType src_device); - - nvinfer1::ITensor* AddInput(const std::string& tensor_name, const Type& type); - - /*! \brief Gets value from execution args and converts to constant weight - * stored in node_output_map_ with node as the key. */ - void GetInputAsWeights(const VarNode* node); - - /*! \brief Gets value from ConstantNode data and converts to constant weight - * stored in node_output_map_ with node as the key. */ - void GetConstantAsWeights(const ConstantNode* node); - - /*! \brief Temporary workaround for transposed weights. */ - void GetInputAsTransposedWeights(const CallNode* transpose, - const VarNode* node); - - /*! \brief Deallocates weights and destroys network definition. */ - void CleanUp(); - - /*! \brief Initializes network_input_names_, network_input_map_ and - * network_input_is_baked_ based on function parameters. */ - void ProcessInputs(const Function& expr); - - /*! \brief Populates network_output_names_ from the final outputs of the - * processed expr. */ - void ProcessOutputs(const Expr& expr); - - /*! \brief Maps a node to its outputs. */ - std::unordered_map> node_output_map_; - - /*! \brief TensorRT builder. */ - nvinfer1::IBuilder* builder_; - -#if TRT_VERSION_GE(6, 0, 1) - /*! \brief TensorRT builder config. */ - nvinfer1::IBuilderConfig* config_; -#endif - - /*! \brief TensorRT network definition. */ - nvinfer1::INetworkDefinition* network_; - - /*! \brief List of all weights held in memory. */ - std::vector trt_weights_; - - /*! \brief Execution inputs from this invocation. */ - const std::vector& execution_args_; - - /*! \brief Batch size of inputs from this invocation. */ - int batch_size_; - - /*! \brief Max workspace size in bytes for TRT. */ - size_t max_workspace_size_; - - /*! \brief Whether to use implicit batch mode. */ - bool use_implicit_batch_; - - /*! \brief Input names in same order as execution args during runtime. Some of - * these are not actual input bindings in the TRT engine - use - * network_input_is_baked_ to find out which. */ - std::vector network_input_names_; - - /*! \brief Maps input name to execution args index. */ - std::unordered_map network_input_map_; - - /*! \brief True if the corresponding input is baked into the TensorRT engine - * and therefore should not be included in the input bindings during - * execution. */ - std::vector network_input_is_baked_; - - /*! \brief Output names in same order as execution args during runtime. */ - std::vector network_output_names_; -}; - -/*! - * \brief Helper function for GetInputAsTransposedWeights to transpose 4-D - * weights. - * \param original_shape Shape of weight before transpose. - * \param output_strides Multipliers for each index to compute output index in - * flat buffer. Must be of length 4. - * \param input_values The original weight values. - * \param output_values Buffer where transposed values will be placed. - */ -void TransposeWeights4D(const std::vector& original_shape, - const int* output_strides, const float* input_values, - float* output_values); - -/*! - * \brief Helper function for GetInputAsTransposedWeights to transpose CK to KC. 
- * \param original_shape Shape of weight before transpose. - * \param input_values The original weight values. - * \param output_values Buffer where transposed values will be placed. - */ -void TransposeWeights2D(const std::vector& original_shape, - const float* input_values, float* output_values); - -} // namespace contrib -} // namespace relay -} // namespace tvm - -#endif // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ diff --git a/src/runtime/contrib/tensorrt/tensorrt_logger.h b/src/runtime/contrib/tensorrt/tensorrt_logger.h deleted file mode 100644 index c606ffdb0b68..000000000000 --- a/src/runtime/contrib/tensorrt/tensorrt_logger.h +++ /dev/null @@ -1,73 +0,0 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file runtime/contrib/tensorrt/tensorrt_logger.h - * \brief Contains TensorRTLogger class which is required by TRT and used to - * print info, warnings, and errors. - */ - -#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ -#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ - -#include "NvInfer.h" - -namespace tvm { -namespace runtime { - -/*! \brief Logger for TensorRT info/warning/errors. */ -class TensorRTLogger : public nvinfer1::ILogger { - public: - TensorRTLogger() : TensorRTLogger(Severity::kWARNING) {} - explicit TensorRTLogger(Severity severity) : reportable_severity(severity) {} - void log(Severity severity, const char* msg) override { - // suppress messages with severity enum value greater than the reportable - if (severity > reportable_severity) return; - - switch (severity) { - case Severity::kINTERNAL_ERROR: - LOG(ERROR) << "INTERNAL_ERROR: " << msg; - break; - case Severity::kERROR: - LOG(ERROR) << "ERROR: " << msg; - break; - case Severity::kWARNING: - LOG(WARNING) << "WARNING: " << msg; - break; - case Severity::kINFO: - LOG(INFO) << "INFO: " << msg; - break; -#if TRT_VERSION_GE(5, 1, 5) - case Severity::kVERBOSE: - DLOG(INFO) << "VERBOSE: " << msg; - break; -#endif - default: - LOG(INFO) << "UNKNOWN: " << msg; - break; - } - } - - private: - Severity reportable_severity{Severity::kWARNING}; -}; - -} // namespace runtime -} // namespace tvm - -#endif // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ diff --git a/src/runtime/contrib/tensorrt/tensorrt_module.cc b/src/runtime/contrib/tensorrt/tensorrt_module.cc deleted file mode 100644 index d1a83736a504..000000000000 --- a/src/runtime/contrib/tensorrt/tensorrt_module.cc +++ /dev/null @@ -1,389 +0,0 @@ -/* * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file runtime/contrib/tensorrt/tensorrt_module.cc - * \brief TensorRTModule is the runtime module for tensorrt backend. - */ - -#include "tensorrt_module.h" - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "../../file_util.h" - -#ifdef TVM_GRAPH_RUNTIME_TENSORRT -#include "NvInfer.h" -#include "tensorrt_builder.h" -#endif // TVM_GRAPH_RUNTIME_TENSORRT - -namespace tvm { -namespace runtime { - -struct PairHash { - template - std::size_t operator()(const std::pair& pair) const { - return std::hash()(pair.first) ^ std::hash()(pair.second); - } -}; - -/*! \brief A module for TensorRT runtime. */ -class TensorRTModule : public runtime::ModuleNode { - public: - explicit TensorRTModule(const std::unordered_map& serialized_subgraphs) - : serialized_subgraphs_(serialized_subgraphs) { - max_workspace_size_ = dmlc::GetEnv("TVM_TENSORRT_MAX_WORKSPACE_SIZE", size_t(1) << 31); - use_implicit_batch_ = dmlc::GetEnv("TVM_TENSORRT_USE_IMPLICIT_BATCH", true); -#if TVM_GRAPH_RUNTIME_TENSORRT - GetCachedEnginesFromDisk(); -#endif - } - - ~TensorRTModule() { -#if TVM_GRAPH_RUNTIME_TENSORRT - for (auto& it : trt_engine_cache_) { - it.second.context->destroy(); - it.second.engine->destroy(); - } -#endif // TVM_GRAPH_RUNTIME_TENSORRT - } - - PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { - // Returning nullptr tells TVM that the function is not in this module, so - // it can look for the correct one. - auto it_subgraph = serialized_subgraphs_.find(name); - if (it_subgraph == serialized_subgraphs_.end()) { - return PackedFunc(nullptr); - } -#if TVM_GRAPH_RUNTIME_TENSORRT - // Generate an external packed function - return PackedFunc([this, name](tvm::TVMArgs args, tvm::TVMRetValue* rv) { - auto inputs = ConvertInputs(args); - const int batch_size = inputs[0]->shape[0]; - auto it = trt_engine_cache_.find(std::make_pair(name, batch_size)); - if (it == trt_engine_cache_.end()) { - // Build new trt engine and place in cache. - LOG(INFO) << "Building new TensorRT engine for subgraph " << name << " with batch size " - << batch_size; - auto func = Downcast(LoadJSON(this->serialized_subgraphs_[name])); - auto inputs = ConvertInputs(args); - std::string key = GetSubgraphKey(serialized_subgraphs_[name]); - relay::contrib::TensorRTBuilder builder(&logger_, inputs, max_workspace_size_, - use_implicit_batch_); - auto engine_and_context = builder.BuildEngine(func); - CacheEngineToDisk(key, engine_and_context); - LOG(INFO) << "Finished building TensorRT engine for subgraph " << name; - this->trt_engine_cache_[std::make_pair(name, batch_size)] = engine_and_context; - this->ExecuteEngine(&this->trt_engine_cache_[std::make_pair(name, batch_size)], args, rv); - } else { - this->ExecuteEngine(&it->second, args, rv); - } - }); -#else - LOG(FATAL) << "TVM was not built with TensorRT runtime enabled. 
Build " - << "with USE_TENSORRT=ON."; - return PackedFunc(); -#endif // TVM_GRAPH_RUNTIME_TENSORRT - } - - const char* type_key() const { return "tensorrt"; } - - void SaveToFile(const std::string& file_name, const std::string& format) final { - std::string fmt = runtime::GetFileFormat(file_name, format); - CHECK_EQ(fmt, type_key()) << "Can only save to format=" << type_key(); - SaveBinaryToFile(file_name, SerializeModuleToString()); - } - - void SaveToBinary(dmlc::Stream* stream) final { stream->Write(SerializeModuleToString()); } - - static Module LoadFromFile(const std::string& path) { - std::ifstream filep(path); - filep.seekg(0, std::ios::end); - size_t size = filep.tellg(); - std::string serialized_module(size, ' '); - filep.seekg(0); - filep.read(&serialized_module[0], size); - return CreateModuleFromString(serialized_module); - } - - static Module LoadFromBinary(void* strm) { - dmlc::Stream* stream = static_cast(strm); - std::string serialized_module; - stream->Read(&serialized_module); - return CreateModuleFromString(serialized_module); - } - - private: - /*! \brief Relay program serialized using SaveJSON */ - std::unordered_map serialized_subgraphs_; - - /*! \brief Max workspace size for TensorRT */ - size_t max_workspace_size_; - - /*! \brief Whether to use implicit batch mode. */ - bool use_implicit_batch_; - -#if TVM_GRAPH_RUNTIME_TENSORRT - /*! \brief Map of function name to TRT engine if built already. */ - std::unordered_map, TrtEngineAndContext, PairHash> trt_engine_cache_; - - /*! \brief TensorRT object used to log warnings and errors. */ - TensorRTLogger logger_; - - /*! - * \brief Convert TVMArgs to make compatible with VM or graph runtime. - * \param args Inputs to the PackedFunc. - * \return Inputs converted to vector of DLTensor* - */ - std::vector ConvertInputs(tvm::TVMArgs args) { - std::vector inputs(args.size(), nullptr); - for (size_t i = 0; i < args.size(); ++i) { - if (args[i].type_code() == kTVMNDArrayHandle) { - // Relay Debug/VM uses NDArray - runtime::NDArray array = args[i]; - inputs[i] = const_cast(array.operator->()); - } else if (args[i].type_code() == kTVMDLTensorHandle) { - // Graph runtime uses DLTensors - inputs[i] = args[i]; - } else { - LOG(FATAL) << "Invalid TVMArgs type."; - } - } - return inputs; - } - - /*! - * \brief Perform inference using TensorRT. - * \param engine_and_context TRT engine from TrtBuilder::BuildEngine() - * \param args Inputs to the PackedFunc. - * \param rv Return value pointer for the PackedFunc. - * \return Inputs converted to vector of DLTensor* - */ - void ExecuteEngine(TrtEngineAndContext* engine_and_context, tvm::TVMArgs args, - tvm::TVMRetValue* rv) { - auto engine = engine_and_context->engine; - auto context = engine_and_context->context; - auto& device_buffers = engine_and_context->device_mem_buffers; - const int num_bindings = engine->getNbBindings(); - std::vector bindings(num_bindings, nullptr); - // Set inputs. - auto inputs = ConvertInputs(args); - const size_t num_outputs = engine_and_context->outputs.size(); - CHECK_GT(inputs.size(), num_outputs); - for (size_t i = 0; i < engine_and_context->inputs.size(); ++i) { - // If an input was baked into the engine, skip. 
- if (engine_and_context->input_is_baked[i]) continue; - DLTensor* arg = inputs[i]; - int binding_index = engine->getBindingIndex(engine_and_context->inputs[i].c_str()); - CHECK_NE(binding_index, -1); - if (!runtime::TypeMatch(arg->dtype, kDLFloat, 32)) { - LOG(FATAL) << "Only float32 inputs are supported."; - } - if (inputs[i]->ctx.device_type == kDLGPU) { - bindings[binding_index] = reinterpret_cast(arg->data); - } else { - device_buffers[binding_index].CopyFrom(inputs[i]); - bindings[binding_index] = reinterpret_cast(device_buffers[binding_index]->data); - } -#if TRT_VERSION_GE(6, 0, 1) - // Set binding dimensions for INetworkV2 explicit batch mode engines. - if (!use_implicit_batch_) { - nvinfer1::Dims dims; - dims.d[0] = 1; - dims.nbDims = arg->ndim; - for (int i = 0; i < arg->ndim; ++i) { - dims.d[i] = arg->shape[i]; - } - context->setBindingDimensions(binding_index, dims); - } -#endif - } - // Set outputs. - for (size_t i = 0; i < num_outputs; ++i) { - const int index_in_inputs = inputs.size() - num_outputs + i; - DLTensor* out_arg = inputs[index_in_inputs]; - int binding_index = engine->getBindingIndex(engine_and_context->outputs[i].c_str()); - CHECK_NE(binding_index, -1); - if (out_arg->ctx.device_type == kDLGPU) { - bindings[binding_index] = reinterpret_cast(out_arg->data); - } else { - bindings[binding_index] = reinterpret_cast(device_buffers[binding_index]->data); - } - } -#if TRT_VERSION_GE(6, 0, 1) - if (use_implicit_batch_) { - // Use batch size from first input. - const int batch_size = inputs[0]->shape[0]; - CHECK(context->execute(batch_size, bindings.data())) << "Running TensorRT failed."; - } else { - CHECK(context->executeV2(bindings.data())) << "Running TensorRT failed."; - } - for (size_t i = 0; i < num_outputs; ++i) { - const int index_in_inputs = inputs.size() - num_outputs + i; - DLTensor* out_arg = inputs[index_in_inputs]; - int binding_index = engine->getBindingIndex(engine_and_context->outputs[i].c_str()); - CHECK_NE(binding_index, -1); - if (out_arg->ctx.device_type != kDLGPU) { - device_buffers[binding_index].CopyTo(out_arg); - } - } -#else - // Use batch size from first input. - const int batch_size = inputs[0]->shape[0]; - CHECK(context->execute(batch_size, bindings.data())) << "Running TensorRT failed."; -#endif - *rv = bindings[num_bindings - num_outputs]; - } - - std::string GetSubgraphKey(const std::string& serialized_subgraph) { - if (dmlc::GetEnv("TVM_TENSORRT_CACHE_DIR", std::string("")).empty()) return ""; - std::string key = std::to_string(std::hash()(serialized_subgraph)); - if (dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false)) { - key += "_fp16"; - } - return key; - } - - /*! \brief If TVM_TENSORRT_CACHE_DIR is set, will check that directory for - * already built TRT engines and load into trt_engine_cache_ so they don't - * have to be built at first inference. - */ - void GetCachedEnginesFromDisk() { - std::string cache_dir = dmlc::GetEnv("TVM_TENSORRT_CACHE_DIR", std::string("")); - if (cache_dir.empty()) return; - for (auto it : serialized_subgraphs_) { - std::string key = GetSubgraphKey(it.second); - std::string path = cache_dir + "/" + key + ".plan"; - // Check if engine is in the cache. 
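-      // A readable .plan file means an engine was previously serialized for this
-      // subgraph hash.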
- std::ifstream infile(path, std::ios::binary); - if (!infile.good()) continue; - LOG(INFO) << "Loading cached TensorRT engine from " << path; - infile.close(); - std::string serialized_engine; - LoadBinaryFromFile(path, &serialized_engine); - // Deserialize engine - nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger_); - TrtEngineAndContext engine_and_context; - engine_and_context.engine = - runtime->deserializeCudaEngine(&serialized_engine[0], serialized_engine.size(), nullptr); - engine_and_context.context = engine_and_context.engine->createExecutionContext(); - // Load metadata - std::string meta_path = cache_dir + "/" + key + ".meta"; - std::string serialized_meta; - LoadBinaryFromFile(meta_path, &serialized_meta); - std::istringstream is(serialized_meta); - dmlc::JSONReader reader(&is); - dmlc::JSONObjectReadHelper helper; - helper.DeclareField("inputs", &engine_and_context.inputs); - helper.DeclareField("input_is_baked", &engine_and_context.input_is_baked); - helper.DeclareField("outputs", &engine_and_context.outputs); - helper.ReadAllFields(&reader); - const int batch_size = 1; - trt_engine_cache_[std::make_pair(it.first, batch_size)] = engine_and_context; - } - } - - /*! \brief If TVM_TENSORRT_CACHE_DIR is set, will save the engine to that - * directory so it can be loaded later. A hash of the source relay function is - * used as the key for the file name. - * \param name Subgraph name - * \param engine_and_context Engine to cache - */ - void CacheEngineToDisk(const std::string& key, const TrtEngineAndContext& engine_and_context) { - std::string cache_dir = dmlc::GetEnv("TVM_TENSORRT_CACHE_DIR", std::string("")); - if (cache_dir.empty()) return; - std::string path = cache_dir + "/" + key + ".plan"; - LOG(INFO) << "Caching TensorRT engine to " << path; - // Serialize engine to disk - nvinfer1::IHostMemory* serialized_engine = engine_and_context.engine->serialize(); - SaveBinaryToFile(path, std::string(static_cast(serialized_engine->data()), - serialized_engine->size())); - serialized_engine->destroy(); - // Serialize metadata - std::ostringstream os; - dmlc::JSONWriter writer(&os); - writer.BeginObject(); - writer.WriteObjectKeyValue("inputs", engine_and_context.inputs); - writer.WriteObjectKeyValue("input_is_baked", engine_and_context.input_is_baked); - writer.WriteObjectKeyValue("outputs", engine_and_context.outputs); - writer.EndObject(); - std::string meta_path = cache_dir + "/" + key + ".meta"; - SaveBinaryToFile(meta_path, os.str()); - } -#endif // TVM_GRAPH_RUNTIME_TENSORRT - - /*! \brief Serialize this module to a string. To be used during codegen. */ - std::string SerializeModuleToString() { - std::ostringstream os; - dmlc::JSONWriter writer(&os); - writer.BeginObject(); - writer.WriteObjectKeyValue("subgraphs", serialized_subgraphs_); - writer.WriteObjectKeyValue("max_workspace_size", max_workspace_size_); - writer.WriteObjectKeyValue("use_implicit_batch", use_implicit_batch_); - writer.EndObject(); - return os.str(); - } - - /*! \brief Load serialized module from string created by SerializeModuleToString. 
*/
-  static Module CreateModuleFromString(const std::string& str) {
-    std::unordered_map<std::string, std::string> serialized_subgraphs;
-    size_t max_workspace_size = 0;
-    bool use_implicit_batch = true;
-    std::istringstream is(str);
-    dmlc::JSONReader reader(&is);
-    dmlc::JSONObjectReadHelper helper;
-    helper.DeclareField("subgraphs", &serialized_subgraphs);
-    helper.DeclareOptionalField("max_workspace_size", &max_workspace_size);
-    helper.DeclareOptionalField("use_implicit_batch", &use_implicit_batch);
-    helper.ReadAllFields(&reader);
-    auto n = make_object<TensorRTModule>(serialized_subgraphs);
-    // Use max_workspace_size from the artifact if it is set and is not overridden by the
-    // env var.
-    if (max_workspace_size != 0 && dmlc::GetEnv("TVM_TENSORRT_MAX_WORKSPACE_SIZE", 0) == 0) {
-      n->max_workspace_size_ = max_workspace_size;
-    }
-    n->use_implicit_batch_ = use_implicit_batch;
-    return Module(n);
-  }
-};
-
-Module TensorRTModuleCreate(
-    const std::unordered_map<std::string, std::string>& serialized_subgraphs) {
-  auto n = make_object<TensorRTModule>(serialized_subgraphs);
-  return Module(n);
-}
-
-TVM_REGISTER_GLOBAL("runtime.module.loadfile_tensorrt").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = TensorRTModule::LoadFromFile(args[0]);
-});
-
-TVM_REGISTER_GLOBAL("runtime.module.loadbinary_tensorrt")
-    .set_body_typed(TensorRTModule::LoadFromBinary);
-
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/runtime/contrib/tensorrt/tensorrt_module.h b/src/runtime/contrib/tensorrt/tensorrt_module.h
deleted file mode 100644
index 889930eb4f54..000000000000
--- a/src/runtime/contrib/tensorrt/tensorrt_module.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file runtime/contrib/tensorrt/tensorrt_ops.h - * \brief Converters from Relay ops into TensorRT layers. Converters should - * inherit from TrtOpConverter and implement the Convert() method. - */ - -#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_OPS_H_ -#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_OPS_H_ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "NvInfer.h" -#include "utils.h" - -#if TRT_VERSION_GE(6, 0, 1) -#define TRT_HAS_IMPLICIT_BATCH(params) (params->network->hasImplicitBatchDimension()) -#else -#define TRT_HAS_IMPLICIT_BATCH(params) (true) -#endif - -namespace tvm { -namespace relay { -namespace contrib { - -/*! \brief Parameters to convert an Op from relay to TensorRT. */ -struct AddTrtLayerParams { - /*! \brief The corresponding relay Call node. */ - const CallNode* call; - /*! \brief The TRT network that the new layer should be added to. */ - nvinfer1::INetworkDefinition* network; - /*! \brief The type of op. */ - std::string op_name; - /*! \brief Inputs to the op. */ - std::vector inputs; - /*! \brief Outputs of the op should be populated here during Convert(). */ - std::vector outputs; - /*! \brief Any newly allocated weights should be stored here also. */ - std::vector* trt_weights; - - AddTrtLayerParams(nvinfer1::INetworkDefinition* network, const CallNode* call, - std::vector* trt_weights) - : network(network), call(call), trt_weights(trt_weights) { - if (auto* op = call->op.as()) { - op_name = op->name; - } else if (call->op->IsInstance()) { - Function func = Downcast(call->op); - const auto name_node = func->GetAttr(attr::kComposite); - if (!name_node.defined() || name_node.value() == "") { - LOG(FATAL) << "Only composite functions can be converted."; - } - op_name = name_node.value(); - } else { - LOG(FATAL) << "Call must be Op or Function."; - } - } -}; - -/*! \brief Base class for an op converter from Relay to TRT. */ -class TrtOpConverter { - public: - /*! \brief Used to specify whether each input is tensor or weight. */ - const std::vector input_types; - /*! \brief If set to true, any number of tensor inputs can be used for the op. - */ - const bool variable_input_count; - - /*! - * \brief Converter subclasses should call this constructor to set - * input_types or variable_input_count. - * \param input_types For each input to the op, there should be a - * corresponding entry in input_types to determine whether that input should - * be a tensor or a weight. TrtBuilder will prepare inputs in - * AddTrtLayerParams according to this. - * \param variable_input_count If the op can have multiple inputs, set this to - * true. input_types vector will be ignored and any number of input tensors - * can be used for this op. All inputs will be tensors and not weights. 
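- * An op such as concatenate, which accepts an arbitrary number of tensor inputs,
- * would use this mode.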
- */ - explicit TrtOpConverter(const std::vector& input_types, - bool variable_input_count = false) - : input_types(input_types), variable_input_count(variable_input_count) {} - - /*! - * \brief Convert to TRT. Implementation should use inputs and attributes - * from the CallNode to add the corresponding TRT layers to network. Outputs - * should be pushed to outputs vector. - * \param params Parameters for this op. - */ - virtual void Convert(AddTrtLayerParams* params) const = 0; - - /*! - * \brief Helper function to reshape a tensor. - * \param params Parameters for this op. - * \param input Tensor to reshape. - * \param new_shape New shape, does not include batch dim. - * \return Reshaped tensor - */ - nvinfer1::ITensor* Reshape(AddTrtLayerParams* params, nvinfer1::ITensor* input, - const std::vector& new_shape) const { - auto layer = params->network->addShuffle(*input); - CHECK(layer != nullptr); - layer->setReshapeDimensions(VectorToTrtDims(new_shape)); - return layer->getOutput(0); - } - - /*! - * \brief Helper function to transpose a tensor. - * \param params Parameters for this op. - * \param input Tensor to transpose. - * \param order New order of axes, does include batch dim. - * \return Transposed tensor - */ - nvinfer1::ITensor* Transpose(AddTrtLayerParams* params, nvinfer1::ITensor* input, - const std::vector& order) const { - auto layer = params->network->addShuffle(*input); - CHECK(layer != nullptr); - nvinfer1::Permutation perm; - if (TRT_HAS_IMPLICIT_BATCH(params)) { - // Batch dimension cannot be modified. - CHECK_EQ(input->getDimensions().nbDims, order.size() - 1); - CHECK_EQ(order[0], 0); - for (int i = 0; i < order.size(); ++i) { - perm.order[i] = order[i + 1] - 1; - } - } else { - CHECK_EQ(input->getDimensions().nbDims, order.size()); - for (int i = 0; i < order.size(); ++i) { - perm.order[i] = order[i]; - } - } - layer->setFirstTranspose(perm); - return layer->getOutput(0); - } - - /*! - * \brief Helper function to convert an axis to TRT format. - * \param axis Axis from TVM. - * \param input_rank Rank of input, does not include batch dim. - * \return Axis in TRT format. - */ - int ConvertAxis(AddTrtLayerParams* params, int axis, int input_rank) const { - // Add 1 for missing batch dim. - if (TRT_HAS_IMPLICIT_BATCH(params)) { - input_rank += 1; - } - CHECK(axis >= -input_rank && axis < input_rank); - if (axis < 0) axis += input_rank; - if (TRT_HAS_IMPLICIT_BATCH(params)) { - // Can't modify batch dimenson. - CHECK_NE(axis, 0); - // Subtract 1 for implicit batch dim. - axis -= 1; - } - return axis; - } - - /*! - * \brief Create constant that is broadcastable. - * \param params Parameters for this op. - * \param value Value of scalar. - * \param broadcast_to_dims Dims that scalar should be broadcastable against. - * \return Constant tensor. - */ - nvinfer1::ITensor* CreateScalar(AddTrtLayerParams* params, float value, - const nvinfer1::Dims& broadcast_to_dims) const { - nvinfer1::Dims dims; - dims.nbDims = broadcast_to_dims.nbDims; - std::fill_n(dims.d, dims.nbDims, 1); - float* values = new float[1]; - values[0] = value; - nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, static_cast(values), 1}; - params->trt_weights->push_back(weights); - return params->network->addConstant(dims, weights)->getOutput(0); - } - - /*! - * \brief Get pre/post padding values from padding attributes array. - * \param padding Padding from relay op attributes. - * \param padding_is_asymmetric True if both pre and post are needed for asymmetric padding. 
- * \param prepadding Prepadding value or symmetric padding values if !padding_is_asymmetric. - * \param postpadding Postpadding value if padding_is_asymmetric. - */ - void GetPadding(const Array& padding, bool* use_asymmetric_padding, - nvinfer1::DimsHW* prepadding, nvinfer1::DimsHW* postpadding) const { - CHECK(padding.size() == 1 || padding.size() == 2 || padding.size() == 4); - if (padding.size() == 4) { - // four int : padding width in the order of (top, left, bottom, right). - *prepadding = - nvinfer1::DimsHW(padding[0].as()->value, padding[1].as()->value); - *postpadding = - nvinfer1::DimsHW(padding[2].as()->value, padding[3].as()->value); - *use_asymmetric_padding = true; - } else if (padding.size() == 2) { - // two int : bottom, right will use same padding as top, left - *prepadding = - nvinfer1::DimsHW(padding[0].as()->value, padding[1].as()->value); - *postpadding = *prepadding; - *use_asymmetric_padding = false; - } else { - // one int : same padding used on all sides - *prepadding = - nvinfer1::DimsHW(padding[0].as()->value, padding[0].as()->value); - *postpadding = *prepadding; - *use_asymmetric_padding = false; - } - } - - /*! - * \brief Get pre/post padding values from padding attributes array for volumetric ops. - * \param padding Padding from relay op attributes. - * \param padding_is_asymmetric True if both pre and post are needed for asymmetric padding. - * \param prepadding Prepadding value or symmetric padding values if !padding_is_asymmetric. - * \param postpadding Postpadding value if padding_is_asymmetric. - */ - void GetPadding3D(const Array& padding, bool* use_asymmetric_padding, - nvinfer1::Dims* prepadding, nvinfer1::Dims* postpadding) const { - CHECK(padding.size() == 1 || padding.size() == 3 || padding.size() == 6); - if (padding.size() == 6) { - // six int : padding width in the order of (front, top, left, back, bottom, right) - *prepadding = - nvinfer1::Dims3(padding[0].as()->value, padding[1].as()->value, - padding[2].as()->value); - *postpadding = - nvinfer1::Dims3(padding[3].as()->value, padding[4].as()->value, - padding[5].as()->value); - *use_asymmetric_padding = true; - } else if (padding.size() == 3) { - // three int : back, bottom, right will use same padding as front, top, left - *prepadding = - nvinfer1::Dims3(padding[0].as()->value, padding[1].as()->value, - padding[2].as()->value); - *postpadding = *prepadding; - *use_asymmetric_padding = false; - } else { - // one int : same padding used on all sides - *prepadding = - nvinfer1::Dims3(padding[0].as()->value, padding[0].as()->value, - padding[0].as()->value); - *postpadding = *prepadding; - *use_asymmetric_padding = false; - } - } -}; - -class ActivationOpConverter : public TrtOpConverter { - public: - ActivationOpConverter() : TrtOpConverter({kTensor}) {} - - void Convert(AddTrtLayerParams* params) const { - CHECK_EQ(params->inputs.size(), 1) << "Activation op expects 1 input."; - static const std::unordered_map op_map = { - {"nn.relu", nvinfer1::ActivationType::kRELU}, - {"sigmoid", nvinfer1::ActivationType::kSIGMOID}, - {"tanh", nvinfer1::ActivationType::kTANH}, -#if TRT_VERSION_GE(5, 1, 5) - {"clip", nvinfer1::ActivationType::kCLIP}, - {"nn.leaky_relu", nvinfer1::ActivationType::kLEAKY_RELU}, -#endif - }; - auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported activation type " << params->op_name; - nvinfer1::IActivationLayer* act_layer = - params->network->addActivation(*params->inputs.at(0).tensor, it->second); -#if TRT_VERSION_GE(5, 1, 5) - if (params->op_name 
== "clip") { - const auto* clip_attr = params->call->attrs.as(); - act_layer->setAlpha(clip_attr->a_min); - act_layer->setBeta(clip_attr->a_max); - } else if (params->op_name == "nn.leaky_relu") { - const auto* leaky_relu_attr = params->call->attrs.as(); - act_layer->setAlpha(leaky_relu_attr->alpha); - } -#endif - CHECK(act_layer != nullptr); - params->outputs.push_back(act_layer->getOutput(0)); - } -}; - -class ClipLegacyOpConverter : public TrtOpConverter { - public: - ClipLegacyOpConverter() : TrtOpConverter({kTensor}) {} - - void Convert(AddTrtLayerParams* params) const { - const auto* attrs = params->call->attrs.as(); - CHECK_EQ(params->inputs.size(), 1) << "Activation op expects 1 input."; - auto input = params->inputs.at(0).tensor; - // relu(x) - nvinfer1::ITensor* output = nullptr; - if (attrs->a_min == 0.0f) { - // Use relu instead of max(x, 0) because relu can be fused. - nvinfer1::IActivationLayer* relu_layer = - params->network->addActivation(*input, nvinfer1::ActivationType::kRELU); - CHECK(relu_layer != nullptr); - output = relu_layer->getOutput(0); - } else { - // max(x, a_min) - nvinfer1::ITensor* a_min = CreateScalar(params, attrs->a_min, input->getDimensions()); - nvinfer1::IElementWiseLayer* max_layer = - params->network->addElementWise(*input, *a_min, nvinfer1::ElementWiseOperation::kMAX); - CHECK(max_layer != nullptr); - output = max_layer->getOutput(0); - } - // min(relu(x), a_max) - nvinfer1::ITensor* a_max = CreateScalar(params, attrs->a_max, input->getDimensions()); - nvinfer1::IElementWiseLayer* min_layer = - params->network->addElementWise(*output, *a_max, nvinfer1::ElementWiseOperation::kMIN); - params->outputs.push_back(min_layer->getOutput(0)); - } -}; - -class ElementWiseBinaryOpConverter : public TrtOpConverter { - public: - ElementWiseBinaryOpConverter() : TrtOpConverter({kTensor, kTensor}) {} - - void Convert(AddTrtLayerParams* params) const { - static const std::unordered_map op_map = { - {"add", nvinfer1::ElementWiseOperation::kSUM}, - {"subtract", nvinfer1::ElementWiseOperation::kSUB}, - {"multiply", nvinfer1::ElementWiseOperation::kPROD}, - {"divide", nvinfer1::ElementWiseOperation::kDIV}, - {"power", nvinfer1::ElementWiseOperation::kPOW}, - {"maximum", nvinfer1::ElementWiseOperation::kMAX}, - {"minimum", nvinfer1::ElementWiseOperation::kMIN}}; - auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported elementwise type " << params->op_name; - // Broadcast - auto input0 = params->inputs.at(0).tensor; - auto input0_dims = TrtDimsToVector(input0->getDimensions()); - auto input1 = params->inputs.at(1).tensor; - auto input1_dims = TrtDimsToVector(input1->getDimensions()); - const bool need_broadcast = input0_dims.size() != input1_dims.size(); - if (need_broadcast) { - if (input0_dims.size() < input1_dims.size()) { - std::vector new_shape(input0_dims); - while (new_shape.size() < input1_dims.size()) new_shape.insert(new_shape.begin(), 1); - input0 = Reshape(params, input0, new_shape); - } else if (input1_dims.size() < input0_dims.size()) { - std::vector new_shape(input1_dims); - while (new_shape.size() < input0_dims.size()) new_shape.insert(new_shape.begin(), 1); - input1 = Reshape(params, input1, new_shape); - } - } - - nvinfer1::IElementWiseLayer* elemwise_layer = - params->network->addElementWise(*input0, *input1, it->second); - CHECK(elemwise_layer != nullptr); - params->outputs.push_back(elemwise_layer->getOutput(0)); - } -}; - -class Conv2DOpConverter : public TrtOpConverter { - public: - Conv2DOpConverter() : 
TrtOpConverter({kTensor, kWeight}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input_tensor = params->inputs.at(0).tensor;
-    auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
-    auto weight_shape = params->inputs.at(1).weight_shape;
-    const auto* conv2d_attr = params->call->attrs.as<Conv2DAttrs>();
-    CHECK_EQ(conv2d_attr->data_layout, "NCHW");
-    CHECK(conv2d_attr->out_layout == "" || conv2d_attr->out_layout == "NCHW");
-    CHECK_EQ(conv2d_attr->kernel_layout, "OIHW");
-
-    // TRT's conv2d op doesn't support asymmetric padding before 5.1, so we
-    // work around it by adding a padding layer before the convolution op.
-    nvinfer1::DimsHW prepadding, postpadding;
-    bool use_asymmetric_padding;
-    GetPadding(conv2d_attr->padding, &use_asymmetric_padding, &prepadding, &postpadding);
-#if !TRT_VERSION_GE(5, 1, 5)
-    if (use_asymmetric_padding) {
-      auto pad_layer = params->network->addPadding(*input_tensor, prepadding, postpadding);
-      CHECK(pad_layer != nullptr);
-      input_tensor = pad_layer->getOutput(0);
-      // No need for conv op to do any padding.
-      use_asymmetric_padding = false;
-      prepadding = nvinfer1::DimsHW(0, 0);
-    }
-#endif
-
-    // Could use conv2d_attr->channels.as<IntImmNode>()->value
-    const int num_outputs = weight_shape[0];
-    const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    auto conv_layer = params->network->addConvolution(*input_tensor, num_outputs, kernel_size,
-                                                      params->inputs.at(1).weight, bias);
-    CHECK(conv_layer != nullptr);
-    if (use_asymmetric_padding) {
-#if TRT_VERSION_GE(5, 1, 5)
-      conv_layer->setPrePadding(prepadding);
-      conv_layer->setPostPadding(postpadding);
-#endif
-    } else {
-      conv_layer->setPadding(prepadding);
-    }
-    CHECK_EQ(conv2d_attr->strides.size(), 2);
-    const auto strides = nvinfer1::DimsHW(conv2d_attr->strides[0].as<IntImmNode>()->value,
-                                          conv2d_attr->strides[1].as<IntImmNode>()->value);
-    conv_layer->setStride(strides);
-    CHECK_EQ(conv2d_attr->dilation.size(), 2);
-    const auto dilation = nvinfer1::DimsHW(conv2d_attr->dilation[0].as<IntImmNode>()->value,
-                                           conv2d_attr->dilation[1].as<IntImmNode>()->value);
-    conv_layer->setDilation(dilation);
-    conv_layer->setNbGroups(conv2d_attr->groups);
-    params->outputs.push_back(conv_layer->getOutput(0));
-  }
-};
-
-#if TRT_VERSION_GE(6, 0, 1)
-class Conv3DOpConverter : public TrtOpConverter {
- public:
-  Conv3DOpConverter() : TrtOpConverter({kTensor, kWeight}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input_tensor = params->inputs.at(0).tensor;
-    auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
-    auto weight_shape = params->inputs.at(1).weight_shape;
-    const auto* attrs = params->call->attrs.as<Conv3DAttrs>();
-    CHECK_EQ(attrs->data_layout, "NCDHW");
-    CHECK(attrs->out_layout == "" || attrs->out_layout == "NCDHW");
-    CHECK_EQ(attrs->kernel_layout, "OIDHW");
-
-    nvinfer1::Dims prepadding, postpadding;
-    bool use_asymmetric_padding;
-    GetPadding3D(attrs->padding, &use_asymmetric_padding, &prepadding, &postpadding);
-
-    // Could use attrs->channels.as<IntImmNode>()->value
-    const int num_outputs = weight_shape[0];
-    const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size,
-                                                        params->inputs.at(1).weight, bias);
-    CHECK(conv_layer != nullptr);
-    if (use_asymmetric_padding) {
-      conv_layer->setPrePadding(prepadding);
-      conv_layer->setPostPadding(postpadding);
-    } else {
conv_layer->setPaddingNd(prepadding); - } - CHECK_EQ(attrs->strides.size(), 3); - const auto strides = nvinfer1::Dims3(attrs->strides[0].as()->value, - attrs->strides[1].as()->value, - attrs->strides[2].as()->value); - conv_layer->setStrideNd(strides); - CHECK_EQ(attrs->dilation.size(), 3); - const auto dilation = nvinfer1::Dims3(attrs->dilation[0].as()->value, - attrs->dilation[1].as()->value, - attrs->dilation[2].as()->value); - conv_layer->setDilationNd(dilation); - conv_layer->setNbGroups(attrs->groups); - params->outputs.push_back(conv_layer->getOutput(0)); - } -}; -#endif // TRT_VERSION_GE(6, 0, 1) - -// Using FullyConnected -class DenseOpConverter : public TrtOpConverter { - public: - DenseOpConverter() : TrtOpConverter({kTensor, kWeight}) {} - - void Convert(AddTrtLayerParams* params) const { - auto input_tensor = params->inputs.at(0).tensor; - auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); - CHECK(input_dims.size() > 0 && input_dims.size() <= 3); - const int required_rank = TRT_HAS_IMPLICIT_BATCH(params) ? 3 : 4; - const bool need_reshape_on_input = input_dims.size() != required_rank; - if (need_reshape_on_input) { - // Add dims of size 1 until rank is required_rank. - std::vector new_shape(input_dims); - while (new_shape.size() < required_rank) new_shape.insert(new_shape.end(), 1); - input_tensor = Reshape(params, input_tensor, new_shape); - } - // Weights are in KC format. - CHECK_EQ(params->inputs.at(1).weight_shape.size(), 2); - const int num_units = params->inputs.at(1).weight_shape[0]; - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; - nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected( - *input_tensor, num_units, params->inputs.at(1).weight, bias); - CHECK(fc_layer != nullptr); - auto output_tensor = fc_layer->getOutput(0); - if (need_reshape_on_input) { - // Remove added dims. 
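-      // The output keeps the original rank, with the last axis replaced by num_units.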
- input_dims[input_dims.size() - 1] = num_units; - output_tensor = Reshape(params, output_tensor, input_dims); - } - params->outputs.push_back(output_tensor); - } -}; - -class BatchNormOpConverter : public TrtOpConverter { - public: - BatchNormOpConverter() : TrtOpConverter({kTensor, kWeight, kWeight, kWeight, kWeight}) {} - - void Convert(AddTrtLayerParams* params) const { - auto input = params->inputs.at(0).tensor; - auto gamma = params->inputs.at(1).weight; - auto beta = params->inputs.at(2).weight; - auto mean = params->inputs.at(3).weight; - auto var = params->inputs.at(4).weight; - const auto* bn_attr = params->call->attrs.as(); - CHECK_EQ(gamma.count, beta.count); - CHECK_EQ(gamma.count, mean.count); - CHECK_EQ(gamma.count, var.count); - CHECK(bn_attr->axis == 1 || bn_attr->axis == 3); - const bool need_transpose = bn_attr->axis == 3; - - void* weight_scale_ptr = new float[gamma.count]; - nvinfer1::Weights weight_scale{nvinfer1::DataType::kFLOAT, weight_scale_ptr, gamma.count}; - params->trt_weights->push_back(weight_scale); - void* weight_shift_ptr = new float[gamma.count]; - nvinfer1::Weights weight_shift{nvinfer1::DataType::kFLOAT, weight_shift_ptr, gamma.count}; - params->trt_weights->push_back(weight_shift); - nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; - - // fill in the content of weights for the Scale layer - const float* gamma_ptr = reinterpret_cast(gamma.values); - const float* beta_ptr = reinterpret_cast(beta.values); - const float* mean_ptr = reinterpret_cast(mean.values); - const float* var_ptr = reinterpret_cast(var.values); - float* scale_ptr = reinterpret_cast(weight_scale_ptr); - float* shift_ptr = reinterpret_cast(weight_shift_ptr); - for (int i = 0; i < gamma.count; ++i) { - scale_ptr[i] = 1.0 / std::sqrt(var_ptr[i] + bn_attr->epsilon); - if (bn_attr->scale) { - scale_ptr[i] *= gamma_ptr[i]; - } - shift_ptr[i] = -mean_ptr[i] * scale_ptr[i]; - if (bn_attr->center) { - shift_ptr[i] += beta_ptr[i]; - } - } - if (need_transpose) { - input = Transpose(params, input, {0, 3, 1, 2}); - } - nvinfer1::IScaleLayer* scale_layer = params->network->addScale( - *input, nvinfer1::ScaleMode::kCHANNEL, weight_shift, weight_scale, power); - CHECK(scale_layer != nullptr); - auto output = scale_layer->getOutput(0); - if (need_transpose) { - output = Transpose(params, output, {0, 2, 3, 1}); - } - params->outputs.push_back(output); - } -}; - -class BatchFlattenOpConverter : public TrtOpConverter { - public: - BatchFlattenOpConverter() : TrtOpConverter({kTensor}) {} - - void Convert(AddTrtLayerParams* params) const { - std::vector new_shape{-1}; - if (!TRT_HAS_IMPLICIT_BATCH(params)) { - new_shape.insert(new_shape.begin(), params->inputs.at(0).tensor->getDimensions().d[0]); - } - params->outputs.push_back(Reshape(params, params->inputs.at(0).tensor, new_shape)); - } -}; - -class SoftmaxOpConverter : public TrtOpConverter { - public: - SoftmaxOpConverter() : TrtOpConverter({kTensor}) {} - - void Convert(AddTrtLayerParams* params) const { - auto input = params->inputs.at(0).tensor; - const int input_rank = input->getDimensions().nbDims; - const auto* softmax_attr = params->call->attrs.as(); - const int axis = ConvertAxis(params, softmax_attr->axis, input_rank); - nvinfer1::ISoftMaxLayer* softmax_layer = params->network->addSoftMax(*input); - softmax_layer->setAxes(1 << axis); - CHECK(softmax_layer != nullptr); - params->outputs.push_back(softmax_layer->getOutput(0)); - } -}; - -class PoolingOpConverter : public TrtOpConverter { - public: - PoolingOpConverter() : 
-
-  // Get attributes from MaxPool2DAttrs or AvgPool2DAttrs. If
-  // use_asymmetric_padding is false, symmetric padding values will be returned
-  // in prepadding only.
-  template <typename PoolAttrs>
-  void GetPoolAttrs(const PoolAttrs* attrs, nvinfer1::DimsHW* prepadding,
-                    nvinfer1::DimsHW* postpadding, nvinfer1::DimsHW* window_size,
-                    nvinfer1::DimsHW* strides, bool* ceil_mode,
-                    bool* use_asymmetric_padding) const {
-    CHECK_EQ(attrs->layout, "NCHW");
-    GetPadding(attrs->padding, use_asymmetric_padding, prepadding, postpadding);
-    *window_size = nvinfer1::DimsHW(attrs->pool_size[0].template as<IntImmNode>()->value,
-                                    attrs->pool_size[1].template as<IntImmNode>()->value);
-    *strides = nvinfer1::DimsHW(attrs->strides[0].template as<IntImmNode>()->value,
-                                attrs->strides[1].template as<IntImmNode>()->value);
-    *ceil_mode = attrs->ceil_mode;
-  }
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    static const std::unordered_map<std::string, nvinfer1::PoolingType> op_map = {
-        {"nn.max_pool2d", nvinfer1::PoolingType::kMAX},
-        {"nn.avg_pool2d", nvinfer1::PoolingType::kAVERAGE}};
-    auto it = op_map.find(params->op_name);
-    CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT";
-
-    nvinfer1::DimsHW prepadding, postpadding, window_size, strides;
-    bool use_asymmetric_padding = false, ceil_mode = false, count_include_pad = true;
-    if (params->op_name == "nn.max_pool2d") {
-      const auto* attrs = params->call->attrs.as<MaxPool2DAttrs>();
-      GetPoolAttrs(attrs, &prepadding, &postpadding, &window_size, &strides,
-                   &ceil_mode, &use_asymmetric_padding);
-    } else if (params->op_name == "nn.avg_pool2d") {
-      const auto* attrs = params->call->attrs.as<AvgPool2DAttrs>();
-      count_include_pad = attrs->count_include_pad;
-      GetPoolAttrs(attrs, &prepadding, &postpadding, &window_size, &strides,
-                   &ceil_mode, &use_asymmetric_padding);
-    }
-
-// TRT pooling op doesn't support asymmetric padding before 5.1, so we
-// work around it by adding a padding layer before the pooling op.
-#if !TRT_VERSION_GE(5, 1, 5)
-    if (use_asymmetric_padding) {
-      auto pad_layer = params->network->addPadding(*input, prepadding, postpadding);
-      CHECK(pad_layer != nullptr);
-      input = pad_layer->getOutput(0);
-      // No need for pooling op to do any padding.
-      use_asymmetric_padding = false;
-      prepadding = nvinfer1::DimsHW(0, 0);
-    }
-#endif
-
-    auto pool_layer = params->network->addPooling(*input, it->second, window_size);
-    CHECK(pool_layer != nullptr);
-    pool_layer->setStride(strides);
-    if (use_asymmetric_padding) {
-#if TRT_VERSION_GE(5, 1, 5)
-      pool_layer->setPrePadding(prepadding);
-      pool_layer->setPostPadding(postpadding);
-#endif
-    } else {
-      pool_layer->setPadding(prepadding);
-    }
-    if (params->op_name == "nn.avg_pool2d") {
-      // count_include_pad=True has no effect when there is no padding. TRT
-      // rejects count_include_pad in combination with strides or asymmetric
-      // padding, so turn off inclusive counting to avoid the error message.
-      // Note: padding will always be symmetric with count_include_pad, since
-      // the partitioner prevents the unsupported case.
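-      // (Illustrative, not from the original: with pool_size=(3, 3) and
-      // padding=(1, 1), inclusive counting divides a corner window by 9 even
-      // though only 4 input elements are covered. The two modes agree only
-      // when the padding is zero, which is the case handled below.)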
-      if (prepadding.h() == 0 && prepadding.w() == 0) {
-        count_include_pad = false;
-      }
-      pool_layer->setAverageCountExcludesPadding(!count_include_pad);
-    }
-#if TRT_VERSION_GE(5, 1, 5)
-    if (ceil_mode) {
-      pool_layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP);
-    }
-#else
-    CHECK(!ceil_mode);
-#endif
-    params->outputs.push_back(pool_layer->getOutput(0));
-  }
-};
-
-#if TRT_VERSION_GE(6, 0, 1)
-class Pooling3DOpConverter : public TrtOpConverter {
- public:
-  Pooling3DOpConverter() : TrtOpConverter({kTensor}) {}
-
-  // Get attributes from MaxPool3DAttrs or AvgPool3DAttrs. If
-  // use_asymmetric_padding is false, symmetric padding values will be returned
-  // in prepadding only.
-  template <typename PoolAttrs>
-  void GetPoolAttrs(const PoolAttrs* attrs, nvinfer1::Dims* prepadding, nvinfer1::Dims* postpadding,
-                    nvinfer1::Dims* window_size, nvinfer1::Dims* strides, bool* ceil_mode,
-                    bool* use_asymmetric_padding) const {
-    CHECK_EQ(attrs->layout, "NCDHW");
-    GetPadding3D(attrs->padding, use_asymmetric_padding, prepadding, postpadding);
-    *window_size = nvinfer1::Dims3(attrs->pool_size[0].template as<IntImmNode>()->value,
-                                   attrs->pool_size[1].template as<IntImmNode>()->value,
-                                   attrs->pool_size[2].template as<IntImmNode>()->value);
-    *strides = nvinfer1::Dims3(attrs->strides[0].template as<IntImmNode>()->value,
-                               attrs->strides[1].template as<IntImmNode>()->value,
-                               attrs->strides[2].template as<IntImmNode>()->value);
-    *ceil_mode = attrs->ceil_mode;
-  }
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    static const std::unordered_map<std::string, nvinfer1::PoolingType> op_map = {
-        {"nn.max_pool3d", nvinfer1::PoolingType::kMAX},
-        {"nn.avg_pool3d", nvinfer1::PoolingType::kAVERAGE}};
-    auto it = op_map.find(params->op_name);
-    CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT";
-
-    nvinfer1::Dims prepadding, postpadding, window_size, strides;
-    bool use_asymmetric_padding = false, ceil_mode = false, count_include_pad = true;
-    if (params->op_name == "nn.max_pool3d") {
-      const auto* attrs = params->call->attrs.as<MaxPool3DAttrs>();
-      GetPoolAttrs(attrs, &prepadding, &postpadding, &window_size, &strides,
-                   &ceil_mode, &use_asymmetric_padding);
-    } else if (params->op_name == "nn.avg_pool3d") {
-      const auto* attrs = params->call->attrs.as<AvgPool3DAttrs>();
-      count_include_pad = attrs->count_include_pad;
-      GetPoolAttrs(attrs, &prepadding, &postpadding, &window_size, &strides,
-                   &ceil_mode, &use_asymmetric_padding);
-    }
-    auto pool_layer = params->network->addPoolingNd(*input, it->second, window_size);
-    CHECK(pool_layer != nullptr);
-    pool_layer->setStrideNd(strides);
-    if (use_asymmetric_padding) {
-      pool_layer->setPrePadding(prepadding);
-      pool_layer->setPostPadding(postpadding);
-    } else {
-      pool_layer->setPaddingNd(prepadding);
-    }
-    if (params->op_name == "nn.avg_pool3d") {
-      // count_include_pad=True has no effect when there is no padding. TRT
-      // rejects count_include_pad in combination with strides or asymmetric
-      // padding, so turn off inclusive counting to avoid the error message.
-      // Note: padding will always be symmetric with count_include_pad, since
-      // the partitioner prevents the unsupported case.
-      pool_layer->setAverageCountExcludesPadding(!count_include_pad);
-    }
-    if (ceil_mode) {
-      pool_layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP);
-    }
-    params->outputs.push_back(pool_layer->getOutput(0));
-  }
-};
-#endif  // TRT_VERSION_GE(6, 0, 1)
-
-class GlobalPoolingOpConverter : public TrtOpConverter {
- public:
-  GlobalPoolingOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input_tensor = params->inputs.at(0).tensor;
-    auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
-    static const std::unordered_map<std::string, nvinfer1::PoolingType> op_map = {
-        {"nn.global_max_pool2d", nvinfer1::PoolingType::kMAX},
-        {"nn.global_avg_pool2d", nvinfer1::PoolingType::kAVERAGE}};
-    auto it = op_map.find(params->op_name);
-    CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT";
-    const auto* pool_attr = params->call->attrs.as<GlobalPool2DAttrs>();
-    CHECK_EQ(pool_attr->layout, "NCHW");
-    const int h = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[1] : input_dims[2];
-    const int w = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[2] : input_dims[3];
-    auto pool_layer =
-        params->network->addPooling(*input_tensor, it->second, nvinfer1::DimsHW(h, w));
-    CHECK(pool_layer != nullptr);
-    params->outputs.push_back(pool_layer->getOutput(0));
-  }
-};
-
-class ExpandDimsOpConverter : public TrtOpConverter {
- public:
-  ExpandDimsOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input_tensor = params->inputs.at(0).tensor;
-    auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
-    const auto* attrs = params->call->attrs.as<ExpandDimsAttrs>();
-    const int axis = ConvertAxis(params, attrs->axis, input_dims.size() + 1);
-    for (int i = 0; i < attrs->num_newaxis; ++i) {
-      input_dims.insert(input_dims.begin() + axis, 1);
-    }
-    params->outputs.push_back(Reshape(params, params->inputs.at(0).tensor, input_dims));
-  }
-};
-
-class SqueezeOpConverter : public TrtOpConverter {
- public:
-  SqueezeOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input_tensor = params->inputs.at(0).tensor;
-    auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
-    const auto* attrs = params->call->attrs.as<SqueezeAttrs>();
-    // TODO(tmorris): if axis not defined, squeeze all dimensions with size 1.
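-    // A possible sketch of that fallback (hypothetical, not implemented here):
-    //   if (!attrs->axis.defined())
-    //     input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 1),
-    //                      input_dims.end());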
-    CHECK(attrs->axis.defined());
-    for (size_t i = 0; i < attrs->axis.size(); ++i) {
-      const int axis =
-          ConvertAxis(params, attrs->axis[i].as<IntImmNode>()->value, input_dims.size());
-      input_dims[axis] = 0;
-    }
-    input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0), input_dims.end());
-    params->outputs.push_back(Reshape(params, params->inputs.at(0).tensor, input_dims));
-  }
-};
-
-class UnaryOpConverter : public TrtOpConverter {
- public:
-  UnaryOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    // The following ops are supported by TRT but don't exist in relay yet:
-    // recip, tan, sinh, cosh, asin, acos, asinh, acosh, atanh
-    static const std::unordered_map<std::string, nvinfer1::UnaryOperation> op_map = {
-        {"exp", nvinfer1::UnaryOperation::kEXP},
-        {"log", nvinfer1::UnaryOperation::kLOG},
-        {"sqrt", nvinfer1::UnaryOperation::kSQRT},
-        {"abs", nvinfer1::UnaryOperation::kABS},
-        {"negative", nvinfer1::UnaryOperation::kNEG},
-#if TRT_VERSION_GE(5, 1, 5)
-        {"sin", nvinfer1::UnaryOperation::kSIN},
-        {"cos", nvinfer1::UnaryOperation::kCOS},
-        {"atan", nvinfer1::UnaryOperation::kATAN},
-        {"ceil", nvinfer1::UnaryOperation::kCEIL},
-        {"floor", nvinfer1::UnaryOperation::kFLOOR},
-#endif
-    };
-    auto it = op_map.find(params->op_name);
-    CHECK(it != op_map.end()) << "Unsupported unary type " << params->op_name;
-    nvinfer1::IUnaryLayer* unary_layer =
-        params->network->addUnary(*params->inputs.at(0).tensor, it->second);
-    CHECK(unary_layer != nullptr);
-    params->outputs.push_back(unary_layer->getOutput(0));
-  }
-};
-
-class ConcatOpConverter : public TrtOpConverter {
- public:
-  ConcatOpConverter() : TrtOpConverter({}, /*variable_input_count=*/true) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    const int num_inputs = params->inputs.size();
-    CHECK_GT(num_inputs, 0);
-    const int input_rank = params->inputs[0].tensor->getDimensions().nbDims;
-    std::vector<nvinfer1::ITensor*> input_tensors;
-    for (auto input : params->inputs) {
-      CHECK(input.type == kTensor);
-      CHECK_EQ(input_rank, input.tensor->getDimensions().nbDims);
-      input_tensors.push_back(input.tensor);
-    }
-
-    const auto* concat_attr = params->call->attrs.as<ConcatenateAttrs>();
-    const int axis = ConvertAxis(params, concat_attr->axis, input_rank);
-
-    nvinfer1::IConcatenationLayer* concat_layer =
-        params->network->addConcatenation(input_tensors.data(), input_tensors.size());
-    CHECK(concat_layer != nullptr);
-    concat_layer->setAxis(axis);
-    params->outputs.push_back(concat_layer->getOutput(0));
-  }
-};
-
-class BiasAddOpConverter : public TrtOpConverter {
- public:
-  BiasAddOpConverter() : TrtOpConverter({kTensor, kWeight}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input_tensor = params->inputs.at(0).tensor;
-    auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
-    const int required_rank = TRT_HAS_IMPLICIT_BATCH(params) ? 3 : 4;
-    CHECK(input_dims.size() > 0 && input_dims.size() <= required_rank);
-    const bool need_reshape_on_input = input_dims.size() != required_rank;
-    if (need_reshape_on_input) {
-      // Add dims of size 1 until rank is required_rank.
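-      // (Illustrative, not from the original: an explicit-batch (1, 16) input
-      // becomes (1, 16, 1, 1) here so that ScaleMode::kCHANNEL below applies
-      // the bias per channel.)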
-      std::vector<int> new_shape(input_dims);
-      while (new_shape.size() < required_rank) new_shape.insert(new_shape.end(), 1);
-      input_tensor = Reshape(params, input_tensor, new_shape);
-    }
-
-    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    nvinfer1::IScaleLayer* scale_layer = params->network->addScale(
-        *input_tensor, nvinfer1::ScaleMode::kCHANNEL, params->inputs.at(1).weight, shift, power);
-    CHECK(scale_layer != nullptr);
-    auto output_tensor = scale_layer->getOutput(0);
-    if (need_reshape_on_input) {
-      // Remove added dims.
-      output_tensor = Reshape(params, output_tensor, input_dims);
-    }
-    params->outputs.push_back(output_tensor);
-  }
-};
-
-class Conv2DTransposeOpConverter : public TrtOpConverter {
- public:
-  Conv2DTransposeOpConverter() : TrtOpConverter({kTensor, kWeight}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input_tensor = params->inputs.at(0).tensor;
-    auto weight_shape = params->inputs.at(1).weight_shape;
-    const auto* conv2d_attr = params->call->attrs.as<Conv2DTransposeAttrs>();
-    CHECK_EQ(conv2d_attr->data_layout, "NCHW");
-    CHECK(conv2d_attr->out_layout == "" || conv2d_attr->out_layout == "NCHW");
-    CHECK_EQ(conv2d_attr->kernel_layout, "OIHW");
-    CHECK(conv2d_attr->dilation[0].as<IntImmNode>()->value == 1 &&
-          conv2d_attr->dilation[1].as<IntImmNode>()->value == 1);
-
-    // TRT deconv op doesn't support asymmetric padding before 5.1, so we
-    // work around it by adding a padding layer before the deconv op.
-    nvinfer1::DimsHW prepadding, postpadding;
-    bool use_asymmetric_padding;
-    GetPadding(conv2d_attr->padding, &use_asymmetric_padding, &prepadding, &postpadding);
-#if !TRT_VERSION_GE(5, 1, 5)
-    if (use_asymmetric_padding) {
-      auto pad_layer = params->network->addPadding(*input_tensor, prepadding, postpadding);
-      CHECK(pad_layer != nullptr);
-      input_tensor = pad_layer->getOutput(0);
-      // No need for deconv op to do any padding.
-      use_asymmetric_padding = false;
-      prepadding = nvinfer1::DimsHW(0, 0);
-    }
-#endif
-
-    // Could use conv2d_attr->channels.as<IntImmNode>()->value
-    const int num_outputs = weight_shape[1];
-    const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size,
-                                                          params->inputs.at(1).weight, bias);
-    CHECK(deconv_layer != nullptr);
-    if (use_asymmetric_padding) {
-#if TRT_VERSION_GE(5, 1, 5)
-      deconv_layer->setPrePadding(prepadding);
-      deconv_layer->setPostPadding(postpadding);
-#endif
-    } else {
-      deconv_layer->setPadding(prepadding);
-    }
-    const auto strides = nvinfer1::DimsHW(conv2d_attr->strides[0].as<IntImmNode>()->value,
-                                          conv2d_attr->strides[1].as<IntImmNode>()->value);
-    deconv_layer->setStride(strides);
-    deconv_layer->setNbGroups(conv2d_attr->groups);
-    nvinfer1::ITensor* output = deconv_layer->getOutput(0);
-    // Output padding.
-    if (conv2d_attr->output_padding.size()) {
-      GetPadding(conv2d_attr->output_padding, &use_asymmetric_padding, &prepadding, &postpadding);
-      if (prepadding.h() != 0 || prepadding.w() != 0 || postpadding.h() != 0 ||
-          postpadding.w() != 0) {
-        // Output padding for Conv2D transpose is always asymmetric and applied to post only.
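-        // (e.g. output_padding=(1, 1) appends one row at the bottom and one
-        // column at the right of the deconvolution output.)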
-        prepadding = nvinfer1::DimsHW(0, 0);
-        auto pad_layer = params->network->addPadding(*output, prepadding, postpadding);
-        output = pad_layer->getOutput(0);
-      }
-    }
-    params->outputs.push_back(output);
-  }
-};
-
-#if TRT_VERSION_GE(6, 0, 1)
-class Conv3DTransposeOpConverter : public TrtOpConverter {
- public:
-  Conv3DTransposeOpConverter() : TrtOpConverter({kTensor, kWeight}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input_tensor = params->inputs.at(0).tensor;
-    auto weight_shape = params->inputs.at(1).weight_shape;
-    const auto* attrs = params->call->attrs.as<Conv3DTransposeAttrs>();
-    CHECK_EQ(attrs->data_layout, "NCDHW");
-    CHECK(attrs->out_layout == "" || attrs->out_layout == "NCDHW");
-    CHECK_EQ(attrs->kernel_layout, "OIDHW");
-    CHECK(attrs->dilation[0].as<IntImmNode>()->value == 1 &&
-          attrs->dilation[1].as<IntImmNode>()->value == 1 &&
-          attrs->dilation[2].as<IntImmNode>()->value == 1);
-
-    nvinfer1::Dims prepadding, postpadding;
-    bool use_asymmetric_padding;
-    GetPadding3D(attrs->padding, &use_asymmetric_padding, &prepadding, &postpadding);
-
-    // Could use attrs->channels.as<IntImmNode>()->value
-    const int num_outputs = weight_shape[1];
-    const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size,
-                                                            params->inputs.at(1).weight, bias);
-    CHECK(deconv_layer != nullptr);
-    if (use_asymmetric_padding) {
-      deconv_layer->setPrePadding(prepadding);
-      deconv_layer->setPostPadding(postpadding);
-    } else {
-      deconv_layer->setPaddingNd(prepadding);
-    }
-    const auto strides = nvinfer1::Dims3(attrs->strides[0].as<IntImmNode>()->value,
-                                         attrs->strides[1].as<IntImmNode>()->value,
-                                         attrs->strides[2].as<IntImmNode>()->value);
-    deconv_layer->setStrideNd(strides);
-    deconv_layer->setNbGroups(attrs->groups);
-    nvinfer1::ITensor* output = deconv_layer->getOutput(0);
-    // Output padding.
-    if (attrs->output_padding.size()) {
-      GetPadding3D(attrs->output_padding, &use_asymmetric_padding, &prepadding, &postpadding);
-      // Are any post-padding values non-zero?
-      CHECK(!std::any_of(postpadding.d, postpadding.d + postpadding.nbDims,
-                         [](int x) { return x != 0; }))
-          << "TRT does not support output padding for 3D conv transpose.";
-    }
-    params->outputs.push_back(output);
-  }
-};
-#endif  // TRT_VERSION_GE(6, 0, 1)
-
-class TransposeOpConverter : public TrtOpConverter {
- public:
-  TransposeOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    const auto* attrs = params->call->attrs.as<TransposeAttrs>();
-    std::vector<int> order;
-    for (size_t i = 0; i < attrs->axes.size(); ++i) {
-      order.push_back(attrs->axes[i].as<IntImmNode>()->value);
-    }
-    params->outputs.push_back(Transpose(params, input, order));
-  }
-};
-
-class ReshapeOpConverter : public TrtOpConverter {
- public:
-  ReshapeOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    const auto* attrs = params->call->attrs.as<ReshapeAttrs>();
-    CHECK_EQ(attrs->reverse, false);
-    std::vector<int> new_shape;
-    const int start_index = TRT_HAS_IMPLICIT_BATCH(params) ? 1 : 0;
-    for (size_t i = start_index; i < attrs->newshape.size(); ++i) {
-      CHECK(attrs->newshape[i].defined());
-      const int value = attrs->newshape[i]->value;
-      CHECK_GE(value, -1);
-      new_shape.push_back(value);
-    }
-    params->outputs.push_back(Reshape(params, input, new_shape));
-  }
-};
-
-class PadOpConverter : public TrtOpConverter {
- public:
-  PadOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    const auto* attrs = params->call->attrs.as<PadAttrs>();
-    const int input_rank_with_batch =
-        input->getDimensions().nbDims + (TRT_HAS_IMPLICIT_BATCH(params) ? 1 : 0);
-    CHECK_EQ(input_rank_with_batch, attrs->pad_width.size());
-    CHECK(!TRT_HAS_IMPLICIT_BATCH(params) || (attrs->pad_width[0][0].as<IntImmNode>()->value == 0 &&
-                                              attrs->pad_width[0][1].as<IntImmNode>()->value == 0))
-        << "Cannot pad on batch dimension.";
-
-    nvinfer1::DimsHW prepadding, postpadding;
-    // Check if we need to transpose from NHWC -> NCHW.
-    const bool need_transpose = attrs->pad_width[1][0].as<IntImmNode>()->value != 0 ||
-                                attrs->pad_width[1][1].as<IntImmNode>()->value != 0;
-    if (need_transpose) {
-      input = Transpose(params, input, {0, 3, 1, 2});
-      prepadding = nvinfer1::DimsHW(attrs->pad_width[1][0].as<IntImmNode>()->value,
-                                    attrs->pad_width[2][0].as<IntImmNode>()->value);
-      postpadding = nvinfer1::DimsHW(attrs->pad_width[1][1].as<IntImmNode>()->value,
-                                     attrs->pad_width[2][1].as<IntImmNode>()->value);
-    } else {
-      prepadding = nvinfer1::DimsHW(attrs->pad_width[2][0].as<IntImmNode>()->value,
-                                    attrs->pad_width[3][0].as<IntImmNode>()->value);
-      postpadding = nvinfer1::DimsHW(attrs->pad_width[2][1].as<IntImmNode>()->value,
-                                     attrs->pad_width[3][1].as<IntImmNode>()->value);
-    }
-    auto pad_layer = params->network->addPadding(*input, prepadding, postpadding);
-    CHECK(pad_layer != nullptr);
-    auto output = pad_layer->getOutput(0);
-    if (need_transpose) {
-      // NCHW -> NHWC
-      output = Transpose(params, output, {0, 2, 3, 1});
-    }
-    params->outputs.push_back(output);
-  }
-};
-
-class ReduceOpConverter : public TrtOpConverter {
- public:
-  ReduceOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    static const std::unordered_map<std::string, nvinfer1::ReduceOperation> op_map = {
-        {"sum", nvinfer1::ReduceOperation::kSUM},
-        {"prod", nvinfer1::ReduceOperation::kPROD},
-        {"max", nvinfer1::ReduceOperation::kMAX},
-        {"min", nvinfer1::ReduceOperation::kMIN},
-        {"mean", nvinfer1::ReduceOperation::kAVG}};
-    auto it = op_map.find(params->op_name);
-    CHECK(it != op_map.end()) << "Unsupported reduce type " << params->op_name;
-
-    auto input = params->inputs.at(0).tensor;
-    const auto* attrs = params->call->attrs.as<ReduceAttrs>();
-    CHECK(attrs->exclude == false);
-    // TODO(trevmorr): Support reduce to scalar.
-    CHECK(attrs->axis.defined() && attrs->axis.size() > 0);
-    uint32_t reduce_axes = 0;
-    for (size_t i = 0; i < attrs->axis.size(); ++i) {
-      const int axis = ConvertAxis(params, attrs->axis[i].as<IntImmNode>()->value,
-                                   input->getDimensions().nbDims);
-      reduce_axes |= 1 << axis;
-    }
-    auto reduce_layer =
-        params->network->addReduce(*input, it->second, reduce_axes, attrs->keepdims);
-    params->outputs.push_back(reduce_layer->getOutput(0));
-  }
-};
-
-#if TRT_VERSION_GE(5, 1, 5)
-class StridedSliceOpConverter : public TrtOpConverter {
- public:
-  StridedSliceOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    auto input_dims = TrtDimsToVector(input->getDimensions());
-    const auto* attrs = params->call->attrs.as<StridedSliceAttrs>();
-    // Dynamic shapes not supported.
-    CHECK(attrs->begin && attrs->end && attrs->strides);
-    const int input_rank_with_batch =
-        input->getDimensions().nbDims + (TRT_HAS_IMPLICIT_BATCH(params) ? 1 : 0);
-    CHECK_EQ(input_rank_with_batch, attrs->begin.value().size());
-    CHECK_EQ(input_rank_with_batch, attrs->end.value().size());
-    const bool default_strides =
-        !attrs->strides.value().defined() || attrs->strides.value().size() == 0;
-    if (TRT_HAS_IMPLICIT_BATCH(params)) {
-      CHECK(default_strides || !attrs->strides.value()[0].defined() ||
-            attrs->strides.value()[0].as<IntImmNode>()->value == 1);
-    }
-
-    auto process_slice_index = [](Integer x, int default_value) {
-      if (!x.defined()) return default_value;
-      int value = x.as<IntImmNode>()->value;
-      if (value == -1) return default_value;
-      return value;
-    };
-
-    const int start_index = TRT_HAS_IMPLICIT_BATCH(params) ? 1 : 0;
-    std::vector<int> start, size, strides;
-    for (size_t i = start_index; i < attrs->begin.value().size(); ++i) {
-      const int begin_value = process_slice_index(attrs->begin.value()[i], 0);
-      const int end_value = process_slice_index(attrs->end.value()[i], input_dims[i - start_index]);
-      const int stride_value = (default_strides || i >= attrs->strides.value().size() ||
-                                !attrs->strides.value()[i].defined())
-                                   ? 1
-                                   : attrs->strides.value()[i].as<IntImmNode>()->value;
-      CHECK_GT(stride_value, 0);
-      // Ceil division: e.g. begin=1, end=10, stride=2 gives
-      // size = (10 - 1 + 2 - 1) / 2 = 5 elements (indices 1, 3, 5, 7, 9).
-      const int size_value = (end_value - begin_value + stride_value - 1) / stride_value;
-      CHECK_GE(begin_value, 0);
-      CHECK_GT(size_value, 0);
-      start.push_back(begin_value);
-      size.push_back(size_value);
-      strides.push_back(stride_value);
-    }
-
-    auto slice_layer = params->network->addSlice(*input, VectorToTrtDims(start),
-                                                 VectorToTrtDims(size), VectorToTrtDims(strides));
-    params->outputs.push_back(slice_layer->getOutput(0));
-  }
-};
-#endif
-
-class AdaptivePoolingOpConverter : public TrtOpConverter {
- public:
-  AdaptivePoolingOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input_tensor = params->inputs.at(0).tensor;
-    auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
-    static const std::unordered_map<std::string, nvinfer1::PoolingType> op_map = {
-        {"nn.adaptive_max_pool2d", nvinfer1::PoolingType::kMAX},
-        {"nn.adaptive_avg_pool2d", nvinfer1::PoolingType::kAVERAGE}};
-    auto it = op_map.find(params->op_name);
-    CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT";
-    const auto* attrs = params->call->attrs.as<AdaptivePool2DAttrs>();
-    CHECK_EQ(attrs->layout, "NCHW");
-
-    // This is an approximation of adaptive pooling. Results will not be
-    // mathematically exact except when output_size is (1, 1).
-    // Annotation rules will only allow an output size of (1, 1).
-    auto output_size = nvinfer1::DimsHW(1, 1);
-    const int h = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[1] : input_dims[2];
-    const int w = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[2] : input_dims[3];
-    const auto stride = nvinfer1::DimsHW(h / output_size.h(), w / output_size.w());
-    const auto window_size = nvinfer1::DimsHW(h - (output_size.h() - 1) * stride.h(),
-                                              w - (output_size.w() - 1) * stride.w());
-    auto pool_layer = params->network->addPooling(*input_tensor, it->second, window_size);
-    CHECK(pool_layer != nullptr);
-    pool_layer->setStride(stride);
-    params->outputs.push_back(pool_layer->getOutput(0));
-  }
-};
-
-#if TRT_VERSION_GE(6, 0, 1)
-class ResizeOpConverter : public TrtOpConverter {
- public:
-  ResizeOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    const auto* attrs = params->call->attrs.as<ResizeAttrs>();
-    static const std::unordered_map<std::string, nvinfer1::ResizeMode> op_map = {
-        {"nearest_neighbor", nvinfer1::ResizeMode::kNEAREST},
-        {"bilinear", nvinfer1::ResizeMode::kLINEAR}};
-    auto it = op_map.find(attrs->method);
-    CHECK(it != op_map.end()) << "Unsupported resize type " << attrs->method;
-    CHECK_EQ(attrs->size.size(), 2);
-    auto output_dims = TrtDimsToVector(input->getDimensions());
-    const int required_rank = TRT_HAS_IMPLICIT_BATCH(params) ? 3 : 4;
-    CHECK_EQ(output_dims.size(), required_rank);
-    CHECK(attrs->layout == "NCHW" || attrs->layout == "NHWC");
-    int h_index = attrs->layout == "NCHW" ? 2 : 1;
-    int w_index = attrs->layout == "NCHW" ? 3 : 2;
-    if (TRT_HAS_IMPLICIT_BATCH(params)) {
-      h_index -= 1;
-      w_index -= 1;
-    }
-    output_dims[h_index] = attrs->size[0].as<IntImmNode>()->value;
-    output_dims[w_index] = attrs->size[1].as<IntImmNode>()->value;
-
-    nvinfer1::IResizeLayer* resize_layer = params->network->addResize(*input);
-    CHECK(resize_layer != nullptr);
-    resize_layer->setResizeMode(it->second);
-    resize_layer->setOutputDimensions(VectorToTrtDims(output_dims));
-    resize_layer->setAlignCorners(attrs->coordinate_transformation_mode == "align_corners");
-    params->outputs.push_back(resize_layer->getOutput(0));
-  }
-};
-#endif  // TRT_VERSION_GE(6, 0, 1)
-
-#if TRT_VERSION_GE(5, 1, 5)
-class SplitOpConverter : public TrtOpConverter {
- public:
-  SplitOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    auto input_dims = TrtDimsToVector(input->getDimensions());
-    const auto* attrs = params->call->attrs.as<SplitAttrs>();
-    const int input_rank = input->getDimensions().nbDims;
-    const int axis = ConvertAxis(params, attrs->axis, input_dims.size());
-    const int sections = attrs->indices_or_sections.as<IntImmNode>()->value;
-
-    std::vector<int> start(input_dims.size(), 0);
-    std::vector<int> size(input_dims.begin(), input_dims.end());
-    size[axis] = input_dims[axis] / sections;
-    std::vector<int> strides(input_dims.size(), 1);
-    for (int i = 0; i < sections; ++i) {
-      start[axis] = i * size[axis];
-      auto slice_layer = params->network->addSlice(*input, VectorToTrtDims(start),
-                                                   VectorToTrtDims(size), VectorToTrtDims(strides));
-      params->outputs.push_back(slice_layer->getOutput(0));
-    }
-  }
-};
-#endif  // TRT_VERSION_GE(5, 1, 5)
-
-#if TRT_VERSION_GE(5, 1, 5)
-// TODO(trevmorr): Not needed due to SimplifySliceLike which converts slice_like
-// to strided_slice. slice_like has a false dependency on the second input
-// tensor since only the shape is needed. This confuses TRT.
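-// In Relay terms the rewrite is roughly (a sketch, assuming static shapes):
-//   slice_like(x, y, axes)  =>  strided_slice(x, begin=zeros,
-//                                             end=shape(y) on axes, strides=ones)
-// which removes the spurious data dependency on y.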
-class SliceLikeOpConverter : public TrtOpConverter {
- public:
-  SliceLikeOpConverter() : TrtOpConverter({kTensor, kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    auto input_2 = params->inputs.at(1).tensor;
-    auto input_dims = TrtDimsToVector(input->getDimensions());
-    auto new_dims = TrtDimsToVector(input_2->getDimensions());
-    const auto* attrs = params->call->attrs.as<SliceLikeAttrs>();
-    if (attrs->axes.defined()) {
-      for (size_t i = 0; i < attrs->axes.size(); i++) {
-        const int axis =
-            ConvertAxis(params, attrs->axes[i].as<IntImmNode>()->value, input_dims.size());
-        input_dims[axis] = new_dims[axis];
-      }
-    } else {
-      // Use all dims when axes is not defined.
-      CHECK_EQ(input_dims.size(), new_dims.size());
-      input_dims = new_dims;
-    }
-
-    // slice_like always begins at 0.
-    std::vector<int> start(input_dims.size(), 0);
-    std::vector<int> strides(input_dims.size(), 1);
-    auto slice_layer = params->network->addSlice(
-        *input, VectorToTrtDims(start), VectorToTrtDims(input_dims), VectorToTrtDims(strides));
-    params->outputs.push_back(slice_layer->getOutput(0));
-  }
-};
-#endif  // TRT_VERSION_GE(5, 1, 5)
-
-#if TRT_VERSION_GE(6, 0, 1)
-class UpsamplingOpConverter : public TrtOpConverter {
- public:
-  UpsamplingOpConverter() : TrtOpConverter({kTensor}) {}
-
-  void Convert(AddTrtLayerParams* params) const {
-    auto input = params->inputs.at(0).tensor;
-    const auto* attrs = params->call->attrs.as<UpSamplingAttrs>();
-    static const std::unordered_map<std::string, nvinfer1::ResizeMode> op_map = {
-        {"nearest_neighbor", nvinfer1::ResizeMode::kNEAREST},
-        {"bilinear", nvinfer1::ResizeMode::kLINEAR}};
-    auto it = op_map.find(attrs->method);
-    CHECK(it != op_map.end()) << "Unsupported resize type " << attrs->method;
-    auto output_dims = TrtDimsToVector(input->getDimensions());
-    const int required_rank = TRT_HAS_IMPLICIT_BATCH(params) ? 3 : 4;
-    CHECK_EQ(output_dims.size(), required_rank);
-    CHECK(attrs->layout == "NCHW" || attrs->layout == "NHWC");
-    int h_index = attrs->layout == "NCHW" ? 2 : 1;
-    int w_index = attrs->layout == "NCHW" ? 3 : 2;
-    if (TRT_HAS_IMPLICIT_BATCH(params)) {
-      h_index -= 1;
-      w_index -= 1;
-    }
-    output_dims[h_index] *= attrs->scale_h;
-    output_dims[w_index] *= attrs->scale_w;
-
-    nvinfer1::IResizeLayer* resize_layer = params->network->addResize(*input);
-    CHECK(resize_layer != nullptr);
-    resize_layer->setResizeMode(it->second);
-    resize_layer->setOutputDimensions(VectorToTrtDims(output_dims));
-    resize_layer->setAlignCorners(attrs->align_corners);
-    params->outputs.push_back(resize_layer->getOutput(0));
-  }
-};
-#endif  // TRT_VERSION_GE(6, 0, 1)
-
-}  // namespace contrib
-}  // namespace relay
-}  // namespace tvm
-
-#endif  // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_OPS_H_
diff --git a/src/runtime/contrib/tensorrt/utils.h b/src/runtime/contrib/tensorrt/utils.h
deleted file mode 100644
index a43183bda336..000000000000
--- a/src/runtime/contrib/tensorrt/utils.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file runtime/contrib/tensorrt/utils.h
- * \brief Helper functions used by TensorRTBuilder or TrtOpConverters.
- */
-
-#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_UTILS_H_
-#define TVM_RUNTIME_CONTRIB_TENSORRT_UTILS_H_
-
-#include "NvInfer.h"
-
-#include <sstream>
-#include <string>
-#include <vector>
-#include <tvm/relay/type.h>
-
-namespace tvm {
-namespace relay {
-namespace contrib {
-
-/*!
- * \brief Helper function to convert a vector to TRT Dims.
- * \param vec Vector.
- * \return TRT Dims.
- */
-template <typename T>
-nvinfer1::Dims VectorToTrtDims(const std::vector<T>& vec) {
-  nvinfer1::Dims dims;
-  // Dims(nbDims=0, d[0]=1) is used to represent a scalar in TRT.
-  dims.d[0] = 1;
-  dims.nbDims = vec.size();
-  for (size_t i = 0; i < vec.size(); ++i) {
-    dims.d[i] = vec[i];
-  }
-  return dims;
-}
-
-/*!
- * \brief Helper function to convert TRT Dims to a vector.
- * \param dims TRT Dims.
- * \return Vector.
- */
-std::vector<int> TrtDimsToVector(const nvinfer1::Dims& dims) {
-  return std::vector<int>(dims.d, dims.d + dims.nbDims);
-}
-
-/*!
- * \brief Helper function to convert a vector to a string.
- * \param vec Vector.
- * \return Vector as a string.
- */
-std::string DebugString(const std::vector<int>& vec) {
-  std::ostringstream ss;
-  ss << "(";
-  for (size_t i = 0; i < vec.size(); ++i) {
-    if (i != 0) ss << ", ";
-    ss << vec[i];
-  }
-  ss << ")";
-  return ss.str();
-}
-
-std::vector<int> GetShape(const Type& type) {
-  const auto* ttype = type.as<TensorTypeNode>();
-  CHECK(ttype);
-  std::vector<int> _shape;
-  _shape.reserve(ttype->shape.size());
-  for (size_t i = 0; i < ttype->shape.size(); ++i) {
-    auto* val = ttype->shape[i].as<IntImmNode>();
-    _shape.push_back(val ? val->value : -1);
-  }
-  return _shape;
-}
-
-}  // namespace contrib
-}  // namespace relay
-}  // namespace tvm
-
-#endif  // TVM_RUNTIME_CONTRIB_TENSORRT_UTILS_H_
diff --git a/tests/python/relay/test_tensorrt.py b/tests/python/relay/test_tensorrt.py
deleted file mode 100644
index 3ed1cdb0fd44..000000000000
--- a/tests/python/relay/test_tensorrt.py
+++ /dev/null
@@ -1,914 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np - -import tvm -from tvm import relay -import tvm.relay.testing -import tvm.relay.tensorrt -from tvm.contrib import graph_runtime -from tvm.runtime.vm import VirtualMachine - - -def should_skip(): - if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: - print("skip because cuda is not enabled.") - return True - if not relay.tensorrt.IsTrtRuntimeAvailable(): - print("skip because tensorrt runtime is not available") - return True - return False - - -def vmobj_to_list(o): - if isinstance(o, tvm.nd.NDArray): - return [o.asnumpy()] - elif isinstance(o, tvm.runtime.container.ADT) or isinstance(o, list): - result = [] - for f in o: - result.extend(vmobj_to_list(f)) - return result - elif isinstance(o, tvm.relay.backend.interpreter.ConstructorValue): - if o.constructor.name_hint == "Cons": - tl = vmobj_to_list(o.fields[1]) - hd = vmobj_to_list(o.fields[0]) - hd.extend(tl) - return hd - elif o.constructor.name_hint == "Nil": - return [] - elif "tensor_nil" in o.constructor.name_hint: - return [0] - elif "tensor" in o.constructor.name_hint: - return [o.fields[0].asnumpy()] - else: - raise RuntimeError("Unknown object type: %s" % o.constructor.name_hint) - else: - raise RuntimeError("Unknown object type: %s" % type(o)) - - -def assert_result_matches(res1, res2): - for r1, r2 in zip(vmobj_to_list(res1), vmobj_to_list(res2)): - tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3) - - -def test_tensorrt_simple(): - if should_skip(): - return - dtype = "float32" - xshape = (1, 3, 2, 2) - yshape = (1, 3, 1, 1) - zshape = (1, 1, 1, 1) - x = relay.var("x", shape=(xshape), dtype=dtype) - y = relay.var("y", shape=(yshape), dtype=dtype) - z = relay.var("z", shape=(zshape), dtype=dtype) - w = z * (x + y) - out = relay.nn.relu(w) - f = relay.Function([x, y, z], out) - - x_data = np.random.uniform(-1, 1, xshape).astype(dtype) - y_data = np.random.uniform(-1, 1, yshape).astype(dtype) - z_data = np.random.uniform(-1, 1, zshape).astype(dtype) - mod = tvm.IRModule() - mod["main"] = f - - result_dict = dict() - for mode in ["vm", "graph"]: - for use_trt in [True, False]: - result_key = mode + ("_trt" if use_trt else "") - if use_trt: - mod = relay.tensorrt.EnableTrt(mod) - with relay.build_config(opt_level=3): - relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") - results = relay_exec.evaluate()(x_data, y_data, z_data) - result_dict[result_key] = results - - assert_result_matches(result_dict["vm_trt"], result_dict["vm"]) - assert_result_matches(result_dict["graph_trt"], result_dict["graph"]) - assert_result_matches(result_dict["graph_trt"], result_dict["vm_trt"]) - - -def test_tensorrt_simple_cpu_io(): - if should_skip(): - return - dtype = "float32" - xshape = (1, 3, 2, 2) - yshape = (1, 3, 1, 1) - zshape = (1, 1, 1, 1) - x = relay.var("x", shape=(xshape), dtype=dtype) - y = relay.var("y", shape=(yshape), dtype=dtype) - z = relay.var("z", shape=(zshape), dtype=dtype) - w = z * (x + y) - out = relay.nn.relu(w) - f = relay.Function([x, y, z], out) - - x_data = np.random.uniform(-1, 1, xshape).astype(dtype) - y_data = np.random.uniform(-1, 1, yshape).astype(dtype) - z_data = np.random.uniform(-1, 1, zshape).astype(dtype) - - mod = tvm.IRModule() - mod["main"] = f - mod = relay.tensorrt.EnableTrt(mod) - params = {"y": y_data} - with relay.build_config(opt_level=3): - graph, lib, params = relay.build(mod, target="llvm", params=params) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu()) - mod.set_input(**params) - mod.run(x=x_data, z=z_data) - results = 
[mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] - - -def test_tensorrt_not_compatible(): - if should_skip(): - return - dtype = "float32" - xshape = (1, 32, 14, 14) - x = relay.var("x", shape=(xshape), dtype=dtype) - y = relay.add(x, x) - z = relay.erf(y) - out = relay.nn.relu(z) - f = relay.Function([x], out) - mod = tvm.IRModule() - mod["main"] = f - mod = relay.tensorrt.EnableTrt(mod) - assert not mod["main"].attrs - - -def test_tensorrt_ops(): - if should_skip(): - return - - def run_and_verify(config): - f, input_shapes, is_param = config - params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32) for x in is_param} - input_dict = { - k: np.random.uniform(-1, 1, v).astype(np.float32) - for k, v in input_shapes.items() - if k not in is_param - } - - results = dict() - for mode in ["graph", "vm"]: - for use_trt in [True, False]: - mod = tvm.IRModule() - mod["main"] = f - result_key = mode + ("_trt" if use_trt else "") - if use_trt: - mod = relay.tensorrt.EnableTrt(mod, params) - - with relay.build_config(opt_level=3): - vm_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") - results[result_key] = vm_exec.evaluate()(**input_dict, **params) - - assert_result_matches(results["vm_trt"], results["vm"]) - assert_result_matches(results["graph_trt"], results["graph"]) - assert_result_matches(results["graph_trt"], results["vm_trt"]) - - def test_conv2d( - x_shape=(1, 32, 8, 8), - k_shape=(16, 32, 3, 3), - groups=1, - padding=(0, 0), - strides=(1, 1), - dilation=(1, 1), - ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") - out = relay.nn.conv2d( - x, - kernel, - channels=k_shape[0], - kernel_size=k_shape[2:4], - groups=groups, - padding=padding, - strides=strides, - dilation=dilation, - ) - f = relay.Function([x, kernel], out) - return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] - - def test_conv2d_const_weights( - x_shape=(1, 32, 8, 8), - k_shape=(16, 32, 3, 3), - groups=1, - padding=(0, 0), - strides=(1, 1), - dilation=(1, 1), - ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.const(np.ones(k_shape).astype("float32")) - out = relay.nn.conv2d( - x, - kernel, - channels=k_shape[0], - kernel_size=k_shape[2:4], - groups=groups, - padding=padding, - strides=strides, - dilation=dilation, - ) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_dense(x_shape=(1, 16), k_shape=(32, 16)): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") - # Dense requires constant weights in TensorRT, so the weights are transposed by us. 
- out = relay.nn.dense(x, kernel, units=k_shape[0]) - f = relay.Function([x, kernel], out) - return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] - - def test_bias_add(x_shape=(1, 16), channels=16): - x = relay.var("x", shape=(x_shape), dtype="float32") - bias = relay.var("bias", shape=(channels,), dtype="float32") - out = relay.nn.bias_add(x, bias) - f = relay.Function([x, bias], out) - return f, {"x": x_shape, "bias": (channels,)}, ["bias"] - - def test_pool2d( - op, - x_shape=(1, 3, 32, 32), - pool_size=(2, 2), - strides=(2, 2), - padding=(0, 0), - ceil_mode=False, - count_include_pad=None, - ): - x = relay.var("x", shape=(x_shape), dtype="float32") - if count_include_pad is not None: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ) - else: - out = op(x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_global_pool2d(op, x_shape=(1, 3, 32, 32)): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = op(x) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_batch_flatten(x_shape=(1, 3, 4, 6)): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = relay.nn.batch_flatten(x) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_expand_dims(x_shape=(1, 3), axis=1, num_newaxis=1): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = relay.expand_dims(x, axis, num_newaxis) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_squeeze(x_shape, axis): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = relay.squeeze(x, axis=axis) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_concatenate(input_shapes, axis): - concat_inputs = [] - shapes_dict = {} - for i in range(len(input_shapes)): - name = "input_{}".format(i) - concat_inputs.append(relay.var(name, shape=(input_shapes[i]), dtype="float32")) - shapes_dict[name] = input_shapes[i] - out = relay.concatenate(concat_inputs, axis) - f = relay.Function(concat_inputs, out) - return f, shapes_dict, [] - - def test_conv2d_transpose( - x_shape=(1, 32, 8, 8), k_shape=(32, 16, 3, 3), groups=1, padding=(0, 0), strides=(1, 1) - ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") - out = relay.nn.conv2d_transpose( - x, - kernel, - channels=k_shape[1], - kernel_size=k_shape[2:4], - groups=groups, - padding=padding, - strides=strides, - ) - f = relay.Function([x, kernel], out) - return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] - - def test_reshape(x_shape, new_shape): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = relay.reshape(x, new_shape) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_transpose(x_shape, order): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = relay.transpose(x, order) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_transpose_weights_conv2d( - x_shape=(1, 32, 9, 9), k_shape=(3, 3, 32, 16), order=(3, 2, 0, 1) - ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") - kernel_t = relay.transpose(kernel, order) - # Conv2d requires constant weights in TensorRT, so the weights are transposed by us. 
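-        # (Added note, an assumption rather than original text: the transpose
-        # above is expected to be constant-folded into the kernel once params
-        # are bound by EnableTrt, so TRT still receives a constant weight.)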
-        out = relay.nn.conv2d(x, kernel_t, channels=k_shape[order[0]], kernel_size=(3, 3))
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    def test_transpose_weights_dense(x_shape=(1, 16), k_shape=(16, 32)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
-        kernel_t = relay.transpose(kernel, (1, 0))
-        # Dense requires constant weights in TensorRT, so the weights are transposed by us.
-        out = relay.nn.dense(x, kernel_t)
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    def test_dense_from_pytorch(x_shape=(1, 16), k_shape=(32, 16)):
-        # FixPyTorchAddmm will fold away the transpose -> multiply -> transpose.
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
-        kernel_t = relay.transpose(kernel, (1, 0))
-        beta = relay.const(1, dtype="float32")
-        kernel_t = relay.multiply(kernel_t, beta)
-        kernel_t = relay.transpose(kernel_t, (1, 0))
-        # Dense requires constant weights in TensorRT, so the weights are transposed by us.
-        out = relay.nn.dense(x, kernel_t)
-        f = relay.Function([x, kernel], out)
-        return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
-
-    def test_float_const(x_shape=(1, 16)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        beta = relay.const(1, dtype="float32")
-        out = relay.multiply(x, beta)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    def test_pad(x_shape, pad_width):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = relay.nn.pad(x, pad_width=pad_width)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    def test_softmax(x_shape, axis):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = relay.nn.softmax(x, axis=axis)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    def test_batch_norm(x_shape, param_shape, axis=1, epsilon=1e-5):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        beta = relay.var("beta", shape=(param_shape), dtype="float32")
-        gamma = relay.var("gamma", shape=(param_shape), dtype="float32")
-        moving_mean = relay.var("moving_mean", shape=(param_shape), dtype="float32")
-        moving_var = relay.var("moving_var", shape=(param_shape), dtype="float32")
-        out, _, _ = relay.nn.batch_norm(
-            x,
-            gamma=gamma,
-            beta=beta,
-            moving_mean=moving_mean,
-            moving_var=moving_var,
-            axis=axis,
-            center=True,
-            scale=True,
-            epsilon=epsilon,
-        )
-        f = relay.Function([x, gamma, beta, moving_mean, moving_var], out)
-        return (
-            f,
-            {
-                "x": x_shape,
-                "beta": param_shape,
-                "gamma": param_shape,
-                "moving_mean": param_shape,
-                "moving_var": param_shape,
-            },
-            ["beta", "gamma", "moving_mean", "moving_var"],
-        )
-
-    def test_unary(op, x_shape=(1, 8, 3, 3)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = op(x)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    def test_clip(x_shape=(1, 8, 3, 3)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = relay.clip(x, a_min=-0.2, a_max=0.4)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    def test_leaky_relu(x_shape=(1, 8, 3, 3)):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        out = relay.nn.leaky_relu(x, alpha=0.1)
-        f = relay.Function([x], out)
-        return f, {"x": x_shape}, []
-
-    def test_binary(op, x_shape, y_shape, y_is_const=False):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
-        if y_is_const:
-            y =
relay.const(np.ones(y_shape).astype("float32")) - out = op(x, y) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - y = relay.var("y", shape=(y_shape), dtype="float32") - out = op(x, y) - f = relay.Function([x, y], out) - return f, {"x": x_shape, "y": y_shape}, [] - - def test_reduce(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = op(x, axis=axis, keepdims=keepdims) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_strided_slice(x_shape, begin, end, strides=None): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = relay.strided_slice(x, begin, end, strides) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_adaptive_pool2d(op, x_shape=(1, 3, 32, 32), out_size=(1, 1)): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = op(x, out_size) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_resize( - x_shape=(1, 3, 16, 16), - out_size=(32, 32), - layout="NCHW", - method="nearest_neighbor", - coordinate_transformation_mode="align_corners", - ): - x = relay.var("x", shape=(x_shape), dtype="float32") - out = relay.image.resize( - x, - out_size, - layout=layout, - method=method, - coordinate_transformation_mode=coordinate_transformation_mode, - ) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_multiple_outputs(): - x = relay.var("x", shape=(1, 3), dtype="float32") - y = relay.var("y", shape=(1, 3), dtype="float32") - z = relay.add(x, y) - w = relay.add(z, y) - out = relay.Tuple((z, w)) - f = relay.Function([x, y], out) - return f, {"x": (1, 3), "y": (1, 3)}, [] - - def test_conv3d( - x_shape=(1, 32, 8, 8, 8), - k_shape=(16, 32, 3, 3, 3), - groups=1, - padding=(0, 0, 0), - strides=(1, 1, 1), - dilation=(1, 1, 1), - ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") - out = relay.nn.conv3d( - x, - kernel, - channels=k_shape[0], - kernel_size=k_shape[2:], - groups=groups, - padding=padding, - strides=strides, - dilation=dilation, - ) - f = relay.Function([x, kernel], out) - return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] - - def test_pool3d( - op, - x_shape=(1, 3, 8, 32, 32), - pool_size=(2, 2, 2), - strides=(2, 2, 2), - padding=(0, 0, 0), - ceil_mode=False, - count_include_pad=None, - ): - x = relay.var("x", shape=(x_shape), dtype="float32") - if count_include_pad is not None: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ) - else: - out = op(x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode) - f = relay.Function([x], out) - return f, {"x": x_shape}, [] - - def test_conv3d_transpose( - x_shape=(1, 32, 8, 8, 8), - k_shape=(32, 16, 3, 3, 3), - groups=1, - padding=(0, 0, 0), - strides=(1, 1, 1), - output_padding=(0, 0, 0), - ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") - out = relay.nn.conv3d_transpose( - x, - kernel, - channels=k_shape[1], - kernel_size=k_shape[2:5], - groups=groups, - padding=padding, - strides=strides, - output_padding=output_padding, - ) - f = relay.Function([x, kernel], out) - return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] - - run_and_verify(test_float_const()) - run_and_verify(test_multiple_outputs()) - run_and_verify(test_clip()) - run_and_verify(test_leaky_relu()) - 
run_and_verify(test_batch_norm((1, 64, 56, 56), (64,)))
-    run_and_verify(test_batch_norm((1, 56, 56, 64), (64,), axis=3, epsilon=1.001e-05))
-    run_and_verify(test_softmax((1, 1000), axis=1))
-    run_and_verify(test_softmax((1, 1000), axis=-1))
-    run_and_verify(test_softmax((1, 3, 4), axis=-2))
-    run_and_verify(test_softmax((1, 3, 4), axis=1))
-    for k_shape, groups in [((16, 32, 3, 3), 1), ((32, 1, 3, 3), 32)]:
-        for padding in [(0, 0), (1, 1)]:
-            for strides in [(1, 1), (2, 2)]:
-                for dilation in [(1, 1), (2, 2)]:
-                    run_and_verify(
-                        test_conv2d(
-                            k_shape=k_shape,
-                            groups=groups,
-                            padding=padding,
-                            strides=strides,
-                            dilation=dilation,
-                        )
-                    )
-    # Disabled due to incorrect results from TVM.
-    run_and_verify(test_conv2d_const_weights())
-    run_and_verify(test_dense())
-    run_and_verify(test_dense_from_pytorch())
-    run_and_verify(test_bias_add())
-    run_and_verify(test_bias_add((1, 6, 3, 4), 6))
-    for op in [relay.add, relay.subtract, relay.multiply, relay.divide, relay.power]:
-        # Disabled y_is_const=True due to incorrect results from TVM.
-        for y_is_const in [True, False]:
-            run_and_verify(test_binary(op, (1, 8, 3, 3), (1, 8, 3, 3), y_is_const))
-            run_and_verify(test_binary(op, (1, 8, 1, 3), (1, 8, 3, 1), y_is_const))
-            run_and_verify(test_binary(op, (1, 10), (10,), y_is_const))
-            run_and_verify(test_binary(op, (1, 1, 1, 10), (10,), y_is_const))
-            run_and_verify(test_binary(op, (1, 1, 1), (3,), y_is_const))
-    for pool_size in [(2, 2), (3, 3)]:
-        for strides in [(1, 1), (2, 2)]:
-            for padding in [(0, 0), (1, 1), (0, 0, 1, 1)]:
-                for ceil_mode in [False, True]:
-                    # Skip "the padding size is larger than or equal to the filter size for exclusive-counting pooling"
-                    if pool_size == (2, 2) and padding == (0, 0, 1, 1):
-                        continue
-                    for count_include_pad in [False, True]:
-                        # Skip "inclusive-counted blended or average pooling is not supported in combination with asymmetric padding"
-                        if count_include_pad and (padding == (0, 0, 1, 1) or strides == (2, 2)):
-                            continue
-                        run_and_verify(
-                            test_pool2d(
-                                relay.nn.avg_pool2d,
-                                pool_size=pool_size,
-                                strides=strides,
-                                padding=padding,
-                                ceil_mode=ceil_mode,
-                                count_include_pad=count_include_pad,
-                            )
-                        )
-                    run_and_verify(
-                        test_pool2d(
-                            relay.nn.max_pool2d,
-                            pool_size=pool_size,
-                            strides=strides,
-                            padding=padding,
-                            ceil_mode=ceil_mode,
-                        )
-                    )
-    for op in [relay.nn.global_max_pool2d, relay.nn.global_avg_pool2d]:
-        run_and_verify(test_global_pool2d(op))
-    for op in [
-        relay.nn.relu,
-        relay.sigmoid,
-        relay.tanh,
-        relay.exp,
-        relay.log,
-        relay.sqrt,
-        relay.abs,
-        relay.negative,
-        relay.sin,
-        relay.cos,
-        relay.atan,
-        relay.ceil,
-        relay.floor,
-    ]:
-        run_and_verify(test_unary(op))
-    run_and_verify(test_batch_flatten())
-    run_and_verify(test_expand_dims())
-    run_and_verify(test_squeeze((1, 5, 1, 1), (2, 3)))
-    run_and_verify(test_squeeze((1, 3, 1), (-1,)))
-    run_and_verify(test_concatenate([(1, 2, 6, 6), (1, 3, 6, 6)], axis=1))
-    for padding in [(0, 0), (1, 1)]:
-        for strides in [(1, 1), (2, 2)]:
-            run_and_verify(test_conv2d_transpose(padding=padding, strides=strides))
-    run_and_verify(test_transpose((1, 16, 7, 7), [0, 2, 3, 1]))
-    run_and_verify(test_transpose((1, 7, 7, 16), [0, 3, 1, 2]))
-    run_and_verify(test_transpose_weights_conv2d())
-    run_and_verify(test_transpose_weights_conv2d((1, 32, 9, 9), (3, 3, 16, 32), (2, 3, 0, 1)))
-    run_and_verify(test_transpose_weights_dense())
-    run_and_verify(test_reshape((1, 1, 1, 10), (-1, 10)))
-    run_and_verify(test_reshape((1, 10, 2, 3), (1, -1)))
-    run_and_verify(test_reshape((1, 1, 2, 3), (1, 6)))
-
run_and_verify(test_pad((1, 64, 56, 56), [[0, 0], [0, 0], [0, 0], [0, 0]])) - run_and_verify(test_pad((1, 64, 56, 56), [[0, 0], [0, 0], [1, 1], [1, 1]])) - run_and_verify(test_pad((1, 56, 56, 64), [[0, 0], [1, 1], [1, 1], [0, 0]])) - for op in [relay.sum, relay.prod, relay.max, relay.min, relay.mean]: - for keepdims in [True, False]: - run_and_verify(test_reduce(op, axis=(1), keepdims=keepdims)) - run_and_verify(test_reduce(op, axis=(2, 3), keepdims=keepdims)) - run_and_verify(test_reduce(op, axis=(1, 2), keepdims=keepdims)) - run_and_verify(test_reduce(op, axis=(1, 2, 3), keepdims=keepdims)) - run_and_verify(test_strided_slice((1, 3, 6, 7), (0, 0, 0, 0), (1, 1, 6, 7))) - run_and_verify(test_strided_slice((1, 3, 6, 7), (0, 1, 0, 0), (1, 2, 6, 6))) - run_and_verify(test_strided_slice((1, 10), (0, 0), (1, 10), (1, 2))) - for op in [relay.nn.adaptive_max_pool2d, relay.nn.adaptive_avg_pool2d]: - run_and_verify(test_adaptive_pool2d(op)) - # for x_shape, layout in [((1, 3, 16, 16), 'NCHW'), ((1, 16, 16, 3), 'NHWC')]: - # for out_size in [(32, 32), (40, 40), (5, 21)]: - # for method in ['nearest_neighbor', 'bilinear']: - # for coordinate_transformation_mode in ['asymmetric']: - # # TODO(trevmorr): 'align_corners' gives incorrect results. 'half_pixel' not supported? - # run_and_verify(test_resize(x_shape, out_size, layout, method, coordinate_transformation_mode)) - run_and_verify(test_conv3d()) - run_and_verify(test_conv3d(padding=(0, 0, 0, 1, 1, 1))) - run_and_verify(test_pool3d(relay.nn.avg_pool3d)) - run_and_verify(test_pool3d(relay.nn.max_pool3d)) - run_and_verify(test_pool3d(relay.nn.max_pool3d, padding=(0, 0, 0, 1, 1, 1))) - run_and_verify(test_pool3d(relay.nn.max_pool3d, strides=(1, 1, 1))) - run_and_verify(test_conv3d_transpose()) - - -def test_tensorrt_integration(test_all_models=False): - if should_skip(): - return - - def test_model(model, mode, i_data, input_shape, dtype, use_trt=True): - from mxnet.gluon.model_zoo.vision import get_model - - assert mode in ["graph", "vm"] - - def check_trt_used(mod): - num_trt_subgraphs = sum( - [1 if gv.name_hint == "tensorrt_0" else 0 for gv in mod.get_global_vars()] - ) - assert num_trt_subgraphs == 1 - - block = get_model(model, pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) - - if use_trt: - mod = relay.tensorrt.EnableTrt(mod, params) - check_trt_used(mod) - - with relay.build_config(opt_level=3): - exec = relay.create_executor(mode, mod=mod, ctx=tvm.cpu(0), target="llvm") - - res = exec.evaluate()(i_data, **params) - return res - - models = [ - "alexnet", - "resnet18_v1", - "resnet18_v2", - "squeezenet1.0", - "mobilenet0.25", - "mobilenetv2_0.25", - "vgg11", - "densenet121", - ] - additional_models = [ - "resnet34_v1", - "resnet50_v1", - "resnet101_v1", - "resnet152_v1", - "resnet34_v2", - "resnet50_v2", - "resnet101_v2", - "resnet152_v2", - "mobilenet0.5", - "mobilenet0.75", - "mobilenet1.0", - "mobilenetv2_0.5", - "mobilenetv2_0.75", - "mobilenetv2_1.0", - "vgg16", - "densenet169", - "densenet201", - ] - - if test_all_models: - models.extend(additional_models) - - dtype = "float32" - input_shape = (1, 3, 224, 224) - i_data = np.random.uniform(-1, 1, input_shape).astype(dtype) - - results = dict() - for model in models: - print("Testing model : {}".format(model)) - for mode in ["vm", "graph"]: - for use_trt in [True, False]: - result_key = mode + ("_trt" if use_trt else "") - results[result_key] = test_model( - model, mode, i_data, input_shape, dtype, use_trt=use_trt - ) - - 
assert_result_matches(results["vm_trt"], results["vm"]) - assert_result_matches(results["graph_trt"], results["graph"]) - assert_result_matches(results["graph_trt"], results["vm_trt"]) - - -def test_tensorrt_serialize(data_shape=(1, 3, 224, 224)): - if should_skip(): - return - - from mxnet.gluon.model_zoo.vision import get_model - - i_data = np.random.uniform(0, 1, data_shape).astype("float32") - block = get_model("resnet18_v1", pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"data": data_shape}, dtype="float32") - mod = relay.tensorrt.EnableTrt(mod, params) - - def compile_vm(mod, params): - with relay.build_config(opt_level=3): - vm_exec = relay.vm.compile(mod, target="llvm", params=params) - code, lib = vm_exec.save() - return code, lib - - def run_vm(code, lib): - vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib) - vm = VirtualMachine(vm_exec, tvm.cpu(0)) - result = vm.invoke("main", data=i_data) - return result - - def save_vm(code, lib): - # save and load the code and lib file. - lib.export_library("path_lib.so") - with open("path_code.ro", "wb") as fo: - fo.write(code) - - def load_vm(): - lib = tvm.runtime.load_module("path_lib.so") - code = bytearray(open("path_code.ro", "rb").read()) - return lib, code - - def compile_graph(mod, params): - with relay.build_config(opt_level=3): - graph, lib, params = relay.build(mod, params=params, target="cuda") - params = relay.save_param_dict(params) - return graph, lib, params - - def run_graph(graph, lib, params): - mod_ = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) - mod_.load_params(params) - mod_.run(data=i_data) - res = mod_.get_output(0) - return res - - def save_graph(graph, lib, params): - # Serialize - with open("compiled.json", "w") as f_graph_json: - f_graph_json.write(graph) - with open("compiled.params", "wb") as f_params: - f_params.write(params) - lib.export_library("compiled.so") - - def load_graph(): - # Deserialize - with open("compiled.json", "r") as f_graph_json: - graph = f_graph_json.read() - with open("compiled.params", "rb") as f_params: - params = bytearray(f_params.read()) - lib = tvm.runtime.load_module("compiled.so") - return graph, lib, params - - # Test serialization with graph runtime and check if the results match - graph, lib, graph_params = compile_graph(mod, params) - save_graph(graph, lib, graph_params) - loaded_graph, loaded_lib, loaded_params = load_graph() - - ref_res_graph = run_graph(graph, lib, graph_params) - res_graph_serialized = run_graph(loaded_graph, loaded_lib, loaded_params) - assert_result_matches(res_graph_serialized, ref_res_graph) - - # Test serialization with VM and check if the results match - code, lib = compile_vm(mod, params) - save_vm(code, lib) - loaded_lib, loaded_code = load_vm() - - ref_res_vm = run_vm(code, lib) - res_vm_serialized = run_vm(loaded_code, loaded_lib) - assert_result_matches(res_vm_serialized, ref_res_vm) - - # Finally check accuracy between VM and graph - assert_result_matches(res_vm_serialized, res_graph_serialized) - - -def test_tensorrt_dynamic_batch(): - if should_skip(): - return - - batches_to_test = [1, 1, 2, 3, 1, 3, 2] - x_shape = (relay.Any(), 1, 8, 8) - x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32") - result_dict = {} - for use_trt in [True, False]: - x = relay.var("x", shape=x_shape, dtype="float32") - out = relay.nn.relu(x) - f = relay.Function([x], out) - mod = tvm.IRModule() - mod["main"] = f - if use_trt: - mod = relay.tensorrt.EnableTrt(mod) - with 
relay.build_config(opt_level=3): - relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") - - for i, batch_size in enumerate(batches_to_test): - result_dict[(i, use_trt)] = relay_exec.evaluate()(x_data[:batch_size, ...]) - - for i in range(len(batches_to_test)): - assert_result_matches(result_dict[(i, True)], result_dict[(i, False)]) - - -def test_tensorrt_dynamic_batch_conv(): - if should_skip(): - return - batches_to_test = [1, 1, 2, 3, 1, 3, 2] - x_shape = (relay.Any(), 32, 8, 8) - x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32") - k_shape = (16, 32, 3, 3) - params = {"kernel": np.random.uniform(-1, 1, k_shape).astype("float32")} - result_dict = {} - for use_trt in [True, False]: - x = relay.var("x", shape=x_shape, dtype="float32") - kernel = relay.var("kernel", shape=k_shape, dtype="float32") - out = relay.nn.conv2d(x, kernel, channels=16, kernel_size=(3, 3), groups=1) - f = relay.Function([x, kernel], out) - mod = tvm.IRModule() - mod["main"] = f - if use_trt: - mod = relay.tensorrt.EnableTrt(mod, params) - with relay.build_config(opt_level=3): - relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") - - for i, batch_size in enumerate(batches_to_test): - result_dict[(i, use_trt)] = relay_exec.evaluate()(x=x_data[:batch_size, ...], **params) - - for i in range(len(batches_to_test)): - assert_result_matches(result_dict[(i, True)], result_dict[(i, False)]) - - -if __name__ == "__main__": - test_tensorrt_ops() - test_tensorrt_simple() - test_tensorrt_simple_cpu_io() - test_tensorrt_not_compatible() - test_tensorrt_integration() - test_tensorrt_serialize() - test_tensorrt_dynamic_batch() - test_tensorrt_dynamic_batch_conv() From 11bad5182d6c50527b91f5b28a2312bdd3db6502 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Tue, 13 Oct 2020 17:23:23 -0700 Subject: [PATCH 002/258] Revert #5238 (#6680) --- nnvm/src/core/graph.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nnvm/src/core/graph.cc b/nnvm/src/core/graph.cc index fd5b64f4777d..e5042802906c 100644 --- a/nnvm/src/core/graph.cc +++ b/nnvm/src/core/graph.cc @@ -96,7 +96,7 @@ IndexedGraph::IndexedGraph(const Graph& g) { // input entries for (const auto& e : n->inputs) { auto it = node2index_.find(e.node.get()); - if (it == node2index_.end() || it->first != e.node.get()) continue; + CHECK(it != node2index_.end() && it->first == e.node.get()); input_entries_.emplace_back(NodeEntry{it->second, e.index, e.version}); } inputs_rptr.push_back(input_entries_.size()); From f8b3aa0d1ef33c0b3f2087cd5029eb8a3e83948e Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Oct 2020 04:13:40 -0700 Subject: [PATCH 003/258] [AutoScheduler] Fix a bug in thread binding (#6683) * fix for lstm use case * update --- src/auto_scheduler/search_policy/sketch_policy_rules.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index 045ee860f1f1..99188d4a6292 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -702,11 +702,14 @@ PopulationGenerationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const { + // Collect all stages that are roots of stages that perform multi-level tiling. 
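+  // (Stages inlined into their consumer are skipped below: they have no
+  // loop nest of their own to bind threads to.)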
std::set<int> multi_level_tiling_root_set;
   for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) {
     if (NeedsMultilevelTiling(policy->search_task, *state, stage_id)) {
       const Stage& stage = (*state)->stages[stage_id];
-      if (stage->compute_at != ComputeAtKind::kIter) {
+      if (stage->compute_at == ComputeAtKind::kInlined) {
+        continue;
+      } else if (stage->compute_at != ComputeAtKind::kIter) {
         // This stage is not multi-level tiled,
         // so it must be produced by RuleCrossThreadReduction.
         CHECK(HasCrossThreadReduction(*state, stage_id));

From 32bcf45a09e6be78de609a3abd5a124a15526ee7 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Wed, 14 Oct 2020 08:33:15 -0400
Subject: [PATCH 004/258] [ARITH] Introduce iterator (quasi)affine map detection. (#6667)

* [ARITH] Introduce iterator (quasi)affine map detection.

The loop transformations (split, fuse) create bijective
maps from a collection of source iterators to target iterators.

DetectIterMap is a function that detects such bijective
mappings from the lowered index expression.

We choose the term quasi affine to be consistent with
the terminology used in polyhedral compilation.

DetectIterMap can handle symbolic integers (in split/fuse)
to some extent.

The utility can be useful in detecting loop transformation
patterns and data layout change patterns in TIR.

* Update per feedback
---
 include/tvm/arith/iter_affine_map.h           | 277 +++++++
 python/tvm/arith/__init__.py                  |   2 +
 python/tvm/arith/iter_affine_map.py           | 108 +++
 src/arith/iter_affine_map.cc                  | 717 ++++++++++++++++++
 src/arith/rewrite_simplify.cc                 |   3 +
 src/node/structural_hash.cc                   |  18 +-
 src/support/util.h                            |  10 +
 .../unittest/test_arith_iter_affine_map.py    | 176 +++++
 8 files changed, 1298 insertions(+), 13 deletions(-)
 create mode 100644 include/tvm/arith/iter_affine_map.h
 create mode 100644 python/tvm/arith/iter_affine_map.py
 create mode 100644 src/arith/iter_affine_map.cc
 create mode 100644 tests/python/unittest/test_arith_iter_affine_map.py

diff --git a/include/tvm/arith/iter_affine_map.h b/include/tvm/arith/iter_affine_map.h
new file mode 100644
index 000000000000..00f8cf6ee9f0
--- /dev/null
+++ b/include/tvm/arith/iter_affine_map.h
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/arith/iter_affine_map.h
+ * \brief Iterator quasi-affine mapping patterns.
+ *
+ * This file defines a collection of mapping patterns that
+ * maps a collection of independent iterators to another
+ * collection of independent iterators.
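+ *
+ * Such mappings arise from loop transformations: split and fuse create
+ * bijective maps from source iterators to target iterators, and
+ * DetectIterMap (declared below) recovers such maps from lowered index
+ * expressions.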
+ * + * There are two main kinds of mapping patterns: + * + * - Fuse: fuse a collection of iterators into a single one + * + * domain(x0) = [0, 4), domain(x1) = [0, 3), domain(x2) = [0, 2) + * fuse(x0, x1, x2): y = x2 * 12 + x1 * 4 + x0 + * domain(y) = [0, 24) + * + * - Split: split an iterator into multiple ones + * + * domain(x) = [0, 24) + * split(x, 3, 12): [y0, y1, y2] = [x % 3, (x % 12) / 3, x / 12] + * domain(y0) = [0, 3), domain(y1) = [0, 4), domain(y2) = [0, 2) + * + * We use the name "(quasi)affine" to be consistent with + * the terminology used in the polyhedral compilation. + * Notably, fuse is an affine transformation, + * while split corresponds to additional floordiv/mod operations + * that can appear in quasi-affine transformations. + */ +#ifndef TVM_ARITH_ITER_AFFINE_MAP_H_ +#define TVM_ARITH_ITER_AFFINE_MAP_H_ + +#include + +namespace tvm { +namespace arith { + +/*! + * \brief Base class of all iter map expressions. + * + * An IterMapExpr is a special expression to store + * the result of IterMapDetection. + * It should not appear in a legal TIR PrimFunc. + */ +class IterMapExprNode : public PrimExprNode { + public: + // overrides + void VisitAttrs(tvm::AttrVisitor* v) {} + + static constexpr const char* _type_key = "arith.IterMapExpr"; + static constexpr const uint32_t _type_child_slots = 3; + TVM_DECLARE_BASE_OBJECT_INFO(IterMapExprNode, PrimExprNode); +}; + +/*! + * \brief Managed reference to IterMapExprNode. + * \sa IterMapExprNode + */ +class IterMapExpr : public PrimExpr { + public: + TVM_DEFINE_OBJECT_REF_METHODS(IterMapExpr, PrimExpr, IterMapExprNode); +}; + +/*! + * \brief Mark the source as an iterator in [0, extent). + * + * IterMark is used to mark source expression as a valid + * iterator to make future analysis easy. + */ +class IterMarkNode : public Object { + public: + /*! + * \brief The source expression, can either be + * a IterSumExpr or a Var. + */ + PrimExpr source; + /*! + * \brief The extent of the iteration. + */ + PrimExpr extent; + + // overrides + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("source", &source); + v->Visit("extent", &extent); + } + + bool SEqualReduce(const IterMarkNode* other, SEqualReducer equal) const { + equal->MarkGraphNode(); + return equal(source, other->source) && equal(extent, other->extent); + } + + void SHashReduce(SHashReducer hash_reduce) const { + hash_reduce->MarkGraphNode(); + hash_reduce(source); + hash_reduce(extent); + } + + static constexpr const bool _type_has_method_sequal_reduce = true; + static constexpr const bool _type_has_method_shash_reduce = true; + static constexpr const char* _type_key = "arith.IterMark"; + TVM_DECLARE_FINAL_OBJECT_INFO(IterMarkNode, Object); +}; + +/*! + * \brief Managed reference to IterMarkExprNode. + * \sa IterMarkExprNode + */ +class IterMark : public ObjectRef { + public: + /*! + * \brief constructor. + * \param source The source expression. + * \param extent The extent of the iterator. + */ + TVM_DLL IterMark(PrimExpr source, PrimExpr extent); + + TVM_DEFINE_OBJECT_REF_METHODS(IterMark, ObjectRef, IterMarkNode); +}; + +/*! + * \brief Split of an iterator. + * + * result = floormod(floordiv(source, lower_factor), extent) * scale + */ +class IterSplitExprNode : public IterMapExprNode { + public: + /*! \brief The source marked iterator. */ + IterMark source; + /*! \brief The lower factor to split the source. */ + PrimExpr lower_factor; + /*! \brief The extent of the split. */ + PrimExpr extent; + /*! \brief Additional scale. 
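+   *        For example, 2 * floormod(x, 3) is a split with extent 3
+   *        and scale 2.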
*/ + PrimExpr scale; + + // overrides + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("source", &source); + v->Visit("lower_factor", &lower_factor); + v->Visit("extent", &extent); + v->Visit("scale", &scale); + } + + bool SEqualReduce(const IterSplitExprNode* other, SEqualReducer equal) const { + return equal(source, other->source) && equal(lower_factor, other->lower_factor) && + equal(extent, other->extent) && equal(scale, other->scale); + } + + void SHashReduce(SHashReducer hash_reduce) const { + hash_reduce(source); + hash_reduce(lower_factor); + hash_reduce(extent); + hash_reduce(scale); + } + + static constexpr const char* _type_key = "arith.IterSplitExpr"; + TVM_DECLARE_FINAL_OBJECT_INFO(IterSplitExprNode, IterMapExprNode); +}; + +/*! + * \brief Managed reference to IterSplitExprNode. + * \sa IterSplitExprNode + */ +class IterSplitExpr : public IterMapExpr { + public: + /*! + * \brief constructor from just source. + * \param source The source expression. + */ + TVM_DLL explicit IterSplitExpr(IterMark source); + /*! + * \brief constructor + * \param source The source expression. + * \param lower_factor The lower factor to split the source. + * \param extent The extent of the split. + * \param scale The additional scaling factor. + */ + TVM_DLL explicit IterSplitExpr(IterMark source, PrimExpr lower_factor, PrimExpr extent, + PrimExpr scale); + + TVM_DEFINE_OBJECT_REF_METHODS(IterSplitExpr, IterMapExpr, IterSplitExprNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(IterSplitExprNode); +}; + +/*! + * \brief Fuse multiple iterators by summing them with scaling. + * + * result = sum(args) + base + */ +class IterSumExprNode : public IterMapExprNode { + public: + /*! \brief The args to the sum. */ + Array args; + /*! \brief The base offset. */ + PrimExpr base; + + // overrides + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("args", &args); + v->Visit("base", &base); + } + + bool SEqualReduce(const IterSumExprNode* other, SEqualReducer equal) const { + return equal(args, other->args) && equal(base, other->base); + } + + void SHashReduce(SHashReducer hash_reduce) const { + hash_reduce(args); + hash_reduce(base); + } + + static constexpr const char* _type_key = "arith.IterSumExpr"; + TVM_DECLARE_FINAL_OBJECT_INFO(IterSumExprNode, IterMapExprNode); +}; + +/*! + * \brief Managed reference to IterSumExprNode. + * \sa IterSumExprNode + */ +class IterSumExpr : public IterMapExpr { + public: + /*! + * \brief constructor. + * \param args The args to the sum. + * \param base The base offset. + */ + TVM_DLL IterSumExpr(Array args, PrimExpr base); + + TVM_DEFINE_OBJECT_REF_METHODS(IterSumExpr, IterMapExpr, IterSumExprNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(IterSumExprNode); +}; + +/*! + * \brief Detect if indices can be written as + * + * [y_0 + c_0, y_1 + c_1, ..., y_n + c_n] + * + * Here y = some-quasi-affine-iter-map(input_iters) + * and c are symbolic constants. + * + * We also requires that y_i and y_j to be independent for i != j. + * + * For returned value rv, the following is always true: + * - rv[i]->args.size() <=1: only one iterator per element. + * + * \param indices The indices to detect pattern for. + * \param input_iters Map from variable to iterator's range. + * \param analyzer Analyzer used to get context information. + * + * \return The detected pattern if a match exists, + * otherwise return an empty array. 
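+ *
+ * For example, given x in [0, 8), the indices
+ * [floordiv(x, 4), floormod(x, 4)] are detected as two non-overlapping
+ * splits that cover x, while [floordiv(x, 4), floordiv(x, 4)] is
+ * rejected because the two splits overlap and the mapping is not
+ * bijective.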
+ */
+Array<IterSumExpr> DetectIterMap(const Array<PrimExpr>& indices,
+                                 const Map<Var, Range>& input_iters,
+                                 arith::Analyzer* analyzer);
+
+} // namespace arith
+} // namespace tvm
+#endif // TVM_ARITH_ITER_AFFINE_MAP_H_
diff --git a/python/tvm/arith/__init__.py b/python/tvm/arith/__init__.py
index e5af52938f5c..77ec869a171e 100644
--- a/python/tvm/arith/__init__.py
+++ b/python/tvm/arith/__init__.py
@@ -21,3 +21,5 @@
 from .bound import deduce_bound
 from .pattern import detect_linear_equation, detect_clip_bound
 from .int_solver import solve_linear_equations, solve_linear_inequalities
+from .iter_affine_map import IterMapExpr, IterMark, IterSplitExpr, IterSumExpr
+from .iter_affine_map import detect_iter_map
diff --git a/python/tvm/arith/iter_affine_map.py b/python/tvm/arith/iter_affine_map.py
new file mode 100644
index 000000000000..123d9b85480a
--- /dev/null
+++ b/python/tvm/arith/iter_affine_map.py
@@ -0,0 +1,108 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" Iterator (quasi)affine mapping patterns."""
+import tvm._ffi
+from tvm.runtime import Object
+from tvm.ir import PrimExpr
+from . import _ffi_api
+
+
+class IterMapExpr(PrimExpr):
+    """Base class of all IterMap expressions."""
+
+
+@tvm._ffi.register_object("arith.IterMark")
+class IterMark(Object):
+    """Mark the source as an iterator in [0, extent).
+
+    Parameters
+    ----------
+    source : PrimExpr
+        The source expression.
+
+    extent : PrimExpr
+        The extent of the iterator.
+    """
+
+    def __init__(self, source, extent):
+        self.__init_handle_by_constructor__(_ffi_api.IterMark, source, extent)
+
+
+@tvm._ffi.register_object("arith.IterSplitExpr")
+class IterSplitExpr(IterMapExpr):
+    """Split of an iterator.
+
+    result = floormod(floordiv(source, lower_factor), extent) * scale
+
+    Parameters
+    ----------
+    source : IterMark
+        The source marked iterator.
+
+    lower_factor : PrimExpr
+        The lower factor to split the domain.
+
+    extent : PrimExpr
+        The extent of the split.
+
+    scale : PrimExpr
+        Additional scale to the split.
+    """
+
+    def __init__(self, source, lower_factor, extent, scale):
+        self.__init_handle_by_constructor__(
+            _ffi_api.IterSplitExpr, source, lower_factor, extent, scale
+        )
+
+
+@tvm._ffi.register_object("arith.IterSumExpr")
+class IterSumExpr(IterMapExpr):
+    """Fuse multiple iterators by summing them with scaling.
+
+    result = sum(args) + base
+
+    Parameters
+    ----------
+    args : List[IterSplitExpr]
+        The input to the sum expression.
+
+    base : PrimExpr
+        The base offset.
+    """
+
+    def __init__(self, args, base):
+        self.__init_handle_by_constructor__(_ffi_api.IterSumExpr, args, base)
+
+
+def detect_iter_map(indices, input_iters):
+    """Detect if indices can be written as mapped iters from input_iters.
+
+    Parameters
+    ----------
+    indices : List[PrimExpr]
+        The input indices.
+ + input_iters : Map[Var, Range] + The domain of each input iterators. + + Returns + ------- + results : List[IterSumExpr] + The iter map matching result. + Empty array if no match can be found. + """ + return _ffi_api.DetectIterMap(indices, input_iters) diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc new file mode 100644 index 000000000000..7afa75a7efb0 --- /dev/null +++ b/src/arith/iter_affine_map.cc @@ -0,0 +1,717 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/arith/iter_affine_map.cc + */ +#include +#include +#include +#include +#include +#include + +#include "../support/util.h" +#include "const_fold.h" + +namespace tvm { +namespace arith { + +using namespace tir; + +IterMark::IterMark(PrimExpr source, PrimExpr extent) { + auto n = make_object(); + n->source = std::move(source); + n->extent = std::move(extent); + data_ = std::move(n); +} + +TVM_REGISTER_GLOBAL("arith.IterMark").set_body_typed([](PrimExpr source, PrimExpr extent) { + return IterMark(source, extent); +}); + +TVM_REGISTER_NODE_TYPE(IterMarkNode); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->stream << "IterMark(" << op->source << ", extent=" << op->extent; + }); + +IterSplitExpr::IterSplitExpr(IterMark source) { + auto n = make_object(); + auto one = make_const(source->source->dtype, 1); + n->dtype = source->source->dtype; + n->source = std::move(source); + n->extent = n->source->extent; + n->lower_factor = one; + n->scale = one; + data_ = std::move(n); +} + +IterSplitExpr::IterSplitExpr(IterMark source, PrimExpr lower_factor, PrimExpr extent, + PrimExpr scale) { + auto n = make_object(); + n->dtype = source->source->dtype; + n->source = std::move(source); + n->lower_factor = std::move(lower_factor); + n->extent = std::move(extent); + n->scale = std::move(scale); + data_ = std::move(n); +} + +TVM_REGISTER_GLOBAL("arith.IterSplitExpr") + .set_body_typed([](IterMark source, PrimExpr lower_factor, PrimExpr extent, PrimExpr scale) { + return IterSplitExpr(source, lower_factor, extent, scale); + }); + +TVM_REGISTER_NODE_TYPE(IterSplitExprNode); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->stream << "IterSplit(" << op->source << ", lower_factor=" << op->lower_factor + << ", extent=" << op->extent << ", scale=" << op->scale; + }); + +IterSumExpr::IterSumExpr(Array args, PrimExpr base) { + auto n = make_object(); + n->dtype = base->dtype; + n->args = std::move(args); + n->base = std::move(base); + data_ = std::move(n); +} + +TVM_REGISTER_GLOBAL("arith.IterSumExpr") + .set_body_typed([](Array args, PrimExpr base) { + 
return IterSumExpr(args, base); + }); + +TVM_REGISTER_NODE_TYPE(IterSumExprNode); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->stream << "IterSum(" << op->args << ", " << op->base << ")"; + }); + +/*! + * \brief Collector that collects + * the outgoing split reference of each IterMark. + * + * These out-going splits can then be used to + * check if the iterators are independent. + */ +class IterMarkSplitCollector { + public: + // mark all IterMarks that are visited. + std::unordered_set visited_; + // each iter mark to its outgoing splits that are referenced. + std::unordered_map, ObjectPtrHash, ObjectPtrEqual> + mark2splits_; + /*! + * \brief Collect all mark2splits recursively from indices. + * \param indices The iterator of interest. + */ + void Collect(const Array& indices) { + for (IterSumExpr sum_expr : indices) { + for (IterSplitExpr split : sum_expr->args) { + this->CollectInternal(split->source); + mark2splits_[split->source].push_back(split); + } + } + } + + void CollectInternal(const IterMark& mark) { + if (visited_.count(mark)) return; + visited_.insert(mark); + if (auto* op = mark->source.as()) { + for (IterSplitExpr split : op->args) { + this->CollectInternal(split->source); + mark2splits_[split->source].push_back(split); + } + } + } +}; + +// Rewriter to rewrite PrimExpr to IterMapExpr +// when possible +class IterMapRewriter : public ExprMutator { + public: + using Parent = ExprMutator; + + explicit IterMapRewriter(Analyzer* analyzer, const Map& input_iters) + : analyzer_(analyzer) { + for (auto kv : input_iters) { + const auto& vrng = kv.second; + if (is_zero(vrng->min)) { + IterMark mark(kv.first, vrng->extent); + var_map_[kv.first] = IterSplitExpr(mark); + input_marks_.push_back(mark); + } else { + IterMark mark(kv.first - vrng->min, vrng->extent); + auto sum_expr = ToIterSumExpr(IterSplitExpr(mark)); + sum_expr.CopyOnWrite()->base = vrng->min; + var_map_[kv.first] = sum_expr; + input_marks_.push_back(mark); + } + } + } + + size_t unresolved_count() const { return unresolved_count_; } + + IterSumExpr Rewrite(PrimExpr expr) { + return NormalizeToIterWithOffset(ToIterSumExpr(DirectMutate(expr))); + } + + bool CheckBijective(const Array& indices) { + // This function checks two conditions: + // - C0: Each iter mark should be fully covered by non-overlapping splits. + // - C1: All of the input iterators are used. + // + // Example: given x in [0, 8) y in [0, 6) + // - indices = [x, x+1, y] won't pass because x and x+1 contribute + // two splits that overlaps with each other. + // - indices = [x / 4, x % 4, y] will pass because x / 4 and x % 4 + // contribute two non-overlapping splits that covers x. + // - indices = [x / 4, x % 4] won't pass because y is not used. + // + IterMarkSplitCollector collector; + // We can check that for each iter mark: + // All the splits that refers to the itermark covers its extent. + // The splits do not overlap with each other. + collector.Collect(indices); + for (IterMark mark : collector.visited_) { + if (TryNormalizeSplits(mark, collector.mark2splits_[mark]).size() == 0) return false; + } + // all input marks must be visited + for (auto mark : input_marks_) { + if (collector.visited_.count(mark) == 0) return false; + } + return true; + } + + // override the original mutate function. 
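+  // This default path is taken by sub-expressions that have no dedicated
+  // rewrite rule below; if an IterMapExpr survives inside such an
+  // expression, the indices cannot form a clean iterator map, so it is
+  // counted as unresolved.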
+  PrimExpr VisitExpr(const PrimExpr& input_expr) final {
+    auto expr = ExprMutator::VisitExpr(input_expr);
+    if (expr->IsInstance<IterMapExprNode>()) {
+      ++unresolved_count_;
+    }
+    return expr;
+  }
+
+  // Normal mutation without normalization.
+  PrimExpr DirectMutate(PrimExpr expr) { return ExprMutator::VisitExpr(expr); }
+
+  PrimExpr VisitExpr_(const VarNode* op) final;
+  PrimExpr VisitExpr_(const AddNode* op) final;
+  PrimExpr VisitExpr_(const SubNode* op) final;
+  PrimExpr VisitExpr_(const MulNode* op) final;
+  PrimExpr VisitExpr_(const FloorDivNode* op) final;
+  PrimExpr VisitExpr_(const FloorModNode* op) final;
+
+ private:
+  // temp hash for de-duplication purposes.
+  struct IterSumHash {
+    size_t operator()(const IterSumExpr& value) const {
+      // for now only hash on source index.
+      size_t hash = value->args.size();
+      for (size_t i = 0; i < value->args.size(); ++i) {
+        hash = support::HashCombine(hash, std::hash<const Object*>()(value->args[i]->source.get()));
+      }
+      return hash;
+    }
+  };
+
+  struct IterSumEqual {
+    bool operator()(const IterSumExpr& lhs, const IterSumExpr& rhs) const {
+      tir::ExprDeepEqual equal;
+      if (lhs->args.size() != rhs->args.size()) return false;
+      if (!equal(lhs->base, rhs->base)) return false;
+      for (size_t i = 0; i < lhs->args.size(); ++i) {
+        auto lvalue = lhs->args[i];
+        auto rvalue = rhs->args[i];
+        if (!lvalue->source.same_as(rvalue->source)) return false;
+        if (!equal(lvalue->lower_factor, rvalue->lower_factor)) return false;
+        if (!equal(lvalue->scale, rvalue->scale)) return false;
+        if (!equal(lvalue->extent, rvalue->extent)) return false;
+      }
+      return true;
+    }
+  };
+
+  // Internal analyzer
+  Analyzer* analyzer_;
+  // Counter to keep track of unresolved cases.
+  int unresolved_count_{0};
+  // The var map
+  std::unordered_map<Var, PrimExpr, ObjectPtrHash, ObjectPtrEqual> var_map_;
+  // input iter marks
+  std::vector<IterMark> input_marks_;
+  // The canonical map for sum
+  std::unordered_map<IterSumExpr, IterSplitExpr, IterSumHash, IterSumEqual> sum_fuse_map_;
+
+  /*!
+   * \brief Verify that splits fully cover mark in a non-overlapping fashion.
+   *        If verification passes, return splits from outermost to innermost order.
+   *        If not, return an empty array.
+   * \param mark The iterator of interest.
+   * \param splits The splits to be verified.
+   * \return The normalized splits.
+   */
+  Array<IterSplitExpr> TryNormalizeSplits(const IterMark& mark,
+                                          const std::vector<IterSplitExpr>& splits) {
+    std::vector<bool> used(splits.size(), false);
+    std::vector<IterSplitExpr> iters;
+    PrimExpr expected_lower_factor = make_const(mark->source->dtype, 1);
+
+    for (size_t i = 0; i < splits.size(); ++i) {
+      size_t j = 0;
+      for (; j < splits.size(); ++j) {
+        if (used[j]) continue;
+        if (CanProveEqual(splits[j]->lower_factor, expected_lower_factor)) break;
+      }
+      if (j == splits.size()) {
+        return Array<IterSplitExpr>();
+      }
+      used[j] = true;
+      iters.push_back(splits[j]);
+      expected_lower_factor *= splits[j]->extent;
+    }
+    if (!CanProveEqual(expected_lower_factor, mark->extent)) return Array<IterSplitExpr>();
+    return Array<IterSplitExpr>(iters.rbegin(), iters.rend());
+  }
+
+  /*!
+   * \brief Normalize expr to an iterator + offset.
+   * \param expr The input expression.
+   * \return The normalized expression.
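+   *         On success the args are collapsed into a single fused
+   *         iterator; on failure the expression is returned unchanged
+   *         and counted as unresolved.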
+ */ + IterSumExpr NormalizeToIterWithOffset(IterSumExpr expr) { + if (expr->args.size() <= 1) return expr; + PrimExpr base = expr->base; + expr.CopyOnWrite()->base = make_zero(expr->dtype); + auto opt = TryFuseIters(expr); + expr.CopyOnWrite()->base = base; + if (opt) { + expr.CopyOnWrite()->args = Array({opt.value()}); + return expr; + } else { + ++unresolved_count_; + return expr; + } + } + + bool CanProveEqual(PrimExpr lhs, PrimExpr rhs) { + const auto* clhs = lhs.as(); + const auto* crhs = rhs.as(); + if (clhs && crhs) return clhs->value == crhs->value; + return analyzer_->CanProve(lhs - rhs == 0); + } + + /*! + * \brief Create a IterSumExpr from expr. + * \param expr The input expr. + * \return The transformed IterSumExpr. + */ + IterSumExpr ToIterSumExpr(PrimExpr expr) { + if (const auto* op = expr.as()) { + return GetRef(op); + } else if (const auto* op = expr.as()) { + return IterSumExpr({GetRef(op)}, make_zero(expr->dtype)); + } else { + CHECK(!expr->IsInstance()); + return IterSumExpr({}, expr); + } + } + + // Try to normalize IterSum into a fused IterMark + // return a corresponding splitexpr if needed. + Optional TryFuseIters(IterSumExpr expr) { + if (!is_zero(expr->base)) return NullOpt; + if (expr->args.size() == 1) return expr->args[0]; + // select the iterators in order + std::vector visited(expr->args.size(), false); + std::vector iters; + iters.reserve(expr->args.size()); + // canonicalize the expression + // check if it can be remapped into a fused pattern. + PrimExpr expected_scale = make_const(expr->base->dtype, 1); + for (size_t i = 0; i < expr->args.size(); ++i) { + size_t j = 0; + for (; j < expr->args.size(); ++j) { + if (!visited[j] && CanProveEqual(expr->args[j]->scale, expected_scale)) break; + } + if (j == expr->args.size()) { + return NullOpt; + } + visited[j] = true; + iters.push_back(expr->args[j]); + expected_scale *= expr->args[j]->extent; + } + // update the iterator to use the canonicalized form + expr.CopyOnWrite()->args = Array(iters.rbegin(), iters.rend()); + auto it = sum_fuse_map_.find(expr); + if (it != sum_fuse_map_.end()) return it->second; + auto mark = IterMark(expr, expected_scale); + IterSplitExpr split(mark); + sum_fuse_map_[expr] = split; + return split; + } + + bool CanProveDivisible(PrimExpr lhs, PrimExpr rhs) { + const auto* clhs = lhs.as(); + const auto* crhs = rhs.as(); + if (clhs && crhs) return clhs->value % crhs->value == 0; + return analyzer_->CanProve(floormod(lhs, rhs) == 0); + } + + PrimExpr SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs); + PrimExpr SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs); + + static void AddToLhs(IterSumExprNode* lhs, IterSplitExpr rhs, int sign) { + tir::ExprDeepEqual equal; + for (size_t i = 0; i < lhs->args.size(); ++i) { + IterSplitExpr lvalue = lhs->args[i]; + if (lvalue->source.same_as(rhs->source) && equal(lvalue->lower_factor, rhs->lower_factor) && + equal(lvalue->extent, rhs->extent)) { + if (sign > 0) { + rhs.CopyOnWrite()->scale = lvalue->scale + rhs->scale; + } else { + rhs.CopyOnWrite()->scale = lvalue->scale - rhs->scale; + } + lhs->args.Set(i, rhs); + return; + } + } + if (sign > 0) { + lhs->args.push_back(rhs); + } else { + rhs.CopyOnWrite()->scale = make_zero(rhs->scale.dtype()) - rhs->scale; + lhs->args.push_back(rhs); + } + } + + static void AddToLhs(IterSumExprNode* lhs, IterSumExpr rhs, int sign) { + for (size_t i = 0; i < rhs->args.size(); ++i) { + AddToLhs(lhs, rhs->args[i], sign); + } + if (sign > 0) { + lhs->base += rhs->base; + } else { + lhs->base -= rhs->base; + } + 
} + + static void MulToLhs(IterSumExprNode* lhs, PrimExpr rhs) { + for (size_t i = 0; i < lhs->args.size(); ++i) { + IterSplitExpr lvalue = lhs->args[i]; + lvalue.CopyOnWrite()->scale *= rhs; + lhs->args.Set(i, lvalue); + } + lhs->base *= rhs; + } +}; + +Array DetectIterMap(const Array& indices, const Map& input_iters, + arith::Analyzer* analyzer) { + // Overall detection algorithm is divided into two steps: + // - Step0: IterMapRewriter rewrites the expression to use IterMapExpr patterns. + // - Step1: IterIndependenceChecker checks if the iterator are independent. + IterMapRewriter rewriter(analyzer, input_iters); + Array results; + + for (PrimExpr value : indices) { + results.push_back(rewriter.Rewrite(value)); + if (rewriter.unresolved_count() != 0) return Array(); + } + if (!rewriter.CheckBijective(results)) return Array(); + + return results; +} + +TVM_REGISTER_GLOBAL("arith.DetectIterMap") + .set_body_typed([](const Array& indices, const Map& input_iters) { + arith::Analyzer ana; + return DetectIterMap(indices, input_iters, &ana); + }); + +PrimExpr IterMapRewriter::VisitExpr_(const VarNode* op) { + auto var = GetRef(op); + auto it = var_map_.find(var); + if (it != var_map_.end()) return it->second; + return std::move(var); +} + +PrimExpr IterMapRewriter::VisitExpr_(const AddNode* op) { + if (!IsIndexType(op->dtype)) { + return Parent::VisitExpr_(op); + } + + PrimExpr a = this->DirectMutate(op->a); + PrimExpr b = this->DirectMutate(op->b); + + // const folding + PrimExpr const_res = TryConstFold(a, b); + if (const_res.defined()) return const_res; + // does not contain iter map. + if (!a->IsInstance() && !b->IsInstance()) { + if (op->a.same_as(a) && op->b.same_as(b)) { + return GetRef(op); + } else { + return Add(a, b); + } + } + + // canonical form simplification. + IterSumExpr ret = ToIterSumExpr(std::move(a)); + + if (!b->IsInstance()) { + ret.CopyOnWrite()->base += b; + } else if (const auto* op = b.as()) { + AddToLhs(ret.CopyOnWrite(), GetRef(op), 1); + } else if (const auto* op = b.as()) { + AddToLhs(ret.CopyOnWrite(), GetRef(op), 1); + } else { + AddToLhs(ret.CopyOnWrite(), ToIterSumExpr(b), 1); + } + return std::move(ret); +} + +PrimExpr IterMapRewriter::VisitExpr_(const SubNode* op) { + if (!IsIndexType(op->dtype)) { + return Parent::VisitExpr_(op); + } + + PrimExpr a = this->DirectMutate(op->a); + PrimExpr b = this->DirectMutate(op->b); + + // const folding + PrimExpr const_res = TryConstFold(a, b); + if (const_res.defined()) return const_res; + + // does not contain iter map. + if (!a->IsInstance() && !b->IsInstance()) { + if (op->a.same_as(a) && op->b.same_as(b)) { + return GetRef(op); + } else { + return Sub(a, b); + } + } + + // canonical form simplification. + IterSumExpr ret = ToIterSumExpr(std::move(a)); + + if (!b->IsInstance()) { + ret.CopyOnWrite()->base -= b; + } else if (const auto* op = b.as()) { + AddToLhs(ret.CopyOnWrite(), GetRef(op), -1); + } else if (const auto* op = b.as()) { + AddToLhs(ret.CopyOnWrite(), GetRef(op), -1); + } else { + AddToLhs(ret.CopyOnWrite(), ToIterSumExpr(b), -1); + } + return std::move(ret); +} + +PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) { + if (!IsIndexType(op->dtype)) { + return Parent::VisitExpr_(op); + } + // normalize + PrimExpr a = this->DirectMutate(op->a); + PrimExpr b = this->DirectMutate(op->b); + + // const folding + PrimExpr const_res = TryConstFold(a, b); + if (const_res.defined()) return const_res; + + // does not contain iter map. 
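+  // (neither operand is an IterMapExpr, so rebuild a plain Mul and leave
+  // it to the ordinary simplifier)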
+ if (!a->IsInstance() && !b->IsInstance()) { + if (op->a.same_as(a) && op->b.same_as(b)) { + return GetRef(op); + } else { + return Mul(a, b); + } + } + + if (a->IsInstance() && b->IsInstance()) { + // cannot multiply two iterators, mark as unresolved. + ++unresolved_count_; + return Mul(a, b); + } + + if (!a->IsInstance()) { + std::swap(a, b); + } + + if (a->IsInstance()) { + IterSumExpr ret = Downcast(std::move(a)); + MulToLhs(ret.CopyOnWrite(), b); + return std::move(ret); + } else { + CHECK(a->IsInstance()); + IterSplitExpr ret = Downcast(std::move(a)); + ret.CopyOnWrite()->scale *= b; + return std::move(ret); + } +} + +PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs) { + if (is_one(rhs)) return std::move(lhs); + if (!is_one(lhs->scale)) { + if (CanProveDivisible(lhs->scale, rhs)) { + lhs.CopyOnWrite()->scale = floordiv(lhs->scale, rhs); + return std::move(lhs); + } else { + if (CanProveDivisible(rhs, lhs->scale)) { + rhs = floordiv(rhs, lhs->scale); + lhs.CopyOnWrite()->scale = make_const(rhs->dtype, 1); + } else { + // mark as unresolved. + ++unresolved_count_; + return floordiv(lhs, rhs); + } + } + } + + if (CanProveDivisible(lhs->extent, rhs)) { + auto* ptr_lhs = lhs.CopyOnWrite(); + ptr_lhs->lower_factor *= rhs; + ptr_lhs->extent = analyzer_->Simplify(floordiv(ptr_lhs->extent, rhs)); + return std::move(lhs); + } else { + // mark as unresolved. + ++unresolved_count_; + return floordiv(lhs, rhs); + } +} + +PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) { + if (!IsIndexType(op->dtype)) { + return Parent::VisitExpr_(op); + } + + PrimExpr a = this->DirectMutate(op->a); + PrimExpr b = this->DirectMutate(op->b); + + // const folding + PrimExpr const_res = TryConstFold(a, b); + if (const_res.defined()) return const_res; + + // does not contain iter map. + if (!a->IsInstance() && !b->IsInstance()) { + if (op->a.same_as(a) && op->b.same_as(b)) { + return GetRef(op); + } else { + return FloorDiv(a, b); + } + } + + if (b->IsInstance()) { + // cannot divide an iterator, mark as unresolved. + ++unresolved_count_; + return FloorDiv(a, b); + } + + if (a->IsInstance()) { + IterSumExpr ret = Downcast(std::move(a)); + if (auto opt = TryFuseIters(ret)) { + return SplitFloorDivConst(opt.value(), b); + } else { + ++unresolved_count_; + return FloorDiv(a, b); + } + } else { + CHECK(a->IsInstance()); + IterSplitExpr ret = Downcast(std::move(a)); + return SplitFloorDivConst(ret, b); + } +} + +PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs) { + if (is_one(rhs)) return make_zero(lhs->dtype); + if (!is_one(lhs->scale)) { + if (CanProveDivisible(lhs->scale, rhs)) { + return make_zero(lhs->dtype); + } else { + if (CanProveDivisible(rhs, lhs->scale)) { + rhs = floormod(rhs, lhs->scale); + } else { + // mark as unresolved. + ++unresolved_count_; + return floormod(lhs, rhs); + } + } + } + + if (CanProveDivisible(lhs->extent, rhs)) { + lhs.CopyOnWrite()->extent = rhs; + return std::move(lhs); + } else { + // mark as unresolved. + ++unresolved_count_; + return floormod(lhs, rhs); + } +} + +PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) { + if (!IsIndexType(op->dtype)) { + return Parent::VisitExpr_(op); + } + + PrimExpr a = this->DirectMutate(op->a); + PrimExpr b = this->DirectMutate(op->b); + + // const folding + PrimExpr const_res = TryConstFold(a, b); + if (const_res.defined()) return const_res; + + // does not contain iter map. 
+ if (!a->IsInstance() && !b->IsInstance()) { + if (op->a.same_as(a) && op->b.same_as(b)) { + return GetRef(op); + } else { + return FloorMod(a, b); + } + } + + if (b->IsInstance()) { + // cannot mod an iterator, mark as unresolved. + ++unresolved_count_; + return FloorMod(a, b); + } + + if (a->IsInstance()) { + IterSumExpr ret = Downcast(std::move(a)); + if (auto opt = TryFuseIters(ret)) { + return SplitFloorModConst(opt.value(), b); + } else { + ++unresolved_count_; + return FloorMod(a, b); + } + } else { + CHECK(a->IsInstance()); + IterSplitExpr ret = Downcast(std::move(a)); + return SplitFloorModConst(ret, b); + } +} + +} // namespace arith +} // namespace tvm diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index c237edc493a6..cb8ef01e7369 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -882,6 +882,9 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) { TVM_TRY_REWRITE_IF(floormod(x + y * c1, c2), floormod(x, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); + TVM_TRY_REWRITE(floormod(x * y, y), ZeroWithTypeLike(x)); + TVM_TRY_REWRITE(floormod(y * x, y), ZeroWithTypeLike(y)); + // try modular analysis if (floormod(x, c1).Match(ret)) { ModularSet mod = analyzer_->modular_set(x.Eval()); diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index d21cb1f2d9b3..1122b8e1ee40 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -28,6 +28,8 @@ #include #include +#include "../support/util.h" + namespace tvm { // Define the dispatch functio here since primary user is in this file. @@ -163,7 +165,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler { // combine in the reverse order of the stack. size_t reduced_hash = task.reduced_hash; for (size_t i = result_stack_.size(); i != stack_begin; --i) { - reduced_hash = HashCombine(reduced_hash, result_stack_[i - 1]); + reduced_hash = support::HashCombine(reduced_hash, result_stack_[i - 1]); } result_stack_.resize(stack_begin); return reduced_hash; @@ -186,8 +188,8 @@ class VarCountingSHashHandler : public SHashReducer::Handler { // Append the graph node counter to the hash // so that we can distinguish DAG from trees. if (entry.graph_node_hash) { - entry.reduced_hash = - HashCombine(entry.reduced_hash, std::hash()(graph_node_counter_++)); + entry.reduced_hash = support::HashCombine(entry.reduced_hash, + std::hash()(graph_node_counter_++)); } hash_memo_[entry.object] = entry.reduced_hash; } @@ -229,16 +231,6 @@ class VarCountingSHashHandler : public SHashReducer::Handler { vtable_->SHashReduce(object.get(), SHashReducer(this, map_free_vars)); } - /*! - * \brief Combine two hash values into a single one. - * \param key The left operand. - * \param value The right operand. - * \return the combined result. - */ - size_t HashCombine(size_t key, size_t value) { - return key ^ (value + 0x9e3779b9 + (key << 6) + (key >> 2)); - } - private: // free var counter. size_t free_var_counter_{0}; diff --git a/src/support/util.h b/src/support/util.h index 859b372bd761..5020df2e2ea7 100644 --- a/src/support/util.h +++ b/src/support/util.h @@ -152,6 +152,16 @@ inline int Execute(std::string cmd, std::string* err_msg) { return 255; } +/*! + * \brief Combine two hash values into a single one. + * \param key The left operand. + * \param value The right operand. + * \return the combined result. 
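+ *
+ * \note This is the boost::hash_combine recipe; the constant 0x9e3779b9
+ *       is the 32-bit golden-ratio constant used to spread the bits.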
+ */ +inline size_t HashCombine(size_t key, size_t value) { + return key ^ (value + 0x9e3779b9 + (key << 6) + (key >> 2)); +} + } // namespace support } // namespace tvm #endif // TVM_SUPPORT_UTIL_H_ diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py new file mode 100644 index 000000000000..9fb098831a71 --- /dev/null +++ b/tests/python/unittest/test_arith_iter_affine_map.py @@ -0,0 +1,176 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +import tvm.testing +from tvm import te + + +def ifuse(inputs): + """Fuse iterators""" + value, extent = 0, 1 + for i, ext in inputs: + value = value * ext + i + extent = extent * ext + return (value, extent) + + +def isplit(axis, factor): + """Split iterators""" + fld = tvm.tir.floordiv + flm = tvm.tir.floormod + return [ + (fld(axis[0], factor), fld(axis[1] + (factor - 1), factor)), + (flm(axis[0], factor), factor), + ] + + +def var_dom(iters): + """Get domains of iterators""" + return {var: tvm.ir.Range(0, ext) for var, ext in iters} + + +def assert_iter_sum_pattern(sum_expr, extent, base, scale=1): + """Check the sum expr have the right pattern.""" + assert isinstance(sum_expr, tvm.arith.IterSumExpr) + if extent == 1: + assert len(sum_expr.args) == 0 + else: + assert len(sum_expr.args) == 1 + tvm.testing.assert_prim_expr_equal(sum_expr.args[0].extent, extent) + tvm.testing.assert_prim_expr_equal(sum_expr.args[0].scale, scale) + tvm.testing.assert_prim_expr_equal(sum_expr.base, base) + + +def test_trivial(): + x = tvm.tir.Var("x", "int32"), 3 + y = tvm.tir.Var("y", "int32"), 4 + + res = tvm.arith.detect_iter_map([x[0], y[0], 3], var_dom([x, y])) + + assert len(res) == 3 + assert_iter_sum_pattern(res[0], 3, 0) + assert_iter_sum_pattern(res[1], 4, 0) + assert_iter_sum_pattern(res[2], 1, 3) + + res = tvm.arith.detect_iter_map([x[0], 3], var_dom([x, y])) + assert len(res) == 0 + + # not independent + res = tvm.arith.detect_iter_map([x[0], x[0], 3], var_dom([x, y])) + assert len(res) == 0 + + +def test_fuse(): + x = tvm.tir.Var("x", "int32") + y = tvm.tir.Var("y", "int32") + c = tvm.tir.SizeVar("c", "int32") + + res = tvm.arith.detect_iter_map([y * 3 + 1 + c + x], var_dom([(x, 3), (y, 4)])) + assert len(res) == 1 + assert_iter_sum_pattern(res[0], 12, 1 + c) + + res = tvm.arith.detect_iter_map([ifuse([(x, 3), (y, 4)])[0]], var_dom([(x, 3), (y, 4)])) + assert len(res) == 1 + assert_iter_sum_pattern(res[0], 12, 0) + + # fuse with symbolic factor + res = tvm.arith.detect_iter_map([(y + 1) * c + x], var_dom([(x, c), (y, 4)])) + assert len(res) == 1 + assert_iter_sum_pattern(res[0], 4 * c, c) + + # duplication + res = tvm.arith.detect_iter_map([y * 3 + x, y], var_dom([(x, 3), (y, 4)])) + assert len(res) == 0 + + # duplication 2 + res = 
tvm.arith.detect_iter_map([y, x + 1, y], var_dom([(x, 3), (y, 4)]))
+    assert len(res) == 0
+
+    # factor mismatch
+    res = tvm.arith.detect_iter_map([y * 4 + x], var_dom([(x, 3), (y, 4)]))
+    assert len(res) == 0
+
+
+def test_split():
+    x = tvm.tir.Var("x", "int32")
+    y = tvm.tir.Var("y", "int32")
+    z = tvm.tir.Var("z", "int32")
+    c0 = tvm.tir.SizeVar("c0", "int32")
+    c1 = tvm.tir.SizeVar("c1", "int32")
+    c2 = tvm.tir.SizeVar("c2", "int32")
+    fld = tvm.tir.floordiv
+    flm = tvm.tir.floormod
+
+    res = tvm.arith.detect_iter_map([fld(x, 3), flm(x, 3) * 2 + c1], var_dom([(x, 24)]))
+
+    assert len(res) == 2
+    assert_iter_sum_pattern(res[0], 8, 0)
+    assert_iter_sum_pattern(res[1], 3, c1, 2)
+
+    res = tvm.arith.detect_iter_map([fld(x, 6), fld(flm(x, 6), 2), flm(x, 2)], var_dom([(x, 24)]))
+
+    assert len(res) == 3
+    assert_iter_sum_pattern(res[0], 4, 0)
+    assert_iter_sum_pattern(res[1], 3, 0)
+    assert_iter_sum_pattern(res[2], 2, 0)
+
+    # simple symbolic bound
+    # TODO(tvm-team) improve symbolic divisible check to enable
+    # more complicated symbolic bound
+    res = tvm.arith.detect_iter_map([fld(x, c0), flm(x, c0)], var_dom([(x, c1 * c0)]))
+
+    assert len(res) == 2
+    assert_iter_sum_pattern(res[0], c1, 0)
+    assert_iter_sum_pattern(res[1], c0, 0)
+
+
+def test_compound():
+    x = tvm.tir.Var("x", "int32"), 10
+    y = tvm.tir.Var("y", "int32"), 9
+
+    xo, xi = isplit(x, 5)
+    yo, yi = isplit(y, 3)
+    z = ifuse([yo, xo, yi])
+
+    res = tvm.arith.detect_iter_map([z[0], xi[0]], var_dom([x, y]))
+
+    assert len(res) == 2
+    assert_iter_sum_pattern(res[0], 18, 0)
+    assert_iter_sum_pattern(res[1], 5, 0)
+    # reconstruct the pattern manually
+    mx = tvm.arith.IterMark(x[0], 10)
+    my = tvm.arith.IterMark(y[0], 9)
+
+    xoscale = 3
+    xiscale = 1
+    yoscale = 6
+    yiscale = 1
+    mxo = tvm.arith.IterSplitExpr(mx, 5, 2, xoscale)
+    mxi = tvm.arith.IterSplitExpr(mx, 1, 5, xiscale)
+    myo = tvm.arith.IterSplitExpr(my, 3, 3, yoscale)
+    myi = tvm.arith.IterSplitExpr(my, 1, 3, yiscale)
+
+    mz = tvm.arith.IterMark(tvm.arith.IterSumExpr([myo, mxo, myi], 0), 18)
+    sz = tvm.arith.IterSumExpr([tvm.arith.IterSplitExpr(mz, 1, 18, 1)], 0)
+    tvm.ir.assert_structural_equal(sz, res[0])
+
+
+if __name__ == "__main__":
+    test_split()
+    test_trivial()
+    test_fuse()
+    test_compound()

From 28a2be47142db87047a9957138001c538abbec82 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Wed, 14 Oct 2020 19:04:23 -0400
Subject: [PATCH 005/258] [REFACTOR] util => utils for consistency in the project. (#6684)

* [REFACTOR] util => utils for consistency in the project.
* Update CMake --- CMakeLists.txt | 14 +++++----- .../app/src/main/jni/tvm_runtime.h | 2 +- .../app/src/main/jni/tvm_runtime.h | 2 +- .../app/src/main/jni/tvm_runtime.h | 2 +- apps/bundle_deploy/runtime.cc | 2 +- apps/cpp_rpc/main.cc | 2 +- apps/cpp_rpc/rpc_env.cc | 4 +-- apps/howto_deploy/tvm_runtime_pack.cc | 2 +- apps/ios_rpc/tvmrpc/TVMRuntime.mm | 2 +- cmake/{util => utils}/FindCUDA.cmake | 0 cmake/{util => utils}/FindEthosN.cmake | 0 cmake/{util => utils}/FindLLVM.cmake | 0 cmake/{util => utils}/FindOpenCL.cmake | 0 cmake/{util => utils}/FindROCM.cmake | 0 cmake/{util => utils}/FindVulkan.cmake | 0 cmake/{util/Util.cmake => utils/Utils.cmake} | 0 docs/dev/relay_add_op.rst | 2 +- golang/src/tvm_runtime_pack.cc | 2 +- include/tvm/topi/{util.h => utils.h} | 8 +++--- python/tvm/te/hybrid/__init__.py | 4 +-- python/tvm/te/hybrid/calls.py | 2 +- python/tvm/te/hybrid/module.py | 4 +-- python/tvm/te/hybrid/parser.py | 26 +++++++++---------- python/tvm/te/hybrid/preprocessor.py | 2 +- python/tvm/te/hybrid/{util.py => utils.py} | 0 src/arith/int_constraints.cc | 2 +- src/arith/iter_affine_map.cc | 2 +- src/node/structural_hash.cc | 2 +- src/relay/analysis/feature.cc | 2 +- src/relay/analysis/mac_count.cc | 2 +- src/relay/analysis/util.cc | 2 +- src/relay/backend/compile_engine.cc | 2 +- src/relay/backend/vm/compiler.cc | 2 +- src/relay/backend/vm/compiler.h | 2 +- src/relay/op/annotation/annotation.cc | 2 +- src/relay/op/device_copy.cc | 2 +- src/relay/op/dyn/tensor/transform.cc | 2 +- src/relay/op/memory/memory.cc | 2 +- src/relay/op/nn/bitserial.cc | 2 +- src/relay/op/nn/convolution.cc | 2 +- src/relay/op/nn/nn.cc | 2 +- src/relay/op/nn/pooling.cc | 2 +- src/relay/op/nn/sparse.cc | 2 +- src/relay/op/op_common.h | 2 +- src/relay/op/tensor/transform.cc | 6 ++--- src/relay/op/vision/rcnn_op.cc | 2 +- src/relay/op/vm/vm.cc | 2 +- src/relay/qnn/op/concatenate.cc | 6 ++--- src/relay/qnn/op/convolution.cc | 4 +-- src/relay/qnn/op/dense.cc | 4 +-- src/relay/qnn/op/dequantize.cc | 4 +-- src/relay/qnn/op/mul.cc | 4 +-- src/relay/qnn/op/op_common.h | 4 +-- src/relay/qnn/op/quantize.cc | 4 +-- src/relay/qnn/op/requantize.cc | 6 ++--- src/relay/qnn/{util.cc => utils.cc} | 6 ++--- src/relay/qnn/{util.h => utils.h} | 8 +++--- src/relay/quantize/partition.cc | 2 +- src/relay/quantize/quantize.h | 2 +- src/relay/quantize/realize.cc | 4 +-- src/relay/transforms/alter_op_layout.cc | 2 +- src/relay/transforms/annotate_target.cc | 2 +- src/relay/transforms/canonicalize_cast.cc | 4 +-- src/relay/transforms/canonicalize_ops.cc | 2 +- .../combine_parallel_batch_matmul.cc | 2 +- .../transforms/combine_parallel_conv2d.cc | 2 +- .../transforms/combine_parallel_dense.cc | 2 +- src/relay/transforms/combine_parallel_op.cc | 2 +- src/relay/transforms/combine_parallel_op.h | 2 +- .../transforms/combine_parallel_op_batch.cc | 2 +- .../transforms/combine_parallel_op_batch.h | 2 +- src/relay/transforms/convert_layout.cc | 2 +- src/relay/transforms/defunctionalization.cc | 2 +- src/relay/transforms/dynamic_to_static.cc | 2 +- .../transforms/eliminate_common_subexpr.cc | 2 +- src/relay/transforms/fast_math.cc | 2 +- src/relay/transforms/fold_constant.cc | 2 +- src/relay/transforms/fold_scale_axis.cc | 4 +-- src/relay/transforms/forward_rewrite.cc | 2 +- src/relay/transforms/fuse_ops.cc | 4 +-- src/relay/transforms/gradient.cc | 4 +-- ...fer_layout_util.h => infer_layout_utils.h} | 10 +++---- .../transforms/merge_compiler_regions.cc | 2 +- src/relay/transforms/partial_eval.cc | 2 +- src/relay/transforms/partition_graph.cc | 2 +- 
.../transforms/{pass_util.h => pass_utils.h} | 8 +++--- .../{pattern_util.h => pattern_utils.h} | 8 +++--- src/relay/transforms/simplify_inference.cc | 2 +- src/relay/transforms/to_a_normal_form.cc | 2 +- .../transforms/to_basic_block_normal_form.cc | 2 +- src/relay/transforms/to_cps.cc | 2 +- src/relay/transforms/transform_layout.h | 4 +-- src/relay/transforms/type_infer.cc | 2 +- src/runtime/contrib/ethosn/ethosn_runtime.cc | 2 +- src/runtime/cuda/cuda_module.cc | 2 +- src/runtime/{file_util.cc => file_utils.cc} | 4 +-- src/runtime/{file_util.h => file_utils.h} | 10 +++---- src/runtime/hexagon/hexagon_module.cc | 2 +- src/runtime/metal/metal_module.mm | 2 +- src/runtime/module.cc | 2 +- src/runtime/opencl/opencl_common.h | 2 +- src/runtime/rocm/rocm_module.cc | 2 +- src/runtime/rpc/rpc_server_env.cc | 2 +- src/runtime/stackvm/stackvm_module.cc | 2 +- src/runtime/vm/executable.cc | 2 +- .../{serialize_util.h => serialize_utils.h} | 8 +++--- src/runtime/vulkan/vulkan.cc | 2 +- src/support/socket.h | 2 +- src/support/{util.h => utils.h} | 8 +++--- src/target/llvm/codegen_llvm.h | 2 +- src/target/llvm/llvm_module.cc | 2 +- src/target/source/codegen_aocl.cc | 2 +- src/target/source/source_module.cc | 2 +- src/te/autodiff/ad_simplify.cc | 2 +- src/te/autodiff/{ad_util.cc => ad_utils.cc} | 4 +-- src/te/autodiff/{ad_util.h => ad_utils.h} | 8 +++--- src/te/autodiff/adjoint.cc | 2 +- src/te/autodiff/jacobian.cc | 2 +- src/te/operation/compute_op.cc | 2 +- src/te/operation/cross_thread_reduction.cc | 2 +- src/te/operation/extern_op.cc | 2 +- src/te/operation/hybrid_op.cc | 2 +- src/te/operation/hybrid_op.h | 2 +- src/te/operation/{op_util.cc => op_utils.cc} | 4 +-- src/te/operation/{op_util.h => op_utils.h} | 10 +++---- src/te/operation/scan_op.cc | 2 +- src/te/operation/tensor_compute_op.cc | 2 +- src/te/operation/tensorize.cc | 2 +- src/te/schedule/operation_inline.cc | 2 +- src/te/schedule/schedule_dataflow_rewrite.cc | 2 +- src/te/schedule/schedule_ops.cc | 4 +-- src/tir/transforms/arg_binder.cc | 2 +- src/tir/transforms/coproc_sync.cc | 2 +- src/tir/transforms/hoist_if_then_else.cc | 2 +- src/tir/transforms/inject_double_buffer.cc | 2 +- src/tir/transforms/inject_virtual_thread.cc | 2 +- .../transforms/{ir_util.cc => ir_utils.cc} | 4 +-- src/tir/transforms/{ir_util.h => ir_utils.h} | 8 +++--- src/tir/transforms/lift_attr_scope.cc | 2 +- src/tir/transforms/loop_partition.cc | 2 +- .../lower_device_storage_access_info.cc | 2 +- src/tir/transforms/lower_thread_allreduce.cc | 2 +- src/tir/transforms/lower_tvm_builtin.cc | 2 +- src/tir/transforms/make_packed_api.cc | 2 +- src/tir/transforms/storage_access.cc | 2 +- src/tir/transforms/storage_flatten.cc | 2 +- src/tir/transforms/storage_rewrite.cc | 2 +- .../transforms/tensorcore_infer_fragment.cc | 2 +- src/tir/transforms/thread_storage_sync.cc | 2 +- src/tir/transforms/unroll_loop.cc | 2 +- src/topi/broadcast.cc | 2 +- src/topi/reduction.cc | 2 +- src/topi/transform.cc | 2 +- web/emcc/wasm_runtime.cc | 2 +- 154 files changed, 228 insertions(+), 228 deletions(-) rename cmake/{util => utils}/FindCUDA.cmake (100%) rename cmake/{util => utils}/FindEthosN.cmake (100%) rename cmake/{util => utils}/FindLLVM.cmake (100%) rename cmake/{util => utils}/FindOpenCL.cmake (100%) rename cmake/{util => utils}/FindROCM.cmake (100%) rename cmake/{util => utils}/FindVulkan.cmake (100%) rename cmake/{util/Util.cmake => utils/Utils.cmake} (100%) rename include/tvm/topi/{util.h => utils.h} (92%) rename python/tvm/te/hybrid/{util.py => utils.py} (100%) rename 
src/relay/qnn/{util.cc => utils.cc} (98%) rename src/relay/qnn/{util.h => utils.h} (98%) rename src/relay/transforms/{infer_layout_util.h => infer_layout_utils.h} (97%) rename src/relay/transforms/{pass_util.h => pass_utils.h} (98%) rename src/relay/transforms/{pattern_util.h => pattern_utils.h} (99%) rename src/runtime/{file_util.cc => file_utils.cc} (99%) rename src/runtime/{file_util.h => file_utils.h} (93%) rename src/runtime/vm/{serialize_util.h => serialize_utils.h} (96%) rename src/support/{util.h => utils.h} (97%) rename src/te/autodiff/{ad_util.cc => ad_utils.cc} (99%) rename src/te/autodiff/{ad_util.h => ad_utils.h} (97%) rename src/te/operation/{op_util.cc => op_utils.cc} (99%) rename src/te/operation/{op_util.h => op_utils.h} (95%) rename src/tir/transforms/{ir_util.cc => ir_utils.cc} (99%) rename src/tir/transforms/{ir_util.h => ir_utils.h} (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index e24bbeb5acd8..7aacfcd4d9ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,13 +2,13 @@ cmake_minimum_required(VERSION 3.2) project(tvm C CXX) # Utility functions -include(cmake/util/Util.cmake) -include(cmake/util/FindCUDA.cmake) -include(cmake/util/FindOpenCL.cmake) -include(cmake/util/FindVulkan.cmake) -include(cmake/util/FindLLVM.cmake) -include(cmake/util/FindROCM.cmake) -include(cmake/util/FindEthosN.cmake) +include(cmake/utils/Utils.cmake) +include(cmake/utils/FindCUDA.cmake) +include(cmake/utils/FindOpenCL.cmake) +include(cmake/utils/FindVulkan.cmake) +include(cmake/utils/FindLLVM.cmake) +include(cmake/utils/FindROCM.cmake) +include(cmake/utils/FindEthosN.cmake) if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake) include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake) diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h index bc10bdaa508c..5f3db04274a1 100644 --- a/apps/android_camera/app/src/main/jni/tvm_runtime.h +++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h @@ -40,7 +40,7 @@ #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" -#include "../src/runtime/file_util.cc" +#include "../src/runtime/file_utils.cc" #include "../src/runtime/graph/graph_runtime.cc" #include "../src/runtime/library_module.cc" #include "../src/runtime/module.cc" diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h index f1a47a674281..362d278c38c4 100644 --- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h +++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h @@ -28,7 +28,7 @@ #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" -#include "../src/runtime/file_util.cc" +#include "../src/runtime/file_utils.cc" #include "../src/runtime/graph/graph_runtime.cc" #include "../src/runtime/library_module.cc" #include "../src/runtime/module.cc" diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index aea61e757aa7..2005568c608c 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -40,7 +40,7 @@ #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" -#include "../src/runtime/file_util.cc" +#include "../src/runtime/file_utils.cc" #include "../src/runtime/graph/graph_runtime.cc" #include 
"../src/runtime/graph/graph_runtime_factory.cc" #include "../src/runtime/library_module.cc" diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc index 8e294a05775d..3224028b60a1 100644 --- a/apps/bundle_deploy/runtime.cc +++ b/apps/bundle_deploy/runtime.cc @@ -24,7 +24,7 @@ #include "../../src/runtime/c_runtime_api.cc" #include "../../src/runtime/cpu_device_api.cc" -#include "../../src/runtime/file_util.cc" +#include "../../src/runtime/file_utils.cc" #include "../../src/runtime/graph/graph_runtime.cc" #include "../../src/runtime/library_module.cc" #include "../../src/runtime/module.cc" diff --git a/apps/cpp_rpc/main.cc b/apps/cpp_rpc/main.cc index 777fffa7d37c..9bbbea92f41c 100644 --- a/apps/cpp_rpc/main.cc +++ b/apps/cpp_rpc/main.cc @@ -35,7 +35,7 @@ #include #include "../../src/support/socket.h" -#include "../../src/support/util.h" +#include "../../src/support/utils.h" #include "rpc_server.h" #if defined(_WIN32) diff --git a/apps/cpp_rpc/rpc_env.cc b/apps/cpp_rpc/rpc_env.cc index c64cb2f09f94..b76a1abbdadd 100644 --- a/apps/cpp_rpc/rpc_env.cc +++ b/apps/cpp_rpc/rpc_env.cc @@ -40,8 +40,8 @@ int mkdir(const char* path, int /* ignored */) { return _mkdir(path); } #include #include -#include "../../src/runtime/file_util.h" -#include "../../src/support/util.h" +#include "../../src/runtime/file_utils.h" +#include "../../src/support/utils.h" #include "rpc_env.h" namespace { diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc index b43f920b6056..d6dd5876a994 100644 --- a/apps/howto_deploy/tvm_runtime_pack.cc +++ b/apps/howto_deploy/tvm_runtime_pack.cc @@ -39,7 +39,7 @@ */ #include "../../src/runtime/c_runtime_api.cc" #include "../../src/runtime/cpu_device_api.cc" -#include "../../src/runtime/file_util.cc" +#include "../../src/runtime/file_utils.cc" #include "../../src/runtime/library_module.cc" #include "../../src/runtime/module.cc" #include "../../src/runtime/ndarray.cc" diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm index 9e2899bf6e5e..61a4668cdd91 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm @@ -25,7 +25,7 @@ #include "../../../src/runtime/c_runtime_api.cc" #include "../../../src/runtime/cpu_device_api.cc" #include "../../../src/runtime/dso_library.cc" -#include "../../../src/runtime/file_util.cc" +#include "../../../src/runtime/file_utils.cc" #include "../../../src/runtime/library_module.cc" #include "../../../src/runtime/metadata_module.cc" #include "../../../src/runtime/module.cc" diff --git a/cmake/util/FindCUDA.cmake b/cmake/utils/FindCUDA.cmake similarity index 100% rename from cmake/util/FindCUDA.cmake rename to cmake/utils/FindCUDA.cmake diff --git a/cmake/util/FindEthosN.cmake b/cmake/utils/FindEthosN.cmake similarity index 100% rename from cmake/util/FindEthosN.cmake rename to cmake/utils/FindEthosN.cmake diff --git a/cmake/util/FindLLVM.cmake b/cmake/utils/FindLLVM.cmake similarity index 100% rename from cmake/util/FindLLVM.cmake rename to cmake/utils/FindLLVM.cmake diff --git a/cmake/util/FindOpenCL.cmake b/cmake/utils/FindOpenCL.cmake similarity index 100% rename from cmake/util/FindOpenCL.cmake rename to cmake/utils/FindOpenCL.cmake diff --git a/cmake/util/FindROCM.cmake b/cmake/utils/FindROCM.cmake similarity index 100% rename from cmake/util/FindROCM.cmake rename to cmake/utils/FindROCM.cmake diff --git a/cmake/util/FindVulkan.cmake b/cmake/utils/FindVulkan.cmake similarity index 100% rename from cmake/util/FindVulkan.cmake rename to 
cmake/utils/FindVulkan.cmake diff --git a/cmake/util/Util.cmake b/cmake/utils/Utils.cmake similarity index 100% rename from cmake/util/Util.cmake rename to cmake/utils/Utils.cmake diff --git a/docs/dev/relay_add_op.rst b/docs/dev/relay_add_op.rst index 7dca251dd532..0697939be162 100644 --- a/docs/dev/relay_add_op.rst +++ b/docs/dev/relay_add_op.rst @@ -231,7 +231,7 @@ Adding a Gradient in C++ Adding a gradient in C++ is similar to adding one in Python, but the interface for registering is slightly different. -First, make sure ``src/relay/pass/pattern_util.h`` is included. It provides +First, make sure ``src/relay/pass/pattern_utils.h`` is included. It provides helper functions for creating nodes in the Relay AST. Then, define the gradient in a similar fashion as in the Python example: diff --git a/golang/src/tvm_runtime_pack.cc b/golang/src/tvm_runtime_pack.cc index 644249fa75c9..7dd6dd5e94c5 100644 --- a/golang/src/tvm_runtime_pack.cc +++ b/golang/src/tvm_runtime_pack.cc @@ -23,7 +23,7 @@ */ #include "src/runtime/c_runtime_api.cc" #include "src/runtime/cpu_device_api.cc" -#include "src/runtime/file_util.cc" +#include "src/runtime/file_utils.cc" #include "src/runtime/library_module.cc" #include "src/runtime/module.cc" #include "src/runtime/ndarray.cc" diff --git a/include/tvm/topi/util.h b/include/tvm/topi/utils.h similarity index 92% rename from include/tvm/topi/util.h rename to include/tvm/topi/utils.h index 4e0cdc6f2057..60dc3a6a01dd 100644 --- a/include/tvm/topi/util.h +++ b/include/tvm/topi/utils.h @@ -19,10 +19,10 @@ /*! * \brief Topi utility function - * \file topi/util.h + * \file topi/utils.h */ -#ifndef TVM_TOPI_UTIL_H_ -#define TVM_TOPI_UTIL_H_ +#ifndef TVM_TOPI_UTILS_H_ +#define TVM_TOPI_UTILS_H_ #include #include @@ -44,4 +44,4 @@ inline Array ArrayOrInt(TVMArgValue arg) { } } // namespace topi } // namespace tvm -#endif // TVM_TOPI_UTIL_H_ +#endif // TVM_TOPI_UTILS_H_ diff --git a/python/tvm/te/hybrid/__init__.py b/python/tvm/te/hybrid/__init__.py index 3cd1b01797ce..3a10bda29a7d 100644 --- a/python/tvm/te/hybrid/__init__.py +++ b/python/tvm/te/hybrid/__init__.py @@ -35,7 +35,7 @@ from .module import HybridModule from .parser import source_to_op -from .util import _pruned_source +from .utils import _pruned_source def script(pyfunc): @@ -51,7 +51,7 @@ def script(pyfunc): """ # pylint: disable=import-outside-toplevel, missing-docstring def wrapped_func(func, *args, **kwargs): - from .util import _is_tvm_arg_types + from .utils import _is_tvm_arg_types if _is_tvm_arg_types(args): src = _pruned_source(func) diff --git a/python/tvm/te/hybrid/calls.py b/python/tvm/te/hybrid/calls.py index 2e7fc2b72311..761189115050 100644 --- a/python/tvm/te/hybrid/calls.py +++ b/python/tvm/te/hybrid/calls.py @@ -25,7 +25,7 @@ from tvm.tir import call_intrin from tvm.tir.stmt import For -from .util import _internal_assert +from .utils import _internal_assert # pylint: disable=redefined-builtin,invalid-name diff --git a/python/tvm/te/hybrid/module.py b/python/tvm/te/hybrid/module.py index 672089cef384..2af67853ca5b 100644 --- a/python/tvm/te/hybrid/module.py +++ b/python/tvm/te/hybrid/module.py @@ -24,8 +24,8 @@ import ast from tvm.contrib import util -from .util import _internal_assert -from .util import _is_tvm_arg_types +from .utils import _internal_assert +from .utils import _is_tvm_arg_types from .parser import source_to_op diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py index 8704518723f0..d47b2ee879fc 100644 --- a/python/tvm/te/hybrid/parser.py +++ 
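For context on the helpers that the relay_add_op.rst hunk above points at: pattern_utils.h (renamed in this patch from pattern_util.h) supplies the node-building functions used when registering a C++ gradient. A minimal sketch in the spirit of the docs' multiply example, assuming the ``Multiply`` and ``CollapseSumLike`` helpers it declares; illustrative only, not part of this diff:

    // Illustrative: gradient of multiply(x, y), built from helpers declared
    // in src/relay/transforms/pattern_utils.h (renamed in this patch).
    #include "src/relay/transforms/pattern_utils.h"

    namespace tvm {
    namespace relay {

    Array<Expr> MultiplyGrad(const Expr& orig_call, const Expr& output_grad) {
      const Call& call = Downcast<Call>(orig_call);
      // d(x*y)/dx = y and d(x*y)/dy = x; CollapseSumLike folds broadcast axes
      // so each gradient matches the shape of its input.
      return {CollapseSumLike(Multiply(output_grad, call->args[1]), call->args[0]),
              CollapseSumLike(Multiply(output_grad, call->args[0]), call->args[1])};
    }

    }  // namespace relay
    }  // namespace tvm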
b/python/tvm/te/hybrid/parser.py @@ -37,16 +37,16 @@ from tvm.tir import all as _all from tvm.tir import any as _any -from .util import _internal_assert +from .utils import _internal_assert from . import calls -from . import util +from . import utils from .preprocessor import determine_variable_usage def concat_list_to_block(lst): """Concatenate a list of Python IR nodes to HalideIR Block""" if not lst: - return util.make_nop() + return utils.make_nop() n = len(lst) if n == 1: return lst[0] @@ -55,10 +55,10 @@ def concat_list_to_block(lst): def visit_list_to_block(visit, lst): """Visit and concatenate a list of Python IR nodes to HalideIR Block""" - lst = [visit(stmt) for stmt in lst if not util.is_docstring(stmt)] - lst = [stmt for stmt in lst if not tvm.ir.structural_equal(stmt, util.make_nop())] + lst = [visit(stmt) for stmt in lst if not utils.is_docstring(stmt)] + lst = [stmt for stmt in lst if not tvm.ir.structural_equal(stmt, utils.make_nop())] if not lst: - return util.make_nop() + return utils.make_nop() return concat_list_to_block(lst) @@ -314,7 +314,7 @@ def visit_Assign(self, node): ) self.add_symbol(node.targets[i].id, Symbol.GlobalBuffer, rhs.output(i)) rmap[rhs.outputs[i].op] = rhs.output(i) - return util.replace_io(rhs.body, rmap) + return utils.replace_io(rhs.body, rmap) _internal_assert(len(node.targets) == 1, "So far only one-valued assignment is supported!") lhs = node.targets[0] @@ -339,8 +339,8 @@ def visit_Assign(self, node): self.add_symbol(lhs, getattr(Symbol, scope.title() + "Buffer"), ph) if scope == "output": self.outputs.append(lhs) - return util.make_nop() - if isinstance(rhs, util.halide_imm_types) and ast.Store not in rw: + return utils.make_nop() + if isinstance(rhs, utils.halide_imm_types) and ast.Store not in rw: self.add_symbol(lhs, Symbol.ConstVar, rhs) else: _internal_assert( @@ -355,7 +355,7 @@ def visit_Assign(self, node): if lhs is not None: buf, args = lhs return tvm.tir.ProducerStore(buf, rhs, args) - return util.make_nop() + return utils.make_nop() lhs, args = self.visit(lhs) _internal_assert( @@ -412,7 +412,7 @@ def visit_If(self, node): return visit_list_to_block(self.visit, node.body) if node.orelse: return visit_list_to_block(self.visit, node.orelse) - return util.make_nop() + return utils.make_nop() if_body = visit_list_to_block(self.visit, node.body) @@ -559,7 +559,7 @@ def visit_Return(self, node): logging.log(logging.CRITICAL, "[Warning] Not all the output buffers returned!") self.outputs = [self.symbols[i][1] for i in ids] self.returned = True - return util.make_nop() + return utils.make_nop() def visit_Tuple(self, node): return tuple(self.visit(i) for i in node.elts) @@ -570,7 +570,7 @@ def visit_Str(self, node): def visit_Assert(self, node): test = self.visit(node.test) mesg = tvm.runtime.convert(self.visit(node.msg)) - return tvm.tir.AssertStmt(test, mesg, util.make_nop()) + return tvm.tir.AssertStmt(test, mesg, utils.make_nop()) def parse_python(src, args, symbols, closure_vars): diff --git a/python/tvm/te/hybrid/preprocessor.py b/python/tvm/te/hybrid/preprocessor.py index b046231fbf48..295476f80812 100644 --- a/python/tvm/te/hybrid/preprocessor.py +++ b/python/tvm/te/hybrid/preprocessor.py @@ -19,7 +19,7 @@ import ast import sys from .runtime import HYBRID_GLOBALS -from .util import _internal_assert +from .utils import _internal_assert class PyVariableUsage(ast.NodeVisitor): diff --git a/python/tvm/te/hybrid/util.py b/python/tvm/te/hybrid/utils.py similarity index 100% rename from python/tvm/te/hybrid/util.py rename to 
python/tvm/te/hybrid/utils.py diff --git a/src/arith/int_constraints.cc b/src/arith/int_constraints.cc index 189869bd64e7..56c95d0ab713 100644 --- a/src/arith/int_constraints.cc +++ b/src/arith/int_constraints.cc @@ -33,7 +33,7 @@ #include #include -#include "../tir/transforms/ir_util.h" +#include "../tir/transforms/ir_utils.h" namespace tvm { namespace arith { diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index 7afa75a7efb0..e56ef2a75ee1 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -27,7 +27,7 @@ #include #include -#include "../support/util.h" +#include "../support/utils.h" #include "const_fold.h" namespace tvm { diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index 1122b8e1ee40..cb576fa9c067 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -28,7 +28,7 @@ #include #include -#include "../support/util.h" +#include "../support/utils.h" namespace tvm { diff --git a/src/relay/analysis/feature.cc b/src/relay/analysis/feature.cc index 63f5e711bfcd..b3516e965b85 100644 --- a/src/relay/analysis/feature.cc +++ b/src/relay/analysis/feature.cc @@ -27,7 +27,7 @@ #include #include -#include "../transforms/pass_util.h" +#include "../transforms/pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/analysis/mac_count.cc b/src/relay/analysis/mac_count.cc index d2e62b705d99..5e35ab7ba62d 100644 --- a/src/relay/analysis/mac_count.cc +++ b/src/relay/analysis/mac_count.cc @@ -32,7 +32,7 @@ #include #include -#include "../transforms/pattern_util.h" +#include "../transforms/pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc index b98106a091b3..59ce01ce4227 100644 --- a/src/relay/analysis/util.cc +++ b/src/relay/analysis/util.cc @@ -31,7 +31,7 @@ #include #include -#include "../transforms/pass_util.h" +#include "../transforms/pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index b679fea50099..d720e94ddc75 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -45,7 +45,7 @@ #include #include -#include "../transforms/pass_util.h" +#include "../transforms/pass_utils.h" #include "utils.h" namespace tvm { diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index fb9ca08f0592..c7ceca3604c8 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -46,7 +46,7 @@ #include "../../../target/source/codegen_source_base.h" #include "../../backend/compile_engine.h" #include "../../op/op_common.h" -#include "../../transforms/pass_util.h" +#include "../../transforms/pass_utils.h" #include "../utils.h" #include "compiler.h" diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index 19924ab38358..56965c544701 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -44,7 +44,7 @@ #include "../../../runtime/vm/naive_allocator.h" #include "../../../runtime/vm/profiler/vm.h" #include "../../backend/compile_engine.h" -#include "../../transforms/pass_util.h" +#include "../../transforms/pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index d3eb4f96ed09..a2c54656d464 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -30,7 +30,7 @@ #include #include -#include 
"../../transforms/infer_layout_util.h" +#include "../../transforms/infer_layout_utils.h" #include "../type_relations.h" namespace tvm { diff --git a/src/relay/op/device_copy.cc b/src/relay/op/device_copy.cc index 3a58607e6dd8..b26dc879be0a 100644 --- a/src/relay/op/device_copy.cc +++ b/src/relay/op/device_copy.cc @@ -33,7 +33,7 @@ #include #include -#include "../transforms/infer_layout_util.h" +#include "../transforms/infer_layout_utils.h" #include "type_relations.h" namespace tvm { diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc index 4b594ffccfa5..863ad643f0da 100644 --- a/src/relay/op/dyn/tensor/transform.cc +++ b/src/relay/op/dyn/tensor/transform.cc @@ -36,7 +36,7 @@ #include #include -#include "../../../transforms/infer_layout_util.h" +#include "../../../transforms/infer_layout_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index 771024502b21..b853ef635b12 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -29,7 +29,7 @@ #include #include -#include "../../transforms/infer_layout_util.h" +#include "../../transforms/infer_layout_utils.h" #include "../op_common.h" #include "../type_relations.h" diff --git a/src/relay/op/nn/bitserial.cc b/src/relay/op/nn/bitserial.cc index 022ca5cc96d8..61a1b8fdf289 100644 --- a/src/relay/op/nn/bitserial.cc +++ b/src/relay/op/nn/bitserial.cc @@ -26,7 +26,7 @@ #include #include -#include "../../transforms/infer_layout_util.h" +#include "../../transforms/infer_layout_utils.h" #include "../op_common.h" namespace tvm { diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index 2b9103b9709a..cf3f0fa85d68 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -29,7 +29,7 @@ #include -#include "../../transforms/infer_layout_util.h" +#include "../../transforms/infer_layout_utils.h" #include "../op_common.h" #include "convolution_make.h" diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 1de7ca003772..cbd7ae47acd7 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -37,7 +37,7 @@ #include #include -#include "../../transforms/infer_layout_util.h" +#include "../../transforms/infer_layout_utils.h" #include "../make_op.h" #include "../op_common.h" #include "../type_relations.h" diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 1e5306035a6c..cee7b6456ce6 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -31,7 +31,7 @@ #include -#include "../../transforms/infer_layout_util.h" +#include "../../transforms/infer_layout_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/op/nn/sparse.cc b/src/relay/op/nn/sparse.cc index f12afe2a7f1f..3f51e1f8ab37 100644 --- a/src/relay/op/nn/sparse.cc +++ b/src/relay/op/nn/sparse.cc @@ -28,7 +28,7 @@ #include -#include "../../transforms/infer_layout_util.h" +#include "../../transforms/infer_layout_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/op/op_common.h b/src/relay/op/op_common.h index cbb8cec2d43b..d530345fc9e8 100644 --- a/src/relay/op/op_common.h +++ b/src/relay/op/op_common.h @@ -33,7 +33,7 @@ #include #include -#include "../transforms/infer_layout_util.h" +#include "../transforms/infer_layout_utils.h" #include "type_relations.h" namespace tvm { diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 817023424900..20cd0a12ed98 100644 --- a/src/relay/op/tensor/transform.cc +++ 
b/src/relay/op/tensor/transform.cc @@ -40,9 +40,9 @@ #include -#include "../../transforms/infer_layout_util.h" -#include "../../transforms/pass_util.h" -#include "../../transforms/pattern_util.h" +#include "../../transforms/infer_layout_utils.h" +#include "../../transforms/pass_utils.h" +#include "../../transforms/pattern_utils.h" #include "../make_op.h" #include "../op_common.h" diff --git a/src/relay/op/vision/rcnn_op.cc b/src/relay/op/vision/rcnn_op.cc index f14b29604f06..6550815c6422 100644 --- a/src/relay/op/vision/rcnn_op.cc +++ b/src/relay/op/vision/rcnn_op.cc @@ -25,7 +25,7 @@ #include #include -#include "../../transforms/infer_layout_util.h" +#include "../../transforms/infer_layout_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/op/vm/vm.cc b/src/relay/op/vm/vm.cc index 59756ea6ffab..424ed5f4bc98 100644 --- a/src/relay/op/vm/vm.cc +++ b/src/relay/op/vm/vm.cc @@ -30,7 +30,7 @@ #include #include -#include "../../transforms/infer_layout_util.h" +#include "../../transforms/infer_layout_utils.h" #include "../op_common.h" #include "../type_relations.h" diff --git a/src/relay/qnn/op/concatenate.cc b/src/relay/qnn/op/concatenate.cc index bda8cf878793..29ecf451767e 100644 --- a/src/relay/qnn/op/concatenate.cc +++ b/src/relay/qnn/op/concatenate.cc @@ -28,9 +28,9 @@ #include #include "../../op/tensor/transform.h" -#include "../../transforms/infer_layout_util.h" -#include "../../transforms/pattern_util.h" -#include "../util.h" +#include "../../transforms/infer_layout_utils.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc index f112a7259552..b2b6b092fd62 100644 --- a/src/relay/qnn/op/convolution.cc +++ b/src/relay/qnn/op/convolution.cc @@ -31,8 +31,8 @@ #include #include -#include "../../transforms/pattern_util.h" -#include "../util.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index 464b3f9aeff3..3cfc418868ea 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -28,8 +28,8 @@ #include #include "../../op/nn/nn.h" -#include "../../transforms/pattern_util.h" -#include "../util.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc index 3a5f81e2627a..f0c139c2a5e3 100644 --- a/src/relay/qnn/op/dequantize.cc +++ b/src/relay/qnn/op/dequantize.cc @@ -27,8 +27,8 @@ #include #include -#include "../../transforms/pattern_util.h" -#include "../util.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/op/mul.cc b/src/relay/qnn/op/mul.cc index ec74b799407b..781114cc5f5a 100644 --- a/src/relay/qnn/op/mul.cc +++ b/src/relay/qnn/op/mul.cc @@ -25,8 +25,8 @@ #include #include -#include "../../transforms/pattern_util.h" -#include "../util.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" #include "op_common.h" namespace tvm { diff --git a/src/relay/qnn/op/op_common.h b/src/relay/qnn/op/op_common.h index 50fc0cda30cf..e99c11b6f02b 100644 --- a/src/relay/qnn/op/op_common.h +++ b/src/relay/qnn/op/op_common.h @@ -32,8 +32,8 @@ #include #include "../../op/type_relations.h" -#include "../../transforms/infer_layout_util.h" -#include "../util.h" +#include "../../transforms/infer_layout_utils.h" 
+#include "../utils.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc index fb7ef9720c87..1b5cb5e2b55b 100644 --- a/src/relay/qnn/op/quantize.cc +++ b/src/relay/qnn/op/quantize.cc @@ -27,8 +27,8 @@ #include #include -#include "../../transforms/pattern_util.h" -#include "../util.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 817a734dc637..ea878557d98e 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -26,9 +26,9 @@ #include #include -#include "../../transforms/infer_layout_util.h" -#include "../../transforms/pattern_util.h" -#include "../util.h" +#include "../../transforms/infer_layout_utils.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/util.cc b/src/relay/qnn/utils.cc similarity index 98% rename from src/relay/qnn/util.cc rename to src/relay/qnn/utils.cc index 113038e327d7..fc59b61cc6a5 100644 --- a/src/relay/qnn/util.cc +++ b/src/relay/qnn/utils.cc @@ -18,13 +18,13 @@ */ /*! - * \file src/relay/qnn/util.cc + * \file src/relay/qnn/utils.cc * \brief Utility functions for QNN. */ -#include "util.h" +#include "utils.h" -#include "../transforms/pattern_util.h" +#include "../transforms/pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/qnn/util.h b/src/relay/qnn/utils.h similarity index 98% rename from src/relay/qnn/util.h rename to src/relay/qnn/utils.h index 72eb2a46b2ae..f8885c36d162 100644 --- a/src/relay/qnn/util.h +++ b/src/relay/qnn/utils.h @@ -18,12 +18,12 @@ */ /*! - * \file src/relay/qnn/util.h + * \file src/relay/qnn/utils.h * \brief Utility methods needs for quantized ops that can be shared */ -#ifndef TVM_RELAY_QNN_UTIL_H_ -#define TVM_RELAY_QNN_UTIL_H_ +#ifndef TVM_RELAY_QNN_UTILS_H_ +#define TVM_RELAY_QNN_UTILS_H_ #include #include @@ -216,4 +216,4 @@ static inline std::vector GetFloatVectorFromConstant(const Expr& expr) { } // namespace qnn } // namespace relay } // namespace tvm -#endif // TVM_RELAY_QNN_UTIL_H_ +#endif // TVM_RELAY_QNN_UTILS_H_ diff --git a/src/relay/quantize/partition.cc b/src/relay/quantize/partition.cc index 14b420d6034c..c65cc1879932 100644 --- a/src/relay/quantize/partition.cc +++ b/src/relay/quantize/partition.cc @@ -26,7 +26,7 @@ #include -#include "../transforms/pattern_util.h" +#include "../transforms/pattern_utils.h" #include "./quantize.h" namespace tvm { diff --git a/src/relay/quantize/quantize.h b/src/relay/quantize/quantize.h index d5396dea00d1..7c2acbcb06d4 100644 --- a/src/relay/quantize/quantize.h +++ b/src/relay/quantize/quantize.h @@ -29,7 +29,7 @@ #include -#include "../transforms/pattern_util.h" +#include "../transforms/pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index ace2c2473173..4b598907e76e 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -29,8 +29,8 @@ #include #include -#include "../qnn/util.h" -#include "../transforms/pattern_util.h" +#include "../qnn/utils.h" +#include "../transforms/pattern_utils.h" #include "./quantize.h" namespace tvm { diff --git a/src/relay/transforms/alter_op_layout.cc b/src/relay/transforms/alter_op_layout.cc index 3d242cd09f7d..7c5ee019a437 100644 --- a/src/relay/transforms/alter_op_layout.cc +++ b/src/relay/transforms/alter_op_layout.cc @@ -36,7 +36,7 @@ 
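As a note on the qnn utils.h hunk above: it carries over ``GetFloatVectorFromConstant``, which reads the values out of a constant expression. A minimal sketch of a call site, assuming the element type is ``float`` and that the usual TVM logging headers are available; the wrapper below is illustrative, not part of this diff:

    // Illustrative: read per-channel quantization scales from a Constant via
    // the helper declared in src/relay/qnn/utils.h (renamed in this patch).
    void PrintScales(const tvm::relay::Expr& scale_const) {
      for (float s : tvm::relay::qnn::GetFloatVectorFromConstant(scale_const)) {
        LOG(INFO) << "scale: " << s;
      }
    }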
#include #include -#include "pattern_util.h" +#include "pattern_utils.h" #include "transform_layout.h" namespace tvm { diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc index 015489dd0857..b9d6cce762e5 100644 --- a/src/relay/transforms/annotate_target.cc +++ b/src/relay/transforms/annotate_target.cc @@ -29,7 +29,7 @@ #include #include -#include "pass_util.h" +#include "pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/canonicalize_cast.cc b/src/relay/transforms/canonicalize_cast.cc index 055ab1480a6e..510d098990e3 100644 --- a/src/relay/transforms/canonicalize_cast.cc +++ b/src/relay/transforms/canonicalize_cast.cc @@ -26,8 +26,8 @@ #include #include -#include "pass_util.h" -#include "pattern_util.h" +#include "pass_utils.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/canonicalize_ops.cc b/src/relay/transforms/canonicalize_ops.cc index fec757ee68d5..dfb30cae4693 100644 --- a/src/relay/transforms/canonicalize_ops.cc +++ b/src/relay/transforms/canonicalize_ops.cc @@ -28,7 +28,7 @@ #include #include -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/combine_parallel_batch_matmul.cc b/src/relay/transforms/combine_parallel_batch_matmul.cc index b2b9703c28bc..5b56504602a9 100644 --- a/src/relay/transforms/combine_parallel_batch_matmul.cc +++ b/src/relay/transforms/combine_parallel_batch_matmul.cc @@ -42,7 +42,7 @@ #include "./combine_parallel_op.h" #include "./expr_subst.h" -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index 54aec99f46fb..20fa3e404f6a 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -44,7 +44,7 @@ #include "./combine_parallel_op.h" #include "./expr_subst.h" -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index 76b26d0e085b..74a6921c9409 100644 --- a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -43,7 +43,7 @@ #include "./combine_parallel_op_batch.h" #include "./expr_subst.h" -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/combine_parallel_op.cc b/src/relay/transforms/combine_parallel_op.cc index 7ca2ce8b5dba..b23d01ff469b 100644 --- a/src/relay/transforms/combine_parallel_op.cc +++ b/src/relay/transforms/combine_parallel_op.cc @@ -40,7 +40,7 @@ #include #include "expr_subst.h" -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/combine_parallel_op.h b/src/relay/transforms/combine_parallel_op.h index 6f53e86d534b..9785a366299b 100644 --- a/src/relay/transforms/combine_parallel_op.h +++ b/src/relay/transforms/combine_parallel_op.h @@ -38,7 +38,7 @@ #include #include "./expr_subst.h" -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/combine_parallel_op_batch.cc b/src/relay/transforms/combine_parallel_op_batch.cc index 2e9ffdb9bb3c..a41e1e0d6674 100644 --- a/src/relay/transforms/combine_parallel_op_batch.cc +++ 
b/src/relay/transforms/combine_parallel_op_batch.cc @@ -58,7 +58,7 @@ #include "./combine_parallel_op.h" #include "./expr_subst.h" -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/combine_parallel_op_batch.h b/src/relay/transforms/combine_parallel_op_batch.h index 9f87d9d2184f..7a518e9ac370 100644 --- a/src/relay/transforms/combine_parallel_op_batch.h +++ b/src/relay/transforms/combine_parallel_op_batch.h @@ -37,7 +37,7 @@ #include "./combine_parallel_op.h" #include "./expr_subst.h" -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/convert_layout.cc b/src/relay/transforms/convert_layout.cc index 65fdeda5f6cd..577fb068aab9 100644 --- a/src/relay/transforms/convert_layout.cc +++ b/src/relay/transforms/convert_layout.cc @@ -36,7 +36,7 @@ #include #include -#include "pattern_util.h" +#include "pattern_utils.h" #include "transform_layout.h" namespace tvm { diff --git a/src/relay/transforms/defunctionalization.cc b/src/relay/transforms/defunctionalization.cc index ec614d23a02e..135d7fcee548 100644 --- a/src/relay/transforms/defunctionalization.cc +++ b/src/relay/transforms/defunctionalization.cc @@ -68,7 +68,7 @@ #include #include "../analysis/type_solver.h" -#include "../transforms/pass_util.h" +#include "../transforms/pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/dynamic_to_static.cc b/src/relay/transforms/dynamic_to_static.cc index edcb83972cc7..5caaea8c9ead 100644 --- a/src/relay/transforms/dynamic_to_static.cc +++ b/src/relay/transforms/dynamic_to_static.cc @@ -27,7 +27,7 @@ #include #include -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/eliminate_common_subexpr.cc b/src/relay/transforms/eliminate_common_subexpr.cc index 92cc64dedba6..720a97e9d19d 100644 --- a/src/relay/transforms/eliminate_common_subexpr.cc +++ b/src/relay/transforms/eliminate_common_subexpr.cc @@ -32,7 +32,7 @@ #include -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/fast_math.cc b/src/relay/transforms/fast_math.cc index 3c8d8db637c8..91fb4cfa8973 100644 --- a/src/relay/transforms/fast_math.cc +++ b/src/relay/transforms/fast_math.cc @@ -27,7 +27,7 @@ #include #include -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index a3f2f69f7a58..660aff2eed9a 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -31,7 +31,7 @@ #include #include -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/fold_scale_axis.cc b/src/relay/transforms/fold_scale_axis.cc index 0c2abbfdd238..e4c924dad1e8 100644 --- a/src/relay/transforms/fold_scale_axis.cc +++ b/src/relay/transforms/fold_scale_axis.cc @@ -30,8 +30,8 @@ #include #include "../op/tensor/transform.h" -#include "pass_util.h" -#include "pattern_util.h" +#include "pass_utils.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/forward_rewrite.cc b/src/relay/transforms/forward_rewrite.cc index f093f5425d94..58396256105b 100644 --- a/src/relay/transforms/forward_rewrite.cc +++ b/src/relay/transforms/forward_rewrite.cc @@ -27,7 +27,7 @@ #include #include -#include "pass_util.h" 
+#include "pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index 10fa05435288..bc6335a539af 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -31,8 +31,8 @@ #include #include "../../support/arena.h" -#include "pass_util.h" -#include "pattern_util.h" +#include "pass_utils.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/gradient.cc index bf8105080317..1722c90069cb 100644 --- a/src/relay/transforms/gradient.cc +++ b/src/relay/transforms/gradient.cc @@ -29,8 +29,8 @@ #include #include "let_list.h" -#include "pass_util.h" -#include "pattern_util.h" +#include "pass_utils.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/infer_layout_util.h b/src/relay/transforms/infer_layout_utils.h similarity index 97% rename from src/relay/transforms/infer_layout_util.h rename to src/relay/transforms/infer_layout_utils.h index 5cc180e8e2e3..3965b0a6a78b 100644 --- a/src/relay/transforms/infer_layout_util.h +++ b/src/relay/transforms/infer_layout_utils.h @@ -18,14 +18,14 @@ */ /*! - * \file infer_layout_util.h + * \file infer_layout_utils.h * \brief Utility functions to alter the layouts of operators or replace primitive operators with other expressions. This pass can be used for computing convolution in custom layouts or other general weight pre-transformation. */ -#ifndef TVM_RELAY_TRANSFORMS_INFER_LAYOUT_UTIL_H_ -#define TVM_RELAY_TRANSFORMS_INFER_LAYOUT_UTIL_H_ +#ifndef TVM_RELAY_TRANSFORMS_INFER_LAYOUT_UTILS_H_ +#define TVM_RELAY_TRANSFORMS_INFER_LAYOUT_UTILS_H_ #include #include @@ -34,7 +34,7 @@ #include #include -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { @@ -235,4 +235,4 @@ static inline std::tuple, Array, bool> InferCorrectLayouts } // namespace relay } // namespace tvm -#endif // TVM_RELAY_TRANSFORMS_INFER_LAYOUT_UTIL_H_ +#endif // TVM_RELAY_TRANSFORMS_INFER_LAYOUT_UTILS_H_ diff --git a/src/relay/transforms/merge_compiler_regions.cc b/src/relay/transforms/merge_compiler_regions.cc index 5e615e4316bd..17fd44707b02 100644 --- a/src/relay/transforms/merge_compiler_regions.cc +++ b/src/relay/transforms/merge_compiler_regions.cc @@ -43,7 +43,7 @@ #include #include "../analysis/annotated_region_set.h" -#include "pass_util.h" +#include "pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/partial_eval.cc b/src/relay/transforms/partial_eval.cc index afe2bd5d9302..276d093d6993 100644 --- a/src/relay/transforms/partial_eval.cc +++ b/src/relay/transforms/partial_eval.cc @@ -98,7 +98,7 @@ #include #include "let_list.h" -#include "pass_util.h" +#include "pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index e4560c093115..08d26d76ee2d 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -44,7 +44,7 @@ #include "../analysis/annotated_region_set.h" #include "../backend/utils.h" -#include "pass_util.h" +#include "pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/pass_util.h b/src/relay/transforms/pass_utils.h similarity index 98% rename from src/relay/transforms/pass_util.h rename to src/relay/transforms/pass_utils.h index f3c99ccfa120..a2f22cbbf106 100644 --- a/src/relay/transforms/pass_util.h +++ 
b/src/relay/transforms/pass_utils.h @@ -19,11 +19,11 @@ /*! * - * \file tvm/relay/_transforms/pass_util.h + * \file tvm/relay/_transforms/pass_utils.h * \brief Utilities for writing passes */ -#ifndef TVM_RELAY_TRANSFORMS_PASS_UTIL_H_ -#define TVM_RELAY_TRANSFORMS_PASS_UTIL_H_ +#ifndef TVM_RELAY_TRANSFORMS_PASS_UTILS_H_ +#define TVM_RELAY_TRANSFORMS_PASS_UTILS_H_ #include #include @@ -276,4 +276,4 @@ class Fill : ExprFunctor { } // namespace relay } // namespace tvm -#endif // TVM_RELAY_TRANSFORMS_PASS_UTIL_H_ +#endif // TVM_RELAY_TRANSFORMS_PASS_UTILS_H_ diff --git a/src/relay/transforms/pattern_util.h b/src/relay/transforms/pattern_utils.h similarity index 99% rename from src/relay/transforms/pattern_util.h rename to src/relay/transforms/pattern_utils.h index 3c653af01e2e..82ffd8a17c1b 100644 --- a/src/relay/transforms/pattern_util.h +++ b/src/relay/transforms/pattern_utils.h @@ -19,12 +19,12 @@ /*! * - * \file tvm/relay/_pattern_util.h + * \file tvm/relay/transforms/pattern_utils.h * \brief Header of internal operator functions * These can be used for writing passes. */ -#ifndef TVM_RELAY_TRANSFORMS_PATTERN_UTIL_H_ -#define TVM_RELAY_TRANSFORMS_PATTERN_UTIL_H_ +#ifndef TVM_RELAY_TRANSFORMS_PATTERN_UTILS_H_ +#define TVM_RELAY_TRANSFORMS_PATTERN_UTILS_H_ #include #include @@ -676,4 +676,4 @@ Expr CastHint(Expr data, DataType dtype); } // namespace relay } // namespace tvm -#endif // TVM_RELAY_TRANSFORMS_PATTERN_UTIL_H_ +#endif // TVM_RELAY_TRANSFORMS_PATTERN_UTILS_H_ diff --git a/src/relay/transforms/simplify_inference.cc b/src/relay/transforms/simplify_inference.cc index 8728e90f55a3..7df71967d834 100644 --- a/src/relay/transforms/simplify_inference.cc +++ b/src/relay/transforms/simplify_inference.cc @@ -26,7 +26,7 @@ #include #include -#include "pattern_util.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/to_a_normal_form.cc b/src/relay/transforms/to_a_normal_form.cc index adb757b9de0c..e5d7b133c0c0 100644 --- a/src/relay/transforms/to_a_normal_form.cc +++ b/src/relay/transforms/to_a_normal_form.cc @@ -31,7 +31,7 @@ #include "../../support/arena.h" #include "../analysis/dependency_graph.h" #include "let_list.h" -#include "pass_util.h" +#include "pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index 5fc01e151760..fcec4e80ce5b 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -31,7 +31,7 @@ #include "../../support/arena.h" #include "../analysis/dependency_graph.h" #include "let_list.h" -#include "pass_util.h" +#include "pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/to_cps.cc b/src/relay/transforms/to_cps.cc index 7c11ce5d4cd9..5ece50133172 100644 --- a/src/relay/transforms/to_cps.cc +++ b/src/relay/transforms/to_cps.cc @@ -57,7 +57,7 @@ #include #include "let_list.h" -#include "pass_util.h" +#include "pass_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/transform_layout.h b/src/relay/transforms/transform_layout.h index 61a74404afd1..c250d3801b68 100644 --- a/src/relay/transforms/transform_layout.h +++ b/src/relay/transforms/transform_layout.h @@ -34,8 +34,8 @@ #include #include -#include "infer_layout_util.h" -#include "pattern_util.h" +#include "infer_layout_utils.h" +#include "pattern_utils.h" namespace tvm { namespace relay { diff --git a/src/relay/transforms/type_infer.cc 
b/src/relay/transforms/type_infer.cc index d34a662778a4..105aed3614cd 100644 --- a/src/relay/transforms/type_infer.cc +++ b/src/relay/transforms/type_infer.cc @@ -46,7 +46,7 @@ #include #include "../analysis/type_solver.h" -#include "pass_util.h" +#include "pass_utils.h" namespace tvm { namespace relay { diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.cc b/src/runtime/contrib/ethosn/ethosn_runtime.cc index f5164e55c46c..7fc44368af6d 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.cc +++ b/src/runtime/contrib/ethosn/ethosn_runtime.cc @@ -36,7 +36,7 @@ #include #include -#include "../../file_util.h" +#include "../../file_utils.h" #include "ethosn_device.h" #include "ethosn_driver_library/Inference.hpp" #include "ethosn_driver_library/Network.hpp" diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index bf844c1ad798..c356897c8e90 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -32,7 +32,7 @@ #include #include -#include "../file_util.h" +#include "../file_utils.h" #include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" diff --git a/src/runtime/file_util.cc b/src/runtime/file_utils.cc similarity index 99% rename from src/runtime/file_util.cc rename to src/runtime/file_utils.cc index 68d174e470a2..c3298d266cdd 100644 --- a/src/runtime/file_util.cc +++ b/src/runtime/file_utils.cc @@ -18,9 +18,9 @@ */ /*! - * \file file_util.cc + * \file file_utils.cc */ -#include "file_util.h" +#include "file_utils.h" #include #include diff --git a/src/runtime/file_util.h b/src/runtime/file_utils.h similarity index 93% rename from src/runtime/file_util.h rename to src/runtime/file_utils.h index 1c350357ec9a..696a9760c2e1 100644 --- a/src/runtime/file_util.h +++ b/src/runtime/file_utils.h @@ -18,11 +18,11 @@ */ /*! - * \file file_util.h - * \brief Minimum file manipulation util for runtime. + * \file file_utils.h + * \brief Minimum file manipulation utils for runtime. 
*/ -#ifndef TVM_RUNTIME_FILE_UTIL_H_ -#define TVM_RUNTIME_FILE_UTIL_H_ +#ifndef TVM_RUNTIME_FILE_UTILS_H_ +#define TVM_RUNTIME_FILE_UTILS_H_ #include #include @@ -94,4 +94,4 @@ void LoadMetaDataFromFile(const std::string& file_name, void RemoveFile(const std::string& file_name); } // namespace runtime } // namespace tvm -#endif // TVM_RUNTIME_FILE_UTIL_H_ +#endif // TVM_RUNTIME_FILE_UTILS_H_ diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc index 66e2a5698fb2..305fd50cbdd5 100644 --- a/src/runtime/hexagon/hexagon_module.cc +++ b/src/runtime/hexagon/hexagon_module.cc @@ -31,7 +31,7 @@ #include #include -#include "../file_util.h" +#include "../file_utils.h" #include "../meta_data.h" namespace tvm { diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index 8d10ff210d8d..2d3a901c8524 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -27,7 +27,7 @@ #include #include #include -#include "../file_util.h" +#include "../file_utils.h" #include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 98b0b3a83466..8c3fb49ea7e0 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -28,7 +28,7 @@ #include #include -#include "file_util.h" +#include "file_utils.h" namespace tvm { namespace runtime { diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index aab0c27cb39b..290f8c256508 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -51,7 +51,7 @@ #include #include -#include "../file_util.h" +#include "../file_utils.h" #include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index 79958d20aa1f..8a83599c644b 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -31,7 +31,7 @@ #include #include -#include "../file_util.h" +#include "../file_utils.h" #include "../meta_data.h" #include "../pack_args.h" #include "../thread_storage_scope.h" diff --git a/src/runtime/rpc/rpc_server_env.cc b/src/runtime/rpc/rpc_server_env.cc index b999a48a376a..cb25150449a1 100644 --- a/src/runtime/rpc/rpc_server_env.cc +++ b/src/runtime/rpc/rpc_server_env.cc @@ -23,7 +23,7 @@ */ #include -#include "../file_util.h" +#include "../file_utils.h" namespace tvm { namespace runtime { diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc index 6c9af1cbeb42..88c19362a1f8 100644 --- a/src/runtime/stackvm/stackvm_module.cc +++ b/src/runtime/stackvm/stackvm_module.cc @@ -30,7 +30,7 @@ #include #include -#include "../file_util.h" +#include "../file_utils.h" namespace tvm { namespace runtime { diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index cc1dc8dd19e5..08e9af61fdc3 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -37,7 +37,7 @@ #include #include -#include "serialize_util.h" +#include "serialize_utils.h" namespace tvm { namespace runtime { diff --git a/src/runtime/vm/serialize_util.h b/src/runtime/vm/serialize_utils.h similarity index 96% rename from src/runtime/vm/serialize_util.h rename to src/runtime/vm/serialize_utils.h index d17256d6a079..726a46ee2fa1 100644 --- a/src/runtime/vm/serialize_util.h +++ b/src/runtime/vm/serialize_utils.h @@ -18,11 +18,11 @@ */ /*! 
- * \file src/runtime/vm/serialize_util.h + * \file src/runtime/vm/serialize_utils.h * \brief Definitions of helpers for serializing and deserializing a Relay VM. */ -#ifndef TVM_RUNTIME_VM_SERIALIZE_UTIL_H_ -#define TVM_RUNTIME_VM_SERIALIZE_UTIL_H_ +#ifndef TVM_RUNTIME_VM_SERIALIZE_UTILS_H_ +#define TVM_RUNTIME_VM_SERIALIZE_UTILS_H_ #include #include @@ -164,4 +164,4 @@ struct VMInstructionSerializer { } // namespace runtime } // namespace tvm -#endif // TVM_RUNTIME_VM_SERIALIZE_UTIL_H_ +#endif // TVM_RUNTIME_VM_SERIALIZE_UTILS_H_ diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 5b630337acbb..3cbe245ed095 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -26,7 +26,7 @@ #include #include -#include "../file_util.h" +#include "../file_utils.h" #include "../pack_args.h" #include "../thread_storage_scope.h" #include "../workspace_pool.h" diff --git a/src/support/socket.h b/src/support/socket.h index d70f956a51fb..f38918feb8cb 100644 --- a/src/support/socket.h +++ b/src/support/socket.h @@ -52,7 +52,7 @@ using ssize_t = int; #include #include -#include "../support/util.h" +#include "../support/utils.h" #if defined(_WIN32) static inline int poll(struct pollfd* pfd, int nfds, int timeout) { diff --git a/src/support/util.h b/src/support/utils.h similarity index 97% rename from src/support/util.h rename to src/support/utils.h index 5020df2e2ea7..ce1f2bed43f9 100644 --- a/src/support/util.h +++ b/src/support/utils.h @@ -18,11 +18,11 @@ */ /*! - * \file util.h + * \file utils.h * \brief Defines some common utility function.. */ -#ifndef TVM_SUPPORT_UTIL_H_ -#define TVM_SUPPORT_UTIL_H_ +#ifndef TVM_SUPPORT_UTILS_H_ +#define TVM_SUPPORT_UTILS_H_ #include #ifndef _WIN32 @@ -164,4 +164,4 @@ inline size_t HashCombine(size_t key, size_t value) { } // namespace support } // namespace tvm -#endif // TVM_SUPPORT_UTIL_H_ +#endif // TVM_SUPPORT_UTILS_H_ diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index 351d7d90b4fe..3b0ce10534fd 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -45,7 +45,7 @@ #include #include "../../runtime/thread_storage_scope.h" -#include "../../tir/transforms/ir_util.h" +#include "../../tir/transforms/ir_utils.h" #include "llvm_common.h" namespace tvm { diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index 712980cdbe41..a37710d5622b 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -30,7 +30,7 @@ #include -#include "../../runtime/file_util.h" +#include "../../runtime/file_utils.h" #include "../../runtime/library_module.h" #include "codegen_blob.h" #include "codegen_llvm.h" diff --git a/src/target/source/codegen_aocl.cc b/src/target/source/codegen_aocl.cc index e90b7d4f8b2c..00533d27c5a6 100644 --- a/src/target/source/codegen_aocl.cc +++ b/src/target/source/codegen_aocl.cc @@ -25,7 +25,7 @@ #include #include -#include "../../runtime/file_util.h" +#include "../../runtime/file_utils.h" #include "../../runtime/opencl/aocl/aocl_module.h" #include "../build_common.h" #include "codegen_opencl.h" diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 68a34c8304f8..e1ee1539d986 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -25,7 +25,7 @@ #include #include -#include "../../runtime/file_util.h" +#include "../../runtime/file_utils.h" #include "../../runtime/meta_data.h" #include "codegen_source_base.h" diff --git 
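On the support utils.h hunk above, which shows ``inline size_t HashCombine(size_t key, size_t value)``: a minimal sketch of how such a helper is typically used to hash a composite key. The struct and names below are illustrative, not from the codebase:

    // Illustrative: fold two field hashes into one key using
    // tvm::support::HashCombine from src/support/utils.h (renamed in this patch).
    #include <cstddef>
    #include <functional>
    #include "src/support/utils.h"

    struct DeviceKey {
      int device_type;
      int device_id;
    };

    inline std::size_t DeviceKeyHash(const DeviceKey& k) {
      std::size_t h = std::hash<int>()(k.device_type);
      return tvm::support::HashCombine(h, std::hash<int>()(k.device_id));
    }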
a/src/te/autodiff/ad_simplify.cc b/src/te/autodiff/ad_simplify.cc index eefd243281c3..81df5c9d6b42 100644 --- a/src/te/autodiff/ad_simplify.cc +++ b/src/te/autodiff/ad_simplify.cc @@ -56,7 +56,7 @@ #include #include -#include "ad_util.h" +#include "ad_utils.h" namespace tvm { namespace te { diff --git a/src/te/autodiff/ad_util.cc b/src/te/autodiff/ad_utils.cc similarity index 99% rename from src/te/autodiff/ad_util.cc rename to src/te/autodiff/ad_utils.cc index 024015a601aa..268abab9cacb 100644 --- a/src/te/autodiff/ad_util.cc +++ b/src/te/autodiff/ad_utils.cc @@ -18,10 +18,10 @@ */ /*! - * \file ad_util.cc + * \file ad_utils.cc * \brief Utility for tensor-level auto-differentiation. */ -#include "ad_util.h" +#include "ad_utils.h" #include #include diff --git a/src/te/autodiff/ad_util.h b/src/te/autodiff/ad_utils.h similarity index 97% rename from src/te/autodiff/ad_util.h rename to src/te/autodiff/ad_utils.h index 21de61cc46c2..56070ef27267 100644 --- a/src/te/autodiff/ad_util.h +++ b/src/te/autodiff/ad_utils.h @@ -18,11 +18,11 @@ */ /*! - * \file ad_util.h + * \file ad_utils.h * \brief Helper utilities to implement auto-differentiation. */ -#ifndef TVM_TE_AUTODIFF_AD_UTIL_H_ -#define TVM_TE_AUTODIFF_AD_UTIL_H_ +#ifndef TVM_TE_AUTODIFF_AD_UTILS_H_ +#define TVM_TE_AUTODIFF_AD_UTILS_H_ #include #include @@ -132,4 +132,4 @@ TVM_DLL Tensor RemoveJacobianAndLiftNonzeroCond(const Tensor& tensor, } // namespace te } // namespace tvm -#endif // TVM_TE_AUTODIFF_AD_UTIL_H_ +#endif // TVM_TE_AUTODIFF_AD_UTILS_H_ diff --git a/src/te/autodiff/adjoint.cc b/src/te/autodiff/adjoint.cc index 9f3adfb01a2c..34d38aa75882 100644 --- a/src/te/autodiff/adjoint.cc +++ b/src/te/autodiff/adjoint.cc @@ -39,7 +39,7 @@ #include #include -#include "ad_util.h" +#include "ad_utils.h" namespace tvm { namespace te { diff --git a/src/te/autodiff/jacobian.cc b/src/te/autodiff/jacobian.cc index 3724af51c63a..ba03ba08febd 100644 --- a/src/te/autodiff/jacobian.cc +++ b/src/te/autodiff/jacobian.cc @@ -30,7 +30,7 @@ #include -#include "ad_util.h" +#include "ad_utils.h" namespace tvm { namespace te { diff --git a/src/te/operation/compute_op.cc b/src/te/operation/compute_op.cc index 527b251867ad..64995761524b 100644 --- a/src/te/operation/compute_op.cc +++ b/src/te/operation/compute_op.cc @@ -37,7 +37,7 @@ #include "../../arith/interval_set.h" #include "../schedule/message_passing.h" -#include "op_util.h" +#include "op_utils.h" namespace tvm { namespace te { diff --git a/src/te/operation/cross_thread_reduction.cc b/src/te/operation/cross_thread_reduction.cc index 6369ecbabad8..6aba9ab500b6 100644 --- a/src/te/operation/cross_thread_reduction.cc +++ b/src/te/operation/cross_thread_reduction.cc @@ -24,7 +24,7 @@ #include #include "compute_op.h" -#include "op_util.h" +#include "op_utils.h" namespace tvm { namespace te { diff --git a/src/te/operation/extern_op.cc b/src/te/operation/extern_op.cc index e61fe51470fe..2afdd4a93c7e 100644 --- a/src/te/operation/extern_op.cc +++ b/src/te/operation/extern_op.cc @@ -28,7 +28,7 @@ #include -#include "op_util.h" +#include "op_utils.h" namespace tvm { namespace te { diff --git a/src/te/operation/hybrid_op.cc b/src/te/operation/hybrid_op.cc index 01162cb14e18..98270e9a2952 100644 --- a/src/te/operation/hybrid_op.cc +++ b/src/te/operation/hybrid_op.cc @@ -35,7 +35,7 @@ #include #include -#include "op_util.h" +#include "op_utils.h" namespace tvm { namespace te { diff --git a/src/te/operation/hybrid_op.h b/src/te/operation/hybrid_op.h index a11ae89e23f7..705456850ce6 100644 --- 
a/src/te/operation/hybrid_op.h +++ b/src/te/operation/hybrid_op.h @@ -32,7 +32,7 @@ #include #include "../../tir/transforms/arg_binder.h" -#include "../../tir/transforms/ir_util.h" +#include "../../tir/transforms/ir_utils.h" #include "../schedule/message_passing.h" namespace tvm { diff --git a/src/te/operation/op_util.cc b/src/te/operation/op_utils.cc similarity index 99% rename from src/te/operation/op_util.cc rename to src/te/operation/op_utils.cc index 2abf68a71d54..80f7fe2b4e41 100644 --- a/src/te/operation/op_util.cc +++ b/src/te/operation/op_utils.cc @@ -19,9 +19,9 @@ /*! * \brief Utility to make loop nest. - * \file op_util.cc + * \file op_utils.cc */ -#include "op_util.h" +#include "op_utils.h" #include #include diff --git a/src/te/operation/op_util.h b/src/te/operation/op_utils.h similarity index 95% rename from src/te/operation/op_util.h rename to src/te/operation/op_utils.h index 6c864fca67d5..16f7d96cfa77 100644 --- a/src/te/operation/op_util.h +++ b/src/te/operation/op_utils.h @@ -18,11 +18,11 @@ */ /*! - * \file op_util.h + * \file op_utils.h * \brief Common utility used in operator construction. */ -#ifndef TVM_TE_OPERATION_OP_UTIL_H_ -#define TVM_TE_OPERATION_OP_UTIL_H_ +#ifndef TVM_TE_OPERATION_OP_UTILS_H_ +#define TVM_TE_OPERATION_OP_UTILS_H_ #include #include @@ -32,7 +32,7 @@ #include #include "../../tir/transforms/arg_binder.h" -#include "../../tir/transforms/ir_util.h" +#include "../../tir/transforms/ir_utils.h" #include "../schedule/message_passing.h" namespace tvm { @@ -101,4 +101,4 @@ tir::ForType IterVarTypeToForType(IterVarType iter_type); } // namespace te } // namespace tvm -#endif // TVM_TE_OPERATION_OP_UTIL_H_ +#endif // TVM_TE_OPERATION_OP_UTILS_H_ diff --git a/src/te/operation/scan_op.cc b/src/te/operation/scan_op.cc index 99b0edf60a7c..726714580b78 100644 --- a/src/te/operation/scan_op.cc +++ b/src/te/operation/scan_op.cc @@ -26,7 +26,7 @@ #include #include "../schedule/graph.h" -#include "op_util.h" +#include "op_utils.h" namespace tvm { namespace te { diff --git a/src/te/operation/tensor_compute_op.cc b/src/te/operation/tensor_compute_op.cc index f6f00584aa76..ecb2e860c3e6 100644 --- a/src/te/operation/tensor_compute_op.cc +++ b/src/te/operation/tensor_compute_op.cc @@ -31,7 +31,7 @@ #include #include "./compute_op.h" -#include "./op_util.h" +#include "./op_utils.h" namespace tvm { namespace te { diff --git a/src/te/operation/tensorize.cc b/src/te/operation/tensorize.cc index ab96ae8180f5..9733cd940a9f 100644 --- a/src/te/operation/tensorize.cc +++ b/src/te/operation/tensorize.cc @@ -28,7 +28,7 @@ #include "../schedule/message_passing.h" #include "compute_op.h" -#include "op_util.h" +#include "op_utils.h" namespace tvm { namespace te { diff --git a/src/te/schedule/operation_inline.cc b/src/te/schedule/operation_inline.cc index 7399af90f570..01d93c5ec8bd 100644 --- a/src/te/schedule/operation_inline.cc +++ b/src/te/schedule/operation_inline.cc @@ -29,7 +29,7 @@ #include -#include "../../tir/transforms/ir_util.h" +#include "../../tir/transforms/ir_utils.h" namespace tvm { namespace te { diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc index 78f0608a0d14..f335f953b7b2 100644 --- a/src/te/schedule/schedule_dataflow_rewrite.cc +++ b/src/te/schedule/schedule_dataflow_rewrite.cc @@ -27,7 +27,7 @@ #include -#include "../../tir/transforms/ir_util.h" +#include "../../tir/transforms/ir_utils.h" #include "message_passing.h" #include "operation_inline.h" diff --git a/src/te/schedule/schedule_ops.cc 
b/src/te/schedule/schedule_ops.cc index e5124dfdc965..a16d9bb73000 100644 --- a/src/te/schedule/schedule_ops.cc +++ b/src/te/schedule/schedule_ops.cc @@ -31,8 +31,8 @@ #include #include -#include "../../tir/transforms/ir_util.h" -#include "../operation/op_util.h" +#include "../../tir/transforms/ir_utils.h" +#include "../operation/op_utils.h" #include "graph.h" namespace tvm { diff --git a/src/tir/transforms/arg_binder.cc b/src/tir/transforms/arg_binder.cc index b88d2980b770..1faa6267b4fe 100644 --- a/src/tir/transforms/arg_binder.cc +++ b/src/tir/transforms/arg_binder.cc @@ -28,7 +28,7 @@ #include #include -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/coproc_sync.cc b/src/tir/transforms/coproc_sync.cc index 716ec625d5a8..9de9eaa8a639 100644 --- a/src/tir/transforms/coproc_sync.cc +++ b/src/tir/transforms/coproc_sync.cc @@ -29,7 +29,7 @@ #include #include -#include "ir_util.h" +#include "ir_utils.h" #include "storage_access.h" namespace tvm { diff --git a/src/tir/transforms/hoist_if_then_else.cc b/src/tir/transforms/hoist_if_then_else.cc index 4e7589c3a795..9db800c2a6d2 100644 --- a/src/tir/transforms/hoist_if_then_else.cc +++ b/src/tir/transforms/hoist_if_then_else.cc @@ -32,7 +32,7 @@ #include "../../arith/interval_set.h" #include "../../runtime/thread_storage_scope.h" -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/inject_double_buffer.cc b/src/tir/transforms/inject_double_buffer.cc index 9d5ee950cdfa..1eea43d27d46 100644 --- a/src/tir/transforms/inject_double_buffer.cc +++ b/src/tir/transforms/inject_double_buffer.cc @@ -26,7 +26,7 @@ #include #include -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc index d5405790a15a..c0a0b08f22a0 100644 --- a/src/tir/transforms/inject_virtual_thread.cc +++ b/src/tir/transforms/inject_virtual_thread.cc @@ -28,7 +28,7 @@ #include -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/ir_util.cc b/src/tir/transforms/ir_utils.cc similarity index 99% rename from src/tir/transforms/ir_util.cc rename to src/tir/transforms/ir_utils.cc index 4f21f0bb7411..744e9a4f8ed3 100644 --- a/src/tir/transforms/ir_util.cc +++ b/src/tir/transforms/ir_utils.cc @@ -18,10 +18,10 @@ */ /*! - * \file ir_util.cc + * \file ir_utils.cc * \brief Helper functions to construct and compose IR nodes. */ -#include "ir_util.h" +#include "ir_utils.h" #include diff --git a/src/tir/transforms/ir_util.h b/src/tir/transforms/ir_utils.h similarity index 97% rename from src/tir/transforms/ir_util.h rename to src/tir/transforms/ir_utils.h index 2f9d70659f4d..eb7a246957d2 100644 --- a/src/tir/transforms/ir_util.h +++ b/src/tir/transforms/ir_utils.h @@ -18,11 +18,11 @@ */ /*! - * \file ir_util.h + * \file ir_utils.h * \brief Helper functions to construct and compose IR nodes. 
*/ -#ifndef TVM_TIR_TRANSFORMS_IR_UTIL_H_ -#define TVM_TIR_TRANSFORMS_IR_UTIL_H_ +#ifndef TVM_TIR_TRANSFORMS_IR_UTILS_H_ +#define TVM_TIR_TRANSFORMS_IR_UTILS_H_ #include #include @@ -170,4 +170,4 @@ Stmt ConvertSSA(Stmt stmt); } // namespace tir } // namespace tvm -#endif // TVM_TIR_TRANSFORMS_IR_UTIL_H_ +#endif // TVM_TIR_TRANSFORMS_IR_UTILS_H_ diff --git a/src/tir/transforms/lift_attr_scope.cc b/src/tir/transforms/lift_attr_scope.cc index 1a1279f0640a..44b121a7b559 100644 --- a/src/tir/transforms/lift_attr_scope.cc +++ b/src/tir/transforms/lift_attr_scope.cc @@ -27,7 +27,7 @@ #include #include -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc index 23f41e1676a6..68c43fac1170 100644 --- a/src/tir/transforms/loop_partition.cc +++ b/src/tir/transforms/loop_partition.cc @@ -33,7 +33,7 @@ #include "../../arith/interval_set.h" #include "../../runtime/thread_storage_scope.h" -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/lower_device_storage_access_info.cc b/src/tir/transforms/lower_device_storage_access_info.cc index fac50a08a9b7..3b317e3f9968 100644 --- a/src/tir/transforms/lower_device_storage_access_info.cc +++ b/src/tir/transforms/lower_device_storage_access_info.cc @@ -30,7 +30,7 @@ #include #include "../../runtime/thread_storage_scope.h" -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/lower_thread_allreduce.cc b/src/tir/transforms/lower_thread_allreduce.cc index bd216bb1c6cb..720c9d0a67e0 100644 --- a/src/tir/transforms/lower_thread_allreduce.cc +++ b/src/tir/transforms/lower_thread_allreduce.cc @@ -32,7 +32,7 @@ #include #include "../../runtime/thread_storage_scope.h" -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index c8df122d40b5..39e6640eece6 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -29,7 +29,7 @@ #include -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 3fae2bbf40c8..3cacf52d90d2 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -36,7 +36,7 @@ #include #include "arg_binder.h" -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/storage_access.cc b/src/tir/transforms/storage_access.cc index f9adfb82a33f..6514a834b397 100644 --- a/src/tir/transforms/storage_access.cc +++ b/src/tir/transforms/storage_access.cc @@ -28,7 +28,7 @@ #include #include -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/storage_flatten.cc b/src/tir/transforms/storage_flatten.cc index 7475bf6d2f8e..c062cf73aeef 100644 --- a/src/tir/transforms/storage_flatten.cc +++ b/src/tir/transforms/storage_flatten.cc @@ -41,7 +41,7 @@ #include "../../arith/ir_visitor_with_analyzer.h" #include "../../runtime/thread_storage_scope.h" #include "arg_binder.h" -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index 09d96510f7f0..3abff415d1f1 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ 
b/src/tir/transforms/storage_rewrite.cc @@ -36,7 +36,7 @@ #include #include "../../runtime/thread_storage_scope.h" -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/tir/transforms/tensorcore_infer_fragment.cc b/src/tir/transforms/tensorcore_infer_fragment.cc index 1b3b3c44ff9c..81c8645f3dda 100644 --- a/src/tir/transforms/tensorcore_infer_fragment.cc +++ b/src/tir/transforms/tensorcore_infer_fragment.cc @@ -30,7 +30,7 @@ #include #include "../../runtime/thread_storage_scope.h" -#include "ir_util.h" +#include "ir_utils.h" #include "storage_access.h" namespace tvm { diff --git a/src/tir/transforms/thread_storage_sync.cc b/src/tir/transforms/thread_storage_sync.cc index 4893748bf522..05ee8146cbd8 100644 --- a/src/tir/transforms/thread_storage_sync.cc +++ b/src/tir/transforms/thread_storage_sync.cc @@ -31,7 +31,7 @@ #include #include "../../runtime/thread_storage_scope.h" -#include "ir_util.h" +#include "ir_utils.h" #include "storage_access.h" namespace tvm { diff --git a/src/tir/transforms/unroll_loop.cc b/src/tir/transforms/unroll_loop.cc index a15190665949..122654149f24 100644 --- a/src/tir/transforms/unroll_loop.cc +++ b/src/tir/transforms/unroll_loop.cc @@ -33,7 +33,7 @@ #include #include -#include "ir_util.h" +#include "ir_utils.h" namespace tvm { namespace tir { diff --git a/src/topi/broadcast.cc b/src/topi/broadcast.cc index a06d91401580..f6a28c7722af 100644 --- a/src/topi/broadcast.cc +++ b/src/topi/broadcast.cc @@ -24,7 +24,7 @@ #include #include #include -#include +#include namespace tvm { namespace topi { diff --git a/src/topi/reduction.cc b/src/topi/reduction.cc index b5c6690e1676..55c59162e68c 100644 --- a/src/topi/reduction.cc +++ b/src/topi/reduction.cc @@ -24,7 +24,7 @@ #include #include #include -#include +#include namespace tvm { namespace topi { diff --git a/src/topi/transform.cc b/src/topi/transform.cc index d79952e2494f..19243803cdc9 100644 --- a/src/topi/transform.cc +++ b/src/topi/transform.cc @@ -24,7 +24,7 @@ #include #include #include -#include +#include namespace tvm { namespace topi { diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index a67b4c3dcd14..a5f8c8252571 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -34,7 +34,7 @@ #include "src/runtime/c_runtime_api.cc" #include "src/runtime/cpu_device_api.cc" -#include "src/runtime/file_util.cc" +#include "src/runtime/file_utils.cc" #include "src/runtime/graph/graph_runtime.cc" #include "src/runtime/library_module.cc" #include "src/runtime/module.cc" From e5cb917f8e6fe2412bac14b2304426d30076331a Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Wed, 14 Oct 2020 20:49:13 -0700 Subject: [PATCH 006/258] [Relay][Frontend][Onnx] Allow A to B broadcasting of batch_matmul and reverse strided slice (#6681) * slice and batch_matmul fixes. * Bug fix in shape inference. * Test backwards strided slice. * Fix batch_matmul dynamic shape function. * formatting. 
* Fix edge case for implicit broadcast --- python/tvm/relay/frontend/onnx.py | 33 ++++++++++++++++++---- python/tvm/relay/op/_transform.py | 2 ++ python/tvm/relay/op/dyn/_transform.py | 2 ++ python/tvm/relay/op/nn/_nn.py | 5 +++- src/relay/op/dyn/nn/pad.cc | 6 ---- src/relay/op/nn/nn.cc | 6 +++- tests/python/frontend/onnx/test_forward.py | 18 +++++++----- tests/python/relay/test_op_level4.py | 3 ++ 8 files changed, 54 insertions(+), 21 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 0598094398f7..9fae94b5a8a1 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -513,9 +513,11 @@ def _impl_v1(cls, inputs, attr, params): assert len(inputs) == 2, "MatMul op take 2 inputs, {} given".format(len(inputs)) # Need to check input shape as batch matmul must be supported. a_shape = _op.shape_of(inputs[0]) + a_rank = infer_shape(a_shape)[0] + b_shape = _op.shape_of(inputs[1]) + b_rank = infer_shape(b_shape)[0] # When performing a batch matmul, we need to properly handle N-dim shapes. - if infer_shape(a_shape)[0] > 2: - b_shape = _op.shape_of(inputs[1]) + if a_rank > 2 or b_rank > 2: def flatten_to_3d(x, x_shape): ndims = infer_shape(x_shape)[0] @@ -532,10 +534,31 @@ def flatten_to_3d(x, x_shape): b = _op.transpose(b, [0, 2, 1]) # Perform a batch matmul. output = _op.nn.batch_matmul(a, b) + # Determine the output batch dimension. + if a_rank > b_rank: + out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2]) + elif a_rank < b_rank: + out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2]) + # If it is unclear how broadcasting should be applied, the output + # shape is determined by choosing the maximum value from each input. + else: + out_batch = _op.concatenate( + [ + _op.maximum( + _op.strided_slice(a_shape, [i], [i + 1]), + _op.strided_slice(b_shape, [i], [i + 1]), + ) + for i in range(a_rank - 2) + ], + 0, + ) # Reshape output to original dimensions.
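+ # For example, with a_shape = (2, 3, 4, 3) and b_shape = (3, 4), a_rank > b_rank,
+ # so out_batch recovers (2, 3) from a_shape, and the reshape below yields
+ # (2, 3, 4, 4), the same result numpy.matmul broadcasting would produce
+ # (this case is exercised by verify_batch_matmul in test_forward.py).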
final_shape = _op.concatenate( [ - _op.strided_slice(a_shape, [0], [infer_shape(a_shape)[0] - 1]), + out_batch, + _op.strided_slice( + a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1] + ), _op.strided_slice( b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]] ), @@ -684,9 +707,7 @@ def _impl_v11(cls, inputs, attr, params): else: value = 0 - pads_shape = infer_shape(pads) - dims = int(pads_shape[0] / 2) - pad_width_expr = _op.transpose(_op.reshape(pads, (2, dims))) + pad_width_expr = _op.transpose(_op.reshape(pads, (2, -1))) pad_mode = attr.get("mode", b"constant").decode("utf-8") if not pad_mode in ["constant", "edge", "reflect"]: diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index be0382aae8d2..3b70d78cf967 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -172,6 +172,8 @@ def _strided_slice_shape_func_input_shape(data_shape, begin, end, strides, slice else: if end[i] > data_shape[i]: cend = int64(data_shape[i]) + elif end[i] < -data_shape[i]: + cend = int64(-1) else: cend = int64(end[i]) if cend < 0: diff --git a/python/tvm/relay/op/dyn/_transform.py b/python/tvm/relay/op/dyn/_transform.py index dedd3dfb66d7..559d63acaefd 100644 --- a/python/tvm/relay/op/dyn/_transform.py +++ b/python/tvm/relay/op/dyn/_transform.py @@ -173,6 +173,8 @@ def _strided_slice_shape_func_input_data(data, begin, end, strides, slice_mode): else: if end[i] > data.shape[i]: cend = int64(data.shape[i]) + elif end[i] < -data.shape[i]: + cend = int64(-1) else: cend = int64(end[i]) if cend < 0: diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 9e47dc0a17f1..e1aabe1e15b5 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -982,7 +982,10 @@ def dense_shape_func(attrs, inputs, _): def _batch_matmul_shape_func(data_shape, weight_shape): out = output_tensor((data_shape.shape[0],), "int64") for i in const_range(out.shape[0] - 1): - out[i] = data_shape[i] + if i == 0: + out[i] = max(data_shape[i], weight_shape[i]) + else: + out[i] = data_shape[i] out[out.shape[0] - 1] = weight_shape[weight_shape.shape[0] - 2] return out diff --git a/src/relay/op/dyn/nn/pad.cc b/src/relay/op/dyn/nn/pad.cc index 8a17f50df0df..73daccbd97fd 100644 --- a/src/relay/op/dyn/nn/pad.cc +++ b/src/relay/op/dyn/nn/pad.cc @@ -57,12 +57,6 @@ bool PadRel(const Array& types, int num_inputs, const Attrs& attrs, int pad_width_rank = pad_width->shape.size(); CHECK_EQ(pad_width_rank, 2) << "Pad width must be 2D"; - auto pad_width_dim1 = pad_width->shape[0].as(); - auto pad_width_dim2 = pad_width->shape[1].as(); - - CHECK(pad_width_dim1->value == data_rank && pad_width_dim2->value == 2) - << "Pad width must have shape (N, 2), where N is the rank of input data"; - const PadAttrs* param = attrs.as(); CHECK(param != nullptr); diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index cbd7ae47acd7..58dfab27a933 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -859,7 +859,11 @@ bool BatchMatmulRel(const Array& types, int num_inputs, const Attrs& attrs is_dyn = true; oshape.push_back(Any()); } else { - oshape.push_back(x->shape[i]); + if (i == 0) { + oshape.push_back(max(x->shape[i], y->shape[i])); + } else { + oshape.push_back(x->shape[i]); + } } } if (!is_dyn) { diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index ae32012e42e8..07e6dc465268 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ 
b/tests/python/frontend/onnx/test_forward.py @@ -992,10 +992,9 @@ def test_matmul(): tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) -def verify_batch_matmul(a_shape, b_shape, target, ctx): +def verify_batch_matmul(a_shape, b_shape, out_shape, target, ctx): a_array = np.random.uniform(size=a_shape).astype("float32") b_array = np.random.uniform(size=b_shape).astype("float32") - out_np = np.matmul(a_array, b_array) mul_node = helper.make_node("MatMul", ["a", "b"], ["out"]) @@ -1006,21 +1005,26 @@ def verify_batch_matmul(a_shape, b_shape, target, ctx): helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)), helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape)), ], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_np.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, out_shape)], ) model = helper.make_model(graph, producer_name="matmul_test") + onnx_out = get_onnxruntime_output(model, [a_array, b_array], "float32")[0] tvm_out = get_tvm_output_with_vm(model, [a_array, b_array], target, ctx) - tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) + tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5) # TODO(mbrookhart): enable cuda once VM supports heterogenous execution @tvm.testing.parametrize_targets("llvm") def test_batch_matmul(target, ctx): - verify_batch_matmul((2, 3, 4, 3), (2, 3, 3, 4), target, ctx) - verify_batch_matmul((2, 4, 3), (3, 4), target, ctx) - verify_batch_matmul((2, 3, 4, 3), (3, 4), target, ctx) + verify_batch_matmul((2, 3, 4, 3), (2, 3, 3, 4), (2, 3, 4, 4), target, ctx) + verify_batch_matmul((2, 4, 3), (3, 4), (2, 4, 4), target, ctx) + verify_batch_matmul((2, 3, 4, 3), (3, 4), (2, 3, 4, 4), target, ctx) + # Test implicit broadcasting. + verify_batch_matmul((4, 3), (2, 3, 4), (2, 4, 4), target, ctx) + verify_batch_matmul((2, 4, 3), (1, 3, 4), (2, 4, 4), target, ctx) + verify_batch_matmul((1, 4, 3), (2, 3, 4), (2, 4, 4), target, ctx) def verify_simple_dynamic_model(a_shape, b_shape, target, ctx): diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 0df5a286b4e7..eafc743634d8 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -392,6 +392,9 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3)) verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3)) + # Test backwards slicing. + verify((3, 4, 3), [-1, -1, -1], [-5, -5, -5], [-1, -1, -1], (3, 4, 3)) + # Test slice mode. verify( (3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 1], (2, 4, 3), slice_mode="size", test_ref=False ) From 3cec24a8e47998e5f63bdd5d7943b953ab9f1a77 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 15 Oct 2020 00:55:45 -0700 Subject: [PATCH 007/258] =?UTF-8?q?Add=20=C2=B5TVM=20Zephyr=20support=20+?= =?UTF-8?q?=20QEMU=20regression=20test=20(#6603)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Split transport classes into transport package. * Introduce transport timeouts. * black format * Add metadata-only artifacts * Simplify utvm rpc server API and ease handling of short packets. 
* add zephyr test against qemu * Add qemu build config * fix typo * cleanup zephyr main * fix nonblocking piping on some linux kernels * don't double-open transport * validate FD are in non-blocking mode * gitignore test debug files * cleanup zephyr compiler * re-comment serial until added * remove logging * add zephyr exclusions to check_file_type * add asf header * lint * black format * more pylint * kill utvm rpc_server bindings, which don't work anymore and fail pylint * fix compiler warning * fixes related to pylint * clang-format again * more black format * add qemu regression * Fix paths for qemu/ dir * fix typo * fix SETFL logic * export SessionTerminatedError and update except after moving * fix test_micro_artifact * retrigger staging CI * fix jenkins syntax hopefully * one last syntax error * Add ci_qemu to Jenkinsfile * build in qemu * address liangfu comments * fix new bug with list passing * retrigger CI --- include/tvm/runtime/crt/error_codes.h | 1 + include/tvm/runtime/crt/utvm_rpc_server.h | 24 +- python/tvm/exec/rpc_server.py | 69 -- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/artifact.py | 108 ++- python/tvm/micro/contrib/__init__.py | 16 + python/tvm/micro/contrib/base.py | 67 ++ python/tvm/micro/contrib/zephyr.py | 621 ++++++++++++++++++ python/tvm/micro/debugger.py | 25 +- python/tvm/micro/micro_binary.py | 15 +- python/tvm/micro/micro_library.py | 13 +- python/tvm/micro/session.py | 50 +- python/tvm/micro/transport.py | 238 ------- python/tvm/micro/transport/__init__.py | 26 + python/tvm/micro/transport/base.py | 299 +++++++++ python/tvm/micro/transport/debug.py | 63 ++ python/tvm/micro/transport/file_descriptor.py | 105 +++ python/tvm/micro/transport/subprocess.py | 67 ++ python/tvm/micro/transport/wakeup.py | 74 +++ src/runtime/crt/host/main.cc | 19 +- src/runtime/crt/utvm_rpc_server/rpc_server.cc | 50 +- src/runtime/micro/micro_session.cc | 136 +++- tests/lint/check_file_type.py | 3 + tests/micro/qemu/.gitignore | 2 + tests/micro/qemu/test_zephyr.py | 143 ++++ tests/micro/qemu/zephyr-runtime/.gitignore | 3 + .../micro/qemu/zephyr-runtime/CMakeLists.txt | 27 + .../qemu/zephyr-runtime/crt/crt_config.h | 64 ++ tests/micro/qemu/zephyr-runtime/prj.conf | 31 + .../zephyr-runtime/qemu-hack/qemu-system-i386 | 33 + tests/micro/qemu/zephyr-runtime/sample.yaml | 22 + tests/micro/qemu/zephyr-runtime/src/main.c | 238 +++++++ tests/python/unittest/test_crt.py | 3 +- tests/python/unittest/test_micro_artifact.py | 137 ++++ tests/scripts/task_python_microtvm.sh | 9 + 35 files changed, 2383 insertions(+), 420 deletions(-) create mode 100644 python/tvm/micro/contrib/__init__.py create mode 100644 python/tvm/micro/contrib/base.py create mode 100644 python/tvm/micro/contrib/zephyr.py delete mode 100644 python/tvm/micro/transport.py create mode 100644 python/tvm/micro/transport/__init__.py create mode 100644 python/tvm/micro/transport/base.py create mode 100644 python/tvm/micro/transport/debug.py create mode 100644 python/tvm/micro/transport/file_descriptor.py create mode 100644 python/tvm/micro/transport/subprocess.py create mode 100644 python/tvm/micro/transport/wakeup.py create mode 100644 tests/micro/qemu/.gitignore create mode 100644 tests/micro/qemu/test_zephyr.py create mode 100644 tests/micro/qemu/zephyr-runtime/.gitignore create mode 100644 tests/micro/qemu/zephyr-runtime/CMakeLists.txt create mode 100644 tests/micro/qemu/zephyr-runtime/crt/crt_config.h create mode 100644 tests/micro/qemu/zephyr-runtime/prj.conf create mode 100755 
tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386 create mode 100644 tests/micro/qemu/zephyr-runtime/sample.yaml create mode 100644 tests/micro/qemu/zephyr-runtime/src/main.c create mode 100644 tests/python/unittest/test_micro_artifact.py diff --git a/include/tvm/runtime/crt/error_codes.h b/include/tvm/runtime/crt/error_codes.h index e01304061313..16d0e793848b 100644 --- a/include/tvm/runtime/crt/error_codes.h +++ b/include/tvm/runtime/crt/error_codes.h @@ -72,6 +72,7 @@ typedef enum { // Platform kTvmErrorPlatformCheckFailure = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 0), kTvmErrorPlatformMemoryManagerInitialized = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 1), + kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2), // System errors are always negative integers; this mask indicates presence of a system error. // Cast tvm_crt_error_t to a signed integer to interpret the negative error code. diff --git a/include/tvm/runtime/crt/utvm_rpc_server.h b/include/tvm/runtime/crt/utvm_rpc_server.h index 314463ac8652..e1cc05906932 100644 --- a/include/tvm/runtime/crt/utvm_rpc_server.h +++ b/include/tvm/runtime/crt/utvm_rpc_server.h @@ -27,6 +27,7 @@ #include #include +#include #ifdef __cplusplus extern "C" { @@ -66,24 +67,17 @@ utvm_rpc_server_t UTvmRpcServerInit(uint8_t* memory, size_t memory_size_bytes, size_t page_size_bytes_log2, utvm_rpc_channel_write_t write_func, void* write_func_ctx); -/*! \brief Copy received data into an internal buffer for processing. - * - * Currently only handles 1 byte of data. In the future, the goal of this function is to be safe to - * invoke from an ISR. At that time, this function will just append to an internal buffer. - * - * \param server The TVM RPC Server pointer. - * \param byte The received byte of data. - * \return The number of bytes copied to the internal buffer. May be less than data_size_bytes when - * the internal buffer fills. - */ -size_t UTvmRpcServerReceiveByte(utvm_rpc_server_t server, uint8_t byte); - -/*! \brief Perform normal processing of received data. +/*! \brief Do any tasks suitable for the main thread, and maybe process new incoming data. * * \param server The TVM RPC Server pointer. - * \return true while the server is still running. false when it shuts down gracefully. + * \param new_data If not nullptr, a pointer to a buffer pointer, which should point at new input + * data to process. On return, updated to point past data that has been consumed. + * \param new_data_size_bytes Points to the number of valid bytes in `new_data`. On return, + * updated to the number of unprocessed bytes remaining in `new_data` (usually 0). + * \return An error code indicating the outcome of the server main loop iteration. */ -bool UTvmRpcServerLoop(utvm_rpc_server_t server); +tvm_crt_error_t UTvmRpcServerLoop(utvm_rpc_server_t server, uint8_t** new_data, + size_t* new_data_size_bytes); #ifdef __cplusplus } diff --git a/python/tvm/exec/rpc_server.py b/python/tvm/exec/rpc_server.py index 345e44718ac3..9692b98fe22b 100644 --- a/python/tvm/exec/rpc_server.py +++ b/python/tvm/exec/rpc_server.py @@ -19,12 +19,9 @@ from __future__ import absolute_import import argparse -import ast -import json import multiprocessing import sys import logging -import tvm from .. 
import rpc @@ -45,9 +42,6 @@ def main(args): else: tracker_addr = None - if args.utvm_dev_config or args.utvm_dev_id: - init_utvm(args) - server = rpc.Server( args.host, args.port, @@ -61,40 +55,6 @@ def main(args): server.proc.join() -def init_utvm(args): - """MicroTVM-specific RPC initialization - - Parameters - ---------- - args : argparse.Namespace - parsed args from command-line invocation - """ - from tvm import micro # pylint: disable=import-outside-toplevel - - if args.utvm_dev_config and args.utvm_dev_id: - raise RuntimeError("only one of --utvm-dev-config and --utvm-dev-id allowed") - - if args.utvm_dev_config: - with open(args.utvm_dev_config, "r") as dev_conf_file: - dev_config = json.load(dev_conf_file) - else: - dev_config_args = ast.literal_eval(args.utvm_dev_config_args) - generate_config_func = micro.device.get_device_funcs(args.utvm_dev_id)["generate_config"] - dev_config = generate_config_func(*dev_config_args) - - if args.utvm_dev_config or args.utvm_dev_id: - # add MicroTVM overrides - @tvm.register_func("tvm.rpc.server.start", override=True) - def server_start(): - # pylint: disable=unused-variable - session = micro.Session(dev_config) - session._enter() - - @tvm.register_func("tvm.rpc.server.shutdown", override=True) - def server_shutdown(): - session._exit() - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="0.0.0.0", help="the hostname of the server") @@ -121,35 +81,6 @@ def server_shutdown(): parser.add_argument( "--custom-addr", type=str, help="Custom IP Address to Report to RPC Tracker" ) - parser.add_argument( - "--utvm-dev-config", - type=str, - help=( - "JSON config file for the target device (if using MicroTVM). " - "This file should contain serialized output similar to that returned " - "from the device module's generate_config. Can't be specified when " - "--utvm-dev-config-args is specified." - ), - ) - parser.add_argument( - "--utvm-dev-config-args", - type=str, - help=( - "Arguments to the device module's generate_config function. " - "Must be a python literal parseable by literal_eval. If specified, " - "the device configuration is generated using the device module's " - "generate_config. Can't be specified when --utvm-dev-config is " - "specified." - ), - ) - parser.add_argument( - "--utvm-dev-id", - type=str, - help=( - "Unique ID for the target device (if using MicroTVM). Should " - "match the name of a module underneath tvm.micro.device)." 
- ), - ) parser.set_defaults(fork=True) args = parser.parse_args() diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index 30f81e76f697..a6e24343e378 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -23,5 +23,5 @@ from .debugger import GdbRemoteDebugger from .micro_library import MicroLibrary from .micro_binary import MicroBinary -from .session import create_local_graph_runtime, Session +from .session import create_local_graph_runtime, Session, SessionTerminatedError from .transport import TransportLogger, DebugWrapperTransport, SubprocessTransport diff --git a/python/tvm/micro/artifact.py b/python/tvm/micro/artifact.py index 78939760c42b..c8faccb3f512 100644 --- a/python/tvm/micro/artifact.py +++ b/python/tvm/micro/artifact.py @@ -17,6 +17,7 @@ """"Defines abstractions around compiler artifacts produced in compiling micro TVM binaries.""" +import hashlib import io import os import json @@ -36,11 +37,53 @@ class ArtifactBadArchiveError(Exception): """Raised when an artifact archive is malformed.""" +class ImmobileArtifactError(Exception): + """Raised when an artifact is declared immobile and thus cannot be archived.""" + + +class ArchiveModifiedError(Exception): + """Raised when the underlying files in a metadata-only archive were modified after archiving.""" + + +def sha256_hexdigest(path): + with open(path, "rb") as path_fd: + h = hashlib.sha256() + chunk = path_fd.read(1 * 1024 * 1024) + while chunk: + h.update(chunk) + chunk = path_fd.read(1 * 1024 * 1024) + + return h.hexdigest() + + +def _validate_metadata_only(metadata): + """Validate that the files in a metadata-only archive have not changed.""" + problems = [] + for files in metadata["labelled_files"].values(): + for f in files: + disk_path = os.path.join(metadata["base_dir"], f) + try: + sha = sha256_hexdigest(disk_path) + except FileNotFoundError: + problems.append(f"{f}: original file not found") + continue + + expected_sha = metadata["file_digests"][f] + if sha != expected_sha: + problems.append(f"{f}: sha256 mismatch: expected {expected_sha}, got {sha}") + + if problems: + raise ArchiveModifiedError( + "Files in metadata-only archive have been modified:\n" + + "\n".join([f" * {p}" for p in problems]) + ) + + class Artifact: """Describes a compiler artifact and defines common logic to archive it for transport.""" # A version number written to the archive. - ENCODING_VERSION = 1 + ENCODING_VERSION = 2 # A unique string identifying the type of artifact in an archive. Subclasses must redefine this # variable. @@ -55,7 +98,8 @@ def unarchive(cls, archive_path, base_dir): archive_path : str Path to the archive file. base_dir : str - Path to a non-existent, empty directory under which the artifact will live. + Path to a non-existent, empty directory under which the artifact will live. If working + with a metadata-only archive, this directory will just hold the metadata.json. 
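+ (If the archive was created with metadata_only=True, the listed files are
+ validated against their recorded sha256 digests at unarchive time, and
+ ArchiveModifiedError is raised if any of them changed on disk.)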
Returns ------- @@ -92,6 +136,10 @@ def unarchive(cls, archive_path, base_dir): f"archive version: expect {cls.EXPECTED_VERSION}, found {version}" ) + metadata_only = metadata.get("metadata_only") + if metadata_only: + _validate_metadata_only(metadata) + os.rename(os.path.join(temp_dir, temp_dir_contents[0]), base_dir) artifact_cls = cls @@ -103,16 +151,19 @@ def unarchive(cls, archive_path, base_dir): break return artifact_cls.from_unarchived( - base_dir, metadata["labelled_files"], metadata["metadata"] + base_dir if not metadata_only else metadata["base_dir"], + metadata["labelled_files"], + metadata["metadata"], + immobile=metadata.get("immobile"), ) finally: shutil.rmtree(temp_dir) @classmethod - def from_unarchived(cls, base_dir, labelled_files, metadata): - return cls(base_dir, labelled_files, metadata) + def from_unarchived(cls, base_dir, labelled_files, metadata, immobile): + return cls(base_dir, labelled_files, metadata, immobile) - def __init__(self, base_dir, labelled_files, metadata): + def __init__(self, base_dir, labelled_files, metadata, immobile=False): """Create a new artifact. Parameters @@ -123,10 +174,16 @@ def __init__(self, base_dir, labelled_files, metadata): A dict mapping a file label to the relative paths of the files that carry that label. metadata : Dict A dict containing arbitrary JSON-serializable key-value data describing the artifact. + immobile : bool + True when this artifact can't be used after being moved out of its current location on + disk. This can happen when artifacts contain absolute paths or when it's not feasible to + include enough files in the artifact to reliably re-run commands in arbitrary locations. + Setting this flag will cause archive() to raise ImmobileArtifactError. """ self.base_dir = os.path.realpath(base_dir) self.labelled_files = labelled_files self.metadata = metadata + self.immobile = immobile for label, files in labelled_files.items(): for f in files: @@ -158,7 +215,7 @@ def label(self, label): def label_abspath(self, label): return [self.abspath(p) for p in self.labelled_files[label]] - def archive(self, archive_path): + def archive(self, archive_path, metadata_only=False): """Create a relocatable tar archive of the artifacts. Parameters @@ -166,12 +223,24 @@ def archive(self, archive_path): archive_path : str Path to the tar file to create. Or, path to a directory, under which a tar file will be created named {base_dir}.tar. + metadata_only : bool + If true, don't archive artifacts; instead, just archive metadata plus original + base_path. A metadata-only archive can be unarchived and used like a regular archive + provided none of the files have changed in their original locations on-disk. Returns ------- str : The value of archive_path, after potentially making the computation described above. + + Raises + ------ + ImmobileArtifactError : + When immobile=True was passed to the constructor.
""" + if self.immobile and not metadata_only: + raise ImmobileArtifactError("This artifact can't be moved") + if os.path.isdir(archive_path): archive_path = os.path.join(archive_path, f"{os.path.basename(self.base_dir)}.tar") @@ -185,17 +254,24 @@ def _add_file(name, data, f_type): tar_info.size = len(data) tar_f.addfile(tar_info, io.BytesIO(data_bytes)) + metadata = { + "version": self.ENCODING_VERSION, + "labelled_files": self.labelled_files, + "metadata": self.metadata, + "metadata_only": False, + } + if metadata_only: + metadata["metadata_only"] = True + metadata["base_dir"] = self.base_dir + metadata["immobile"] = self.immobile + metadata["file_digests"] = {} + for files in self.labelled_files.values(): + for f in files: + metadata["file_digests"][f] = sha256_hexdigest(self.abspath(f)) + _add_file( f"{archive_name}/metadata.json", - json.dumps( - { - "version": self.ENCODING_VERSION, - "labelled_files": self.labelled_files, - "metadata": self.metadata, - }, - indent=2, - sort_keys=True, - ), + json.dumps(metadata, indent=2, sort_keys=True), tarfile.REGTYPE, ) for dir_path, _, files in os.walk(self.base_dir): diff --git a/python/tvm/micro/contrib/__init__.py b/python/tvm/micro/contrib/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/python/tvm/micro/contrib/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/tvm/micro/contrib/base.py b/python/tvm/micro/contrib/base.py new file mode 100644 index 000000000000..9c4f4863e3bc --- /dev/null +++ b/python/tvm/micro/contrib/base.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Defines common helper functions useful for integrating custom compiler toolchains.""" + +import glob +import os +import shutil + + +GLOB_PATTERNS = ["__tvm_*", "libtvm__*"] + + +def populate_tvm_objs(dest_dir, objs): + """Replace tvm-prefixed files in a build worktree. + + This function is intended to be used to place TVM source files and libraries into a + template on-device runtime project. 
+ + Parameters + ---------- + dest_dir : str + Path to the destination directory. + + objs : List[MicroLibrary] + List of MicroLibrary to place in the project directory. + + Returns + ------- + List[str] : + List of paths, each relative to `dest_dir` to the newly-copied MicroLibrary files. + """ + copied = [] + for p in GLOB_PATTERNS: + for f in glob.glob(os.path.join(dest_dir, p)): + if os.path.isdir(f): + shutil.rmtree(f) + else: + os.unlink(f) + + for obj in objs: + for lib_file in obj.library_files: + obj_base = os.path.basename(lib_file) + if obj_base.endswith(".a"): + dest_basename = f"libtvm__{obj_base}" + else: + dest_basename = f"__tvm_{obj_base}" + + copied.append(dest_basename) + dest = os.path.join(dest_dir, dest_basename) + shutil.copy(obj.abspath(lib_file), dest) + + return copied diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py new file mode 100644 index 000000000000..25b3a6bd4f48 --- /dev/null +++ b/python/tvm/micro/contrib/zephyr.py @@ -0,0 +1,621 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Defines a compiler integration that uses an externally-supplied Zephyr project.""" + +import collections +import logging +import multiprocessing +import os +import re +import tempfile +import termios +import textwrap +import signal +import shlex +import shutil +import subprocess +import sys + +import yaml + +import tvm.micro +from . import base +from .. import compiler +from .. import debugger +from ..transport import debug +from ..transport import file_descriptor + +from ..transport import Transport, TransportClosedError, TransportTimeouts +from ..transport import wakeup + + +_LOG = logging.getLogger(__name__) + + +class SubprocessEnv(object): + def __init__(self, default_overrides): + self.default_overrides = default_overrides + + def run(self, cmd, **kw): + env = dict(os.environ) + for k, v in self.default_overrides.items(): + env[k] = v + + return subprocess.check_output(cmd, env=env, **kw) + + +class FlashRunnerNotSupported(Exception): + """Raised when the FLASH_RUNNER for a project isn't supported by this Zephyr adapter.""" + + +class ZephyrCompiler(tvm.micro.Compiler): + """A Compiler instance that builds against a pre-existing zephyr project.""" + + def __init__( + self, + project_dir=None, + board=None, + west_cmd=None, + zephyr_base=None, + zephyr_toolchain_variant=None, + env_vars=None, + ): + """Configure the compiler for use. + + Parameters + ---------- + project_dir : str + Path to the pre-existing Zephyr project. + board : str + Name of the Zephyr board to build for (i.e. passed to `west build -b`) + west_cmd : Optional[list] + If given, argv that invoke the west build tool. Used only for flashing. 
+ zephyr_base : Optional[str] + If given, path to Zephyr, as would normally be present in the ZEPHYR_BASE environment + variable. If not given, consults this environment variable. This value must be set in + one of those two places. + zephyr_toolchain_variant: Optional[str] + If given, overrides the toolchain used by Zephyr. If not given, uses the default + zephyr toolchain. When running on OS X outside of docker, you need to specify this. + env_vars : Optional[Dict[str,str]] + If given, additional environment variables present when invoking west, cmake, or make. + """ + self._project_dir = project_dir + self._board = board + if west_cmd is None: + self._west_cmd = [sys.executable, "-mwest.app.main"] + elif isinstance(west_cmd, str): + self._west_cmd = [west_cmd] + elif isinstance(west_cmd, list): + self._west_cmd = west_cmd + else: + raise TypeError("west_cmd: expected string, list, or None; got %r" % (west_cmd,)) + + env = {} + if zephyr_toolchain_variant is not None: + env["ZEPHYR_TOOLCHAIN_VARIANT"] = zephyr_toolchain_variant + + self._zephyr_base = zephyr_base or os.environ["ZEPHYR_BASE"] + assert ( + self._zephyr_base is not None + ), f"Must specify zephyr_base=, or ZEPHYR_BASE must be in environment variables" + env["ZEPHYR_BASE"] = self._zephyr_base + + if env_vars: + env.update(env_vars) + + self._subprocess_env = SubprocessEnv(env) + + OPT_KEY_TO_CMAKE_DEFINE = { + "cflags": "CFLAGS", + "ccflags": "CXXFLAGS", + "ldflags": "LDFLAGS", + } + + @classmethod + def _options_to_cmake_args(cls, options): + args = [] + for key, define in cls.OPT_KEY_TO_CMAKE_DEFINE.items(): + if key in options: + quoted_opts = [shlex.quote(o).replace(";", "\\;") for o in options[key]] + args.append(f'-DEXTRA_{define}={" ".join(quoted_opts)}') + + if "cmake_args" in options: + args.extend(options["cmake_args"]) + + return args + + def library(self, output, sources, options=None): + project_name = os.path.basename(output) + if project_name.startswith("lib"): + project_name = project_name[3:] + + lib_prj_conf = os.path.join(output, "prj.conf") + if self._project_dir is not None: + project_dir_conf = os.path.join(self._project_dir, "prj.conf") + if os.path.exists(project_dir_conf): + shutil.copy(project_dir_conf, lib_prj_conf) + else: + with open(lib_prj_conf, "w") as prj_conf_f: + prj_conf_f.write("CONFIG_CPLUSPLUS=y\n") + + cmakelists_path = os.path.join(output, "CMakeLists.txt") + with open(cmakelists_path, "w") as cmake_f: + sources = " ".join(f'"{o}"' for o in sources) + cmake_f.write( + textwrap.dedent( + f"""\ + cmake_minimum_required(VERSION 3.13.1) + + find_package(Zephyr HINTS $ENV{{ZEPHYR_BASE}}) + project({project_name}_prj) + target_sources(app PRIVATE) + zephyr_library_named({project_name}) + target_sources({project_name} PRIVATE {sources}) + target_sources(app PRIVATE main.c) + target_link_libraries(app PUBLIC {project_name}) + """ + ) + ) + if "include_dirs" in options: + cmake_f.write( + f"target_include_directories({project_name} PRIVATE " + f'{" ".join(os.path.abspath(d) for d in options["include_dirs"])})\n' + ) + + with open(os.path.join(output, "main.c"), "w"): + pass + + # expected not to exist after populate_tvm_objs + build_dir = os.path.join(output, "__tvm_build") + os.mkdir(build_dir) + self._subprocess_env.run( + ["cmake", "..", f"-DBOARD={self._board}"] + self._options_to_cmake_args(options), + cwd=build_dir, + ) + num_cpus = multiprocessing.cpu_count() + self._subprocess_env.run( + ["make", f"-j{num_cpus}", "VERBOSE=1", project_name], cwd=build_dir + ) + return
tvm.micro.MicroLibrary(build_dir, [f"lib{project_name}.a"]) + + def binary(self, output, objects, options=None, link_main=True, main_options=None): + assert link_main, "Must pass link_main=True" + assert self._project_dir is not None, "Must supply project_dir= to build binaries" + + copied_libs = base.populate_tvm_objs(self._project_dir, objects) + + # expected not to exist after populate_tvm_objs + cmake_args = [ + "cmake", + os.path.abspath(self._project_dir), + f"-DBOARD={self._board}", + ] + self._options_to_cmake_args(options) + if "include_dirs" in options: + cmake_args.append( + "-DTVM_INCLUDE_DIRS=" + f'{";".join(os.path.abspath(d) for d in options["include_dirs"])}' + ) + cmake_args.append(f'-DTVM_LIBS={";".join(copied_libs)}') + self._subprocess_env.run(cmake_args, cwd=output) + + self._subprocess_env.run(["make"], cwd=output) + + return tvm.micro.MicroBinary( + output, + binary_file=os.path.join("zephyr", "zephyr.elf"), + debug_files=[], + labelled_files={ + "cmake_cache": ["CMakeCache.txt"], + "device_tree": [os.path.join("zephyr", "zephyr.dts")], + }, + immobile="qemu" in self._board, + ) + + @property + def flasher_factory(self): + return compiler.FlasherFactory( + ZephyrFlasher, + (self._west_cmd,), + dict( + zephyr_base=self._zephyr_base, + project_dir=self._project_dir, + subprocess_env=self._subprocess_env.default_overrides, + ), + ) + + +CACHE_ENTRY_RE = re.compile(r"(?P[^:]+):(?P[^=]+)=(?P.*)") + + +CMAKE_BOOL_MAP = dict( + [(k, True) for k in ("1", "ON", "YES", "TRUE", "Y")] + + [(k, False) for k in ("0", "OFF", "NO", "FALSE", "N", "IGNORE", "NOTFOUND", "")] +) + + +def read_cmake_cache(file_name): + """Read a CMakeCache.txt-like file and return a dictionary of values.""" + entries = collections.OrderedDict() + with open(file_name, encoding="utf-8") as f: + for line in f: + m = CACHE_ENTRY_RE.match(line.rstrip("\n")) + if not m: + continue + + if m.group("type") == "BOOL": + value = CMAKE_BOOL_MAP[m.group("value").upper()] + else: + value = m.group("value") + + entries[m.group("name")] = value + + return entries + + +class BoardError(Exception): + """Raised when an attached board cannot be opened (i.e. 
missing /dev nodes, etc).""" + + +class BoardAutodetectFailed(Exception): + """Raised when no attached hardware is found matching the board= given to ZephyrCompiler.""" + + +class ZephyrFlasher(tvm.micro.compiler.Flasher): + """A Flasher implementation that delegates to Zephyr/west.""" + + def __init__( + self, + west_cmd, + zephyr_base=None, + project_dir=None, + subprocess_env=None, + nrfjprog_snr=None, + openocd_serial=None, + flash_args=None, + debug_rpc_session=None, + ): + zephyr_base = zephyr_base or os.environ["ZEPHYR_BASE"] + sys.path.insert(0, os.path.join(zephyr_base, "scripts", "dts")) + try: + import dtlib # pylint: disable=import-outside-toplevel + + self._dtlib = dtlib + finally: + sys.path.pop(0) + + self._zephyr_base = zephyr_base + self._project_dir = project_dir + self._west_cmd = west_cmd + self._flash_args = flash_args + self._openocd_serial = openocd_serial + self._autodetected_openocd_serial = None + self._subprocess_env = SubprocessEnv(subprocess_env) + self._debug_rpc_session = debug_rpc_session + self._nrfjprog_snr = nrfjprog_snr + + def _get_nrf_device_args(self): + nrfjprog_args = ["nrfjprog", "--ids"] + nrfjprog_ids = subprocess.check_output(nrfjprog_args, encoding="utf-8") + if not nrfjprog_ids.strip("\n"): + raise BoardAutodetectFailed( + f'No attached boards recognized by {" ".join(nrfjprog_args)}' + ) + + boards = nrfjprog_ids.split("\n")[:-1] + if len(boards) > 1: + if self._nrfjprog_snr is None: + raise BoardError( + "Multiple boards connected; specify one with nrfjprog_snr=: " + f'{", ".join(boards)}' + ) + + if str(self._nrfjprog_snr) not in boards: + raise BoardError( + f"nrfjprog_snr ({self._nrfjprog_snr}) not found in {nrfjprog_args}: {boards}" + ) + + return ["--snr", str(self._nrfjprog_snr)] + + if not boards: + return [] + + return ["--snr", boards[0]] + + # kwargs passed to usb.core.find to find attached boards for the openocd flash runner. 
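+ # Each entry maps a Zephyr board name to pyusb match keywords; for instance,
+ # the nucleo_f746zg entry below matches the board's ST-LINK debug interface
+ # by its USB vendor/product IDs (0x0483/0x374B).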
+ BOARD_USB_FIND_KW = { + "nucleo_f746zg": {"idVendor": 0x0483, "idProduct": 0x374B}, + } + + def openocd_serial(self, cmake_entries): + """Find the serial port to use for a board with OpenOCD flash strategy.""" + if self._openocd_serial is not None: + return self._openocd_serial + + if self._autodetected_openocd_serial is None: + import usb # pylint: disable=import-outside-toplevel + + find_kw = self.BOARD_USB_FIND_KW[cmake_entries["BOARD"]] + boards = usb.core.find(find_all=True, **find_kw) + serials = [] + for b in boards: + serials.append(b.serial_number) + + if len(serials) == 0: + raise BoardAutodetectFailed(f"No attached USB devices matching: {find_kw!r}") + serials.sort() + + self._autodetected_openocd_serial = serials[0] + print("autodetected", serials[0]) + + return self._autodetected_openocd_serial + + def _get_openocd_device_args(self, cmake_entries): + return ["--serial", self.openocd_serial(cmake_entries)] + + @classmethod + def _get_flash_runner(cls, cmake_entries): + flash_runner = cmake_entries.get("ZEPHYR_BOARD_FLASH_RUNNER") + if flash_runner is not None: + return flash_runner + + with open(cmake_entries["ZEPHYR_RUNNERS_YAML"]) as f: + doc = yaml.load(f) + return doc["flash-runner"] + + def _get_device_args(self, cmake_entries): + flash_runner = self._get_flash_runner(cmake_entries) + + if flash_runner == "nrfjprog": + return self._get_nrf_device_args() + if flash_runner == "openocd": + return self._get_openocd_device_args(cmake_entries) + + raise BoardError( + f"Don't know how to find serial terminal for board {cmake_entries['BOARD']} with flash " + f"runner {flash_runner}" + ) + + def flash(self, micro_binary): + cmake_entries = read_cmake_cache( + micro_binary.abspath(micro_binary.labelled_files["cmake_cache"][0]) + ) + if "qemu" in cmake_entries["BOARD"]: + return ZephyrQemuTransport(micro_binary.base_dir, startup_timeout_sec=30.0) + + build_dir = os.path.dirname( + micro_binary.abspath(micro_binary.labelled_files["cmake_cache"][0]) + ) + west_args = ( + self._west_cmd + + ["flash", "--build-dir", build_dir, "--skip-rebuild"] + + self._get_device_args(cmake_entries) + ) + if self._flash_args is not None: + west_args.extend(self._flash_args) + self._subprocess_env.run(west_args, cwd=build_dir) + + return self.transport(micro_binary) + + def _find_nrf_serial_port(self, cmake_entries): + com_ports = subprocess.check_output( + ["nrfjprog", "--com"] + self._get_device_args(cmake_entries), encoding="utf-8" + ) + ports_by_vcom = {} + for line in com_ports.split("\n")[:-1]: + parts = line.split() + ports_by_vcom[parts[2]] = parts[1] + + return {"port_path": ports_by_vcom["VCOM2"]} + + def _find_openocd_serial_port(self, cmake_entries): + return {"grep": self.openocd_serial(cmake_entries)} + + def _find_serial_port(self, micro_binary): + cmake_entries = read_cmake_cache( + micro_binary.abspath(micro_binary.labelled_files["cmake_cache"][0]) + ) + flash_runner = self._get_flash_runner(cmake_entries) + + if flash_runner == "nrfjprog": + return self._find_nrf_serial_port(cmake_entries) + + if flash_runner == "openocd": + return self._find_openocd_serial_port(cmake_entries) + + raise FlashRunnerNotSupported( + f"Don't know how to deduce serial port for flash runner {flash_runner}" + ) + + def transport(self, micro_binary): + """Instantiate the transport for use with non-QEMU Zephyr.""" + dt_inst = self._dtlib.DT( + micro_binary.abspath(micro_binary.labelled_files["device_tree"][0]) + ) + uart_baud = ( + dt_inst.get_node("/chosen") + .props["zephyr,console"] + .to_path() + 
.props["current-speed"] + .to_num() + ) + _LOG.debug("zephyr transport: found UART baudrate from devicetree: %d", uart_baud) + + port_kwargs = self._find_serial_port(micro_binary) + serial_transport = serial.SerialTransport(baudrate=uart_baud, **port_kwargs) + if self._debug_rpc_session is None: + return serial_transport + + return debug.DebugWrapperTransport( + debugger.RpcDebugger( + self._debug_rpc_session, + debugger.DebuggerFactory( + ZephyrDebugger, + ( + " ".join(shlex.quote(x) for x in self._west_cmd), + os.path.join(self._project_dir, "__tvm_build"), + micro_binary.abspath(micro_binary.debug_files[0]), + self._zephyr_base, + ), + {}, + ), + ), + serial_transport, + ) + + +class QemuStartupFailureError(Exception): + """Raised when the qemu pipe is not present within startup_timeout_sec.""" + + +class QemuFdTransport(file_descriptor.FdTransport): + """An FdTransport subclass that escapes written data to accommodate the QEMU monitor. + + It's supposedly possible to disable the monitor, but Zephyr controls most of the command-line + arguments for QEMU and there are too many options which implicitly enable the monitor, so this + approach seems more robust. + """ + + def write_monitor_quit(self): + file_descriptor.FdTransport.write(self, b"\x01x", 1.0) + + def close(self): + file_descriptor.FdTransport.close(self) + + def timeouts(self): + assert False, "should not get here" + + def write(self, data, timeout_sec): + """Write data, escaping for QEMU monitor.""" + to_write = bytearray() + escape_pos = [] + for i, b in enumerate(data): + if b == 0x01: + to_write.append(b) + escape_pos.append(i) + to_write.append(b) + + num_written = file_descriptor.FdTransport.write(self, to_write, timeout_sec) + num_written -= sum(1 if x < num_written else 0 for x in escape_pos) + return num_written + + +class ZephyrQemuTransport(Transport): + """The user-facing Zephyr QEMU transport class.""" + + def __init__(self, base_dir, startup_timeout_sec=5.0, **kwargs): + self.base_dir = base_dir + self.startup_timeout_sec = startup_timeout_sec + self.kwargs = kwargs + self.proc = None + self.fd_transport = None + self.pipe_dir = None + + def timeouts(self): + return TransportTimeouts( + session_start_retry_timeout_sec=2.0, + session_start_timeout_sec=self.startup_timeout_sec, + session_established_timeout_sec=5.0, + ) + + def open(self): + self.pipe_dir = tempfile.mkdtemp() + self.pipe = os.path.join(self.pipe_dir, "fifo") + self.write_pipe = os.path.join(self.pipe_dir, "fifo.in") + self.read_pipe = os.path.join(self.pipe_dir, "fifo.out") + os.mkfifo(self.write_pipe) + os.mkfifo(self.read_pipe) + self.proc = subprocess.Popen( + ["make", "run", f"QEMU_PIPE={self.pipe}"], + cwd=self.base_dir, + **self.kwargs, + ) + # NOTE: although each pipe is unidirectional, open both as RDWR to work around a select + # limitation on linux. Without this, non-blocking I/O can't use timeouts because named + # FIFOs are always considered ready to read when no one has opened them for writing.
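+ # The byte string passed to WakeupTransport below is presumably the wakeup
+ # sequence: input is discarded until this sequence appears, which filters
+ # Zephyr/QEMU boot output ahead of the first valid RPC packet.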
+ self.fd_transport = wakeup.WakeupTransport( + QemuFdTransport( + os.open(self.read_pipe, os.O_RDWR | os.O_NONBLOCK), + os.open(self.write_pipe, os.O_RDWR | os.O_NONBLOCK), + self.timeouts(), + ), + b"\xfe\xff\xfd\x03\0\0\0\0\0\x02" b"fw", + ) + self.fd_transport.open() + + def close(self): + if self.fd_transport is not None: + self.fd_transport.child_transport.write_monitor_quit() + self.proc.wait() + self.fd_transport.close() + self.fd_transport = None + + if self.proc is not None: + # self.proc.wait() + # self.proc.terminate() + self.proc = None + + if self.pipe_dir is not None: + shutil.rmtree(self.pipe_dir) + self.pipe_dir = None + + def read(self, n, timeout_sec): + if self.fd_transport is None: + raise TransportClosedError() + return self.fd_transport.read(n, timeout_sec) + + def write(self, data, timeout_sec): + if self.fd_transport is None: + raise TransportClosedError() + return self.fd_transport.write(data, timeout_sec) + + +class ZephyrDebugger(debugger.Debugger): + """A Zephyr debugger implementation.""" + + def __init__(self, west_cmd, build_dir, elf_path, zephyr_base): + debugger.Debugger.__init__(self) + self._west_cmd = shlex.split(west_cmd) + self._build_dir = build_dir + self._elf_path = elf_path + self._zephyr_base = zephyr_base + + def start(self): + env = dict(os.environ) + env["ZEPHYR_BASE"] = self._zephyr_base + sys.stdin = open(0) # re-open stdin, closed by multiprocessing. + self._old_termios = termios.tcgetattr(sys.stdin) + self._proc = subprocess.Popen( + self._west_cmd + + [ + "debug", + "--skip-rebuild", + "--build-dir", + self._build_dir, + "--elf-file", + self._elf_path, + ], + env=env, + ) + self._old_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) + + def stop(self): + signal.signal(signal.SIGINT, self._old_sigint_handler) + termios.tcsetattr(sys.stdin, termios.TCSADRAIN, self._old_termios) + self._proc.terminate() + self._proc.wait() diff --git a/python/tvm/micro/debugger.py b/python/tvm/micro/debugger.py index 8c7a300c2aae..0b5430c52e66 100644 --- a/python/tvm/micro/debugger.py +++ b/python/tvm/micro/debugger.py @@ -23,7 +23,7 @@ import subprocess import threading -from . import transport as _transport +from . import transport class Debugger(metaclass=abc.ABCMeta): @@ -116,8 +116,8 @@ def popen_kwargs(self): else: raise NotImplementedError(f"System {sysname} is not yet supported") - self.stdin = os.fdopen(stdin_write, "wb", buffering=0) - self.stdout = os.fdopen(stdout_read, "rb", buffering=0) + self.fd_transport = fd.FdTransport(stdout_read, stdin_write) + self.fd_transport.open() return { "args": args, @@ -126,8 +126,7 @@ def popen_kwargs(self): def _wait_for_process_death(self): self.popen.wait() - self.stdin.close() - self.stdout.close() + self.fd_transport.close() def start(self): to_return = super(GdbTransportDebugger, self).start() @@ -135,22 +134,24 @@ def start(self): return to_return def stop(self): - self.stdin.close() - self.stdout.close() + self.fd_transport.close() super(GdbTransportDebugger, self).stop() - class _Transport(_transport.Transport): + class _Transport(transport.Transport): def __init__(self, gdb_transport_debugger): self.gdb_transport_debugger = gdb_transport_debugger + def timeouts(self): + return transport.debug_transport_timeouts() + def open(self): pass # Pipes opened by parent class. 
- def write(self, data): - return self.gdb_transport_debugger.stdin.write(data) + def write(self, data, timeout_sec): + return self.gdb_transport_debugger.fd_transport.write(data, timeout_sec) - def read(self, n): - return self.gdb_transport_debugger.stdout.read(n) + def read(self, n, timeout_sec): + return self.gdb_transport_debugger.fd_transport.read(n, timeout_sec) def close(self): pass # Pipes closed by parent class. diff --git a/python/tvm/micro/micro_binary.py b/python/tvm/micro/micro_binary.py index 9d411a165150..74b760b67650 100644 --- a/python/tvm/micro/micro_binary.py +++ b/python/tvm/micro/micro_binary.py @@ -26,7 +26,7 @@ class MicroBinary(artifact.Artifact): ARTIFACT_TYPE = "micro_binary" @classmethod - def from_unarchived(cls, base_dir, labelled_files, metadata): + def from_unarchived(cls, base_dir, labelled_files, metadata, immobile): binary_file = labelled_files["binary_file"][0] del labelled_files["binary_file"] @@ -41,16 +41,25 @@ def from_unarchived(cls, base_dir, labelled_files, metadata): debug_files=debug_files, labelled_files=labelled_files, metadata=metadata, + immobile=immobile, ) - def __init__(self, base_dir, binary_file, debug_files=None, labelled_files=None, metadata=None): + def __init__( + self, + base_dir, + binary_file, + debug_files=None, + labelled_files=None, + metadata=None, + immobile=False, + ): labelled_files = {} if labelled_files is None else dict(labelled_files) metadata = {} if metadata is None else dict(metadata) labelled_files["binary_file"] = [binary_file] if debug_files is not None: labelled_files["debug_files"] = debug_files - super(MicroBinary, self).__init__(base_dir, labelled_files, metadata) + super(MicroBinary, self).__init__(base_dir, labelled_files, metadata, immobile=immobile) self.binary_file = binary_file self.debug_files = debug_files diff --git a/python/tvm/micro/micro_library.py b/python/tvm/micro/micro_library.py index 52c8cf29116e..b2876509708e 100644 --- a/python/tvm/micro/micro_library.py +++ b/python/tvm/micro/micro_library.py @@ -28,7 +28,7 @@ class MicroLibrary(artifact.Artifact): ARTIFACT_TYPE = "micro_library" @classmethod - def from_unarchived(cls, base_dir, labelled_files, metadata): + def from_unarchived(cls, base_dir, labelled_files, metadata, immobile): library_files = labelled_files["library_files"] del labelled_files["library_files"] @@ -43,10 +43,17 @@ def from_unarchived(cls, base_dir, labelled_files, metadata): debug_files=debug_files, labelled_files=labelled_files, metadata=metadata, + immobile=immobile, ) def __init__( - self, base_dir, library_files, debug_files=None, labelled_files=None, metadata=None + self, + base_dir, + library_files, + debug_files=None, + labelled_files=None, + metadata=None, + immobile=False, ): labelled_files = {} if labelled_files is None else dict(labelled_files) metadata = {} if metadata is None else dict(metadata) @@ -54,7 +61,7 @@ def __init__( if debug_files is not None: labelled_files["debug_files"] = debug_files - super(MicroLibrary, self).__init__(base_dir, labelled_files, metadata) + super(MicroLibrary, self).__init__(base_dir, labelled_files, metadata, immobile=immobile) self.library_files = library_files self.debug_file = debug_files diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 3565040e1d76..a6b6d266db36 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -18,11 +18,12 @@ """Defines a top-level glue class that operates the Transport and Flasher classes.""" import logging -import time +from ..error import 
register_error
 from .._ffi import get_global_func
 from ..contrib import graph_runtime
 from ..rpc import RPCSession
+from .transport import IoTimeoutError
 from .transport import TransportLogger
 
 try:
@@ -31,6 +32,11 @@
     raise ImportError("micro tvm is not enabled. Set USE_MICRO to ON in config.cmake")
 
 
+@register_error
+class SessionTerminatedError(Exception):
+    """Raised when a transport read operation discovers that the remote session is terminated."""
+
+
 class Session:
     """MicroTVM Device Session
 
@@ -51,7 +57,12 @@ class Session:
     """
 
     def __init__(
-        self, binary=None, flasher=None, transport_context_manager=None, session_name="micro-rpc"
+        self,
+        binary=None,
+        flasher=None,
+        transport_context_manager=None,
+        session_name="micro-rpc",
+        timeout_override=None,
     ):
         """Configure a new session.
 
@@ -68,11 +79,15 @@ def __init__(
            should establish a transport between this TVM instance and the device.
         session_name : str
             Name of the session, used for debugging.
+        timeout_override : TransportTimeouts
+            If given, TransportTimeouts that govern the way Receive() behaves. If not given, this is
+            determined by calling timeouts() on the transport.
         """
         self.binary = binary
         self.flasher = flasher
         self.transport_context_manager = transport_context_manager
         self.session_name = session_name
+        self.timeout_override = timeout_override
 
         self._rpc = None
         self._graph_runtime = None
@@ -80,6 +95,22 @@ def __init__(
     def get_system_lib(self):
         return self._rpc.get_function("runtime.SystemLib")()
 
+    def _wrap_transport_read(self, n, timeout_microsec):
+        try:
+            return self.transport.read(
+                n, float(timeout_microsec) / 1e6 if timeout_microsec is not None else 0
+            )
+        except IoTimeoutError:
+            return bytes([])
+
+    def _wrap_transport_write(self, data, timeout_microsec):
+        try:
+            return self.transport.write(
+                data, float(timeout_microsec) / 1e6 if timeout_microsec is not None else 0
+            )
+        except IoTimeoutError:
+            return 0
+
     def __enter__(self):
         """Initialize this session and establish an RPC session with the on-device RPC server.
 
@@ -90,13 +121,24 @@
         """
         if self.flasher is not None:
             self.transport_context_manager = self.flasher.flash(self.binary)
-            time.sleep(3.0)
 
         self.transport = TransportLogger(
             self.session_name, self.transport_context_manager, level=logging.INFO
         ).__enter__()
+
+        timeouts = self.timeout_override
+        if timeouts is None:
+            timeouts = self.transport.timeouts()
+
         self._rpc = RPCSession(
-            _rpc_connect(self.session_name, self.transport.write, self.transport.read)
+            _rpc_connect(
+                self.session_name,
+                self._wrap_transport_write,
+                self._wrap_transport_read,
+                int(timeouts.session_start_retry_timeout_sec * 1e6),
+                int(timeouts.session_start_timeout_sec * 1e6),
+                int(timeouts.session_established_timeout_sec * 1e6),
+            )
         )
         self.context = self._rpc.cpu(0)
         return self
diff --git a/python/tvm/micro/transport.py b/python/tvm/micro/transport.py
deleted file mode 100644
index c789bc6c856a..000000000000
--- a/python/tvm/micro/transport.py
+++ /dev/null
@@ -1,238 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Defines abstractions and implementations of the RPC transport used with micro TVM.""" - -import abc -import logging -import string -import subprocess -import typing - -import tvm - -_LOG = logging.getLogger(__name__) - - -@tvm.error.register_error -class SessionTerminatedError(Exception): - """Raised when a transport read operationd discovers that the remote session is terminated.""" - - -class Transport(metaclass=abc.ABCMeta): - """The abstract Transport class used for micro TVM.""" - - def __enter__(self): - self.open() - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - self.close() - - @abc.abstractmethod - def open(self): - """Open any resources needed to send and receive RPC protocol data for a single session.""" - raise NotImplementedError() - - @abc.abstractmethod - def close(self): - """Release resources associated with this transport.""" - raise NotImplementedError() - - @abc.abstractmethod - def read(self, n): - """Read up to n bytes from the transport. - - Parameters - ---------- - n : int - Maximum number of bytes to read from the transport. - - Returns - ------- - bytes : - Data read from the channel. Less than `n` bytes may be returned, but 0 bytes should - never be returned except in error. Note that if a transport error occurs, an Exception - should be raised rather than simply returning empty bytes. - - - Raises - ------ - SessionTerminatedError : - When the transport layer determines that the active session was terminated by the - remote side. Typically this indicates that the remote device has reset. - """ - raise NotImplementedError() - - @abc.abstractmethod - def write(self, data): - """Write data to the transport channel. - - Parameters - ---------- - data : bytes - The data to write over the channel. - - Returns - ------- - int : - The number of bytes written to the underlying channel. This can be less than the length - of `data`, but cannot be 0. - """ - raise NotImplementedError() - - -class TransportLogger(Transport): - """Wraps a Transport implementation and logs traffic to the Python logging infrastructure.""" - - def __init__(self, name, child, logger=None, level=logging.INFO): - self.name = name - self.child = child - self.logger = logger or _LOG - self.level = level - - # Construct PRINTABLE to exclude whitespace from string.printable. 
- PRINTABLE = string.digits + string.ascii_letters + string.punctuation - - @classmethod - def _to_hex(cls, data): - lines = [] - if not data: - lines.append("") - return lines - - for i in range(0, (len(data) + 15) // 16): - chunk = data[i * 16 : (i + 1) * 16] - hex_chunk = " ".join(f"{c:02x}" for c in chunk) - ascii_chunk = "".join((chr(c) if chr(c) in cls.PRINTABLE else ".") for c in chunk) - lines.append(f"{i * 16:04x} {hex_chunk:47} {ascii_chunk}") - - if len(lines) == 1: - lines[0] = lines[0][6:] - - return lines - - def open(self): - self.logger.log(self.level, "opening transport") - self.child.open() - - def close(self): - self.logger.log(self.level, "closing transport") - return self.child.close() - - def read(self, n): - data = self.child.read(n) - hex_lines = self._to_hex(data) - if len(hex_lines) > 1: - self.logger.log( - self.level, - "%s read %4d B -> [%d B]:\n%s", - self.name, - n, - len(data), - "\n".join(hex_lines), - ) - else: - self.logger.log( - self.level, "%s read %4d B -> [%d B]: %s", self.name, n, len(data), hex_lines[0] - ) - - return data - - def write(self, data): - bytes_written = self.child.write(data) - hex_lines = self._to_hex(data[:bytes_written]) - if len(hex_lines) > 1: - self.logger.log( - self.level, - "%s write <- [%d B]:\n%s", - self.name, - bytes_written, - "\n".join(hex_lines), - ) - else: - self.logger.log( - self.level, "%s write <- [%d B]: %s", self.name, bytes_written, hex_lines[0] - ) - - return bytes_written - - -class SubprocessTransport(Transport): - """A Transport implementation that uses a subprocess's stdin/stdout as the channel.""" - - def __init__(self, args, **kwargs): - self.args = args - self.kwargs = kwargs - self.popen = None - - def open(self): - self.kwargs["stdout"] = subprocess.PIPE - self.kwargs["stdin"] = subprocess.PIPE - self.kwargs["bufsize"] = 0 - self.popen = subprocess.Popen(self.args, **self.kwargs) - self.stdin = self.popen.stdin - self.stdout = self.popen.stdout - - def write(self, data): - to_return = self.stdin.write(data) - self.stdin.flush() - - return to_return - - def read(self, n): - return self.stdout.read(n) - - def close(self): - self.stdin.close() - self.stdout.close() - self.popen.terminate() - - -class DebugWrapperTransport(Transport): - """A Transport wrapper class that launches a debugger before opening the transport. - - This is primiarly useful when debugging the other end of a SubprocessTransport. It allows you - to pipe data through the GDB process to drive the subprocess with a debugger attached. - """ - - def __init__(self, debugger, transport): - self.debugger = debugger - self.transport = transport - self.debugger.on_terminate_callbacks.append(self.transport.close) - - def open(self): - self.debugger.start() - - try: - self.transport.open() - except Exception: - self.debugger.stop() - raise - - def write(self, data): - return self.transport.write(data) - - def read(self, n): - return self.transport.read(n) - - def close(self): - self.transport.close() - self.debugger.stop() - - -TransportContextManager = typing.ContextManager[Transport] diff --git a/python/tvm/micro/transport/__init__.py b/python/tvm/micro/transport/__init__.py new file mode 100644 index 000000000000..1e1709707568 --- /dev/null +++ b/python/tvm/micro/transport/__init__.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Defines abstractions and implementations related to the microTVM RPC transport layer."""
+
+from .base import IoTimeoutError
+from .base import Transport
+from .base import TransportClosedError
+from .base import TransportLogger
+from .base import TransportTimeouts
+from .debug import DebugWrapperTransport
+from .subprocess import SubprocessTransport
diff --git a/python/tvm/micro/transport/base.py b/python/tvm/micro/transport/base.py
new file mode 100644
index 000000000000..15d91342ef5a
--- /dev/null
+++ b/python/tvm/micro/transport/base.py
@@ -0,0 +1,299 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Defines abstractions and implementations of the RPC transport used with micro TVM."""
+
+import abc
+import collections
+import logging
+import string
+import typing
+
+_LOG = logging.getLogger(__name__)
+
+
+class TransportClosedError(Exception):
+    """Raised when a transport can no longer be used due to underlying I/O problems."""
+
+
+class IoTimeoutError(Exception):
+    """Raised when the I/O operation could not be completed before the timeout.
+
+    Specifically:
+     - when no data could be read before the timeout
+     - when only some of the write data could be written before the timeout
+
+    Note the asymmetric behavior of read() vs write(), since in one case the total length of the
+    data to transfer is known.
+    """
+
+
+# Timeouts supported by the underlying C++ MicroSession.
+#
+# session_start_retry_timeout_sec : float
+#     Number of seconds to wait for the device to send a kSessionStartReply after sending the
+#     initial session start message. After this time elapses another
+#     kSessionTerminated-kSessionStartInit train is sent. 0 disables this.
+# session_start_timeout_sec : float
+#     Total number of seconds to wait for the session to be established. After this time, the
+#     client gives up trying to establish a session and raises an exception.
+# session_established_timeout_sec : float
+#     Number of seconds to wait for a reply message after a session has been established. 0
+#     disables this.
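+#
+# For example (illustrative values only, not prescribed here), the Zephyr QEMU transport
+# earlier in this patch returns:
+#
+#   TransportTimeouts(
+#       session_start_retry_timeout_sec=2.0,   # resend session-start every 2s
+#       session_start_timeout_sec=5.0,         # give up on the handshake after 5s
+#       session_established_timeout_sec=5.0,   # per-reply timeout once established
+#   )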
+TransportTimeouts = collections.namedtuple(
+    "TransportTimeouts",
+    [
+        "session_start_retry_timeout_sec",
+        "session_start_timeout_sec",
+        "session_established_timeout_sec",
+    ],
+)
+
+
+def debug_transport_timeouts(session_start_retry_timeout_sec=0.0):
+    return TransportTimeouts(
+        session_start_retry_timeout_sec=session_start_retry_timeout_sec,
+        session_start_timeout_sec=0,
+        session_established_timeout_sec=0,
+    )
+
+
+class Transport(metaclass=abc.ABCMeta):
+    """The abstract Transport class used for micro TVM."""
+
+    def __enter__(self):
+        self.open()
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        self.close()
+
+    @abc.abstractmethod
+    def timeouts(self):
+        """Return TransportTimeouts suitable for use with this transport.
+
+        See the TransportTimeouts documentation in python/tvm/micro/session.py.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def open(self):
+        """Open any resources needed to send and receive RPC protocol data for a single session."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def close(self):
+        """Release resources associated with this transport."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def read(self, n, timeout_sec):
+        """Read up to n bytes from the transport.
+
+        Parameters
+        ----------
+        n : int
+            Maximum number of bytes to read from the transport.
+        timeout_sec : float
+            Number of seconds to wait for all `n` bytes to be received before timing out. The
+            transport can wait additional time to account for transport latency or bandwidth
+            limitations based on the selected configuration and number of bytes being received. If
+            timeout_sec is 0, read should attempt to service the request in a non-blocking fashion.
+
+        Returns
+        -------
+        bytes :
+            Data read from the channel. Less than `n` bytes may be returned, but 0 bytes should
+            never be returned. If returning less than `n` bytes, the full timeout_sec, plus any
+            internally-added timeout, should be waited. If a timeout or transport error occurs,
+            an exception should be raised rather than simply returning empty bytes.
+
+
+        Raises
+        ------
+        TransportClosedError :
+            When the transport layer determines that the transport can no longer send or receive
+            data due to an underlying I/O problem (i.e. file descriptor closed, cable removed, etc).
+
+        IoTimeoutError :
+            When `timeout_sec` elapses without receiving any data.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def write(self, data, timeout_sec):
+        """Write data to the transport channel.
+
+        Parameters
+        ----------
+        data : bytes
+            The data to write over the channel.
+        timeout_sec : float
+            Number of seconds to wait for all of `data` to be written before timing out. The
+            transport can wait additional time to account for transport latency or bandwidth
+            limitations based on the selected configuration and number of bytes being sent. If
+            timeout_sec is 0, write should attempt to service the request in a non-blocking fashion.
+
+        Returns
+        -------
+        int :
+            The number of bytes written to the underlying channel. This can be less than the length
+            of `data`, but cannot be 0.
+
+        Raises
+        ------
+        TransportClosedError :
+            When the transport layer determines that the transport can no longer send or receive
+            data due to an underlying I/O problem (i.e. file descriptor closed, cable removed, etc).
+
+        IoTimeoutError :
+            When `timeout_sec` elapses without sending any data.
+ """ + raise NotImplementedError() + + +class TransportLogger(Transport): + """Wraps a Transport implementation and logs traffic to the Python logging infrastructure.""" + + def __init__(self, name, child, logger=None, level=logging.INFO): + self.name = name + self.child = child + self.logger = logger or _LOG + self.level = level + + # Construct PRINTABLE to exclude whitespace from string.printable. + PRINTABLE = string.digits + string.ascii_letters + string.punctuation + + @classmethod + def _to_hex(cls, data): + lines = [] + if not data: + lines.append("") + return lines + + for i in range(0, (len(data) + 15) // 16): + chunk = data[i * 16 : (i + 1) * 16] + hex_chunk = " ".join(f"{c:02x}" for c in chunk) + ascii_chunk = "".join((chr(c) if chr(c) in cls.PRINTABLE else ".") for c in chunk) + lines.append(f"{i * 16:04x} {hex_chunk:47} {ascii_chunk}") + + if len(lines) == 1: + lines[0] = lines[0][6:] + + return lines + + def timeouts(self): + return self.child.timeouts() + + def open(self): + self.logger.log(self.level, "opening transport") + self.child.open() + + def close(self): + self.logger.log(self.level, "closing transport") + return self.child.close() + + def read(self, n, timeout_sec): + try: + data = self.child.read(n, timeout_sec) + except IoTimeoutError: + self.logger.log( + self.level, + "%s read {%3.2fs} %4d B -> [IoTimeoutError %.2f s]", + self.name, + timeout_sec, + n, + timeout_sec, + ) + raise + except Exception as err: + self.logger.log( + self.level, + "%s read {%3.2fs} %4d B -> [err: %s]", + self.name, + timeout_sec, + n, + str(err), + exc_info=1, + ) + raise err + + hex_lines = self._to_hex(data) + if len(hex_lines) > 1: + self.logger.log( + self.level, + "%s read {%3.2fs} %4d B -> [%d B]:\n%s", + self.name, + timeout_sec, + n, + len(data), + "\n".join(hex_lines), + ) + else: + self.logger.log( + self.level, + "%s read {%3.2fs} %4d B -> [%d B]: %s", + self.name, + timeout_sec, + n, + len(data), + hex_lines[0], + ) + + return data + + def write(self, data, timeout_sec): + try: + bytes_written = self.child.write(data, timeout_sec) + except IoTimeoutError: + self.logger.log( + self.level, + "%s write <- [%d B]: [IoTimeoutError %.2f s]", + self.name, + len(data), + timeout_sec, + ) + raise + except Exception as err: + self.logger.log( + self.level, + "%s write <- [%d B]: [err: %s]", + self.name, + len(data), + str(err), + exc_info=1, + ) + raise err + + hex_lines = self._to_hex(data[:bytes_written]) + if len(hex_lines) > 1: + self.logger.log( + self.level, + "%s write <- [%d B]:\n%s", + self.name, + bytes_written, + "\n".join(hex_lines), + ) + else: + self.logger.log( + self.level, "%s write <- [%d B]: %s", self.name, bytes_written, hex_lines[0] + ) + + return bytes_written + + +TransportContextManager = typing.ContextManager[Transport] diff --git a/python/tvm/micro/transport/debug.py b/python/tvm/micro/transport/debug.py new file mode 100644 index 000000000000..6fc14f8a7a3d --- /dev/null +++ b/python/tvm/micro/transport/debug.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Defines a wrapper Transport class that launches a debugger before opening."""
+
+from .base import Transport, TransportTimeouts
+
+
+class DebugWrapperTransport(Transport):
+    """A Transport wrapper class that launches a debugger before opening the transport.
+
+    This is primarily useful when debugging the other end of a SubprocessTransport. It allows you
+    to pipe data through the GDB process to drive the subprocess with a debugger attached.
+    """
+
+    def __init__(self, debugger, transport, disable_session_start_retry=False):
+        self.debugger = debugger
+        self.transport = transport
+        self.disable_session_start_retry = disable_session_start_retry
+        self.debugger.on_terminate_callbacks.append(self.transport.close)
+
+    def timeouts(self):
+        child_timeouts = self.transport.timeouts()
+        return TransportTimeouts(
+            session_start_retry_timeout_sec=(
+                0
+                if self.disable_session_start_retry
+                else child_timeouts.session_start_retry_timeout_sec
+            ),
+            session_start_timeout_sec=0,
+            session_established_timeout_sec=0,
+        )
+
+    def open(self):
+        self.debugger.start()
+
+        try:
+            self.transport.open()
+        except Exception:
+            self.debugger.stop()
+            raise
+
+    def write(self, data, timeout_sec):
+        return self.transport.write(data, timeout_sec)
+
+    def read(self, n, timeout_sec):
+        return self.transport.read(n, timeout_sec)
+
+    def close(self):
+        self.transport.close()
+        self.debugger.stop()
diff --git a/python/tvm/micro/transport/file_descriptor.py b/python/tvm/micro/transport/file_descriptor.py
new file mode 100644
index 000000000000..ce3025ccbf55
--- /dev/null
+++ b/python/tvm/micro/transport/file_descriptor.py
@@ -0,0 +1,105 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Defines an implementation of Transport that uses file descriptors."""
+
+import fcntl
+import os
+import select
+import time
+from . import base
+
+
+class FdConfigurationError(Exception):
+    """Raised when specified file descriptors can't be placed in non-blocking mode."""
+
+
+class FdTransport(base.Transport):
+    """A Transport implementation that implements timeouts using non-blocking I/O."""
+
+    @classmethod
+    def _validate_configure_fd(cls, file_descriptor):
+        file_descriptor = (
+            file_descriptor if isinstance(file_descriptor, int) else file_descriptor.fileno()
+        )
+        flag = fcntl.fcntl(file_descriptor, fcntl.F_GETFL)
+        if flag & os.O_NONBLOCK != 0:
+            return file_descriptor
+
+        fcntl.fcntl(file_descriptor, fcntl.F_SETFL, os.O_NONBLOCK | flag)
+        new_flag = fcntl.fcntl(file_descriptor, fcntl.F_GETFL)
+        if (new_flag & os.O_NONBLOCK) == 0:
+            raise FdConfigurationError(
+                f"Cannot set file descriptor {file_descriptor} to non-blocking"
+            )
+        return file_descriptor
+
+    def __init__(self, read_fd, write_fd, timeouts):
+        self.read_fd = self._validate_configure_fd(read_fd)
+        self.write_fd = self._validate_configure_fd(write_fd)
+        self._timeouts = timeouts
+
+    def timeouts(self):
+        return self._timeouts
+
+    def open(self):
+        pass
+
+    def close(self):
+        if self.read_fd is not None:
+            os.close(self.read_fd)
+        if self.write_fd is not None:
+            os.close(self.write_fd)
+
+    def _await_ready(self, rlist, wlist, timeout_sec=None, end_time=None):
+        if end_time is None:
+            return True
+
+        if timeout_sec is None:
+            timeout_sec = max(0, end_time - time.monotonic())
+        rlist, wlist, xlist = select.select(rlist, wlist, rlist + wlist, timeout_sec)
+        if not rlist and not wlist and not xlist:
+            raise base.IoTimeoutError()
+
+        return True
+
+    def read(self, n, timeout_sec):
+        end_time = None if timeout_sec is None else time.monotonic() + timeout_sec
+
+        self._await_ready([self.read_fd], [], end_time=end_time)
+        to_return = os.read(self.read_fd, n)
+
+        if not to_return:
+            self.close()
+            raise base.TransportClosedError()
+
+        return to_return
+
+    def write(self, data, timeout_sec):
+        end_time = None if timeout_sec is None else time.monotonic() + timeout_sec
+
+        data_len = len(data)
+        while data:
+            self._await_ready([], [self.write_fd], end_time=end_time)
+            num_written = os.write(self.write_fd, data)
+            if not num_written:
+                self.close()
+                raise base.TransportClosedError()
+
+            data = data[num_written:]
+
+        return data_len
diff --git a/python/tvm/micro/transport/subprocess.py b/python/tvm/micro/transport/subprocess.py
new file mode 100644
index 000000000000..4de1fa1266d3
--- /dev/null
+++ b/python/tvm/micro/transport/subprocess.py
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Defines an implementation of Transport that uses subprocesses."""
+
+import subprocess
+from . import base
+from .
import file_descriptor
+
+
+class SubprocessFdTransport(file_descriptor.FdTransport):
+    def timeouts(self):
+        raise NotImplementedError()
+
+
+class SubprocessTransport(base.Transport):
+    """A Transport implementation that uses a subprocess's stdin/stdout as the channel."""
+
+    def __init__(self, args, max_startup_latency_sec=5.0, max_latency_sec=5.0, **kwargs):
+        self.max_startup_latency_sec = max_startup_latency_sec
+        self.max_latency_sec = max_latency_sec
+        self.args = args
+        self.kwargs = kwargs
+        self.popen = None
+        self.child_transport = None
+
+    def timeouts(self):
+        return base.TransportTimeouts(
+            session_start_retry_timeout_sec=0,
+            session_start_timeout_sec=self.max_startup_latency_sec,
+            session_established_timeout_sec=self.max_latency_sec,
+        )
+
+    def open(self):
+        self.kwargs["stdout"] = subprocess.PIPE
+        self.kwargs["stdin"] = subprocess.PIPE
+        self.kwargs["bufsize"] = 0
+        self.popen = subprocess.Popen(self.args, **self.kwargs)
+        self.child_transport = SubprocessFdTransport(
+            self.popen.stdout, self.popen.stdin, self.timeouts()
+        )
+
+    def write(self, data, timeout_sec):
+        return self.child_transport.write(data, timeout_sec)
+
+    def read(self, n, timeout_sec):
+        return self.child_transport.read(n, timeout_sec)
+
+    def close(self):
+        if self.child_transport is not None:
+            self.child_transport.close()
+
+        self.popen.terminate()
diff --git a/python/tvm/micro/transport/wakeup.py b/python/tvm/micro/transport/wakeup.py
new file mode 100644
index 000000000000..4e5427939263
--- /dev/null
+++ b/python/tvm/micro/transport/wakeup.py
@@ -0,0 +1,74 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Defines an implementation of Transport that waits for a wakeup sequence before communicating."""
+
+import logging
+import time
+from . import base
+
+
+_LOG = logging.getLogger(__name__)
+
+
+class WakeupTransport(base.Transport):
+    """A Transport implementation that waits for a "wakeup sequence" from the remote end."""
+
+    def __init__(self, child_transport, wakeup_sequence):
+        self.child_transport = child_transport
+        self.wakeup_sequence = bytes(wakeup_sequence)
+        self.wakeup_sequence_buffer = bytearray()
+        self.line_start_index = 0
+        self.found_wakeup_sequence = False
+
+    def open(self):
+        return self.child_transport.open()
+
+    def close(self):
+        return self.child_transport.close()
+
+    def timeouts(self):
+        return self.child_transport.timeouts()
+
+    def _await_wakeup(self, end_time):
+        if not self.found_wakeup_sequence:
+            while self.wakeup_sequence not in self.wakeup_sequence_buffer:
+                x = self.child_transport.read(1, max(0, end_time - time.monotonic()))
+                self.wakeup_sequence_buffer.extend(x)
+                if x[0] in (0x0A, 0xFF):
+                    _LOG.debug("%s", self.wakeup_sequence_buffer[self.line_start_index : -1])
+                    self.line_start_index = len(self.wakeup_sequence_buffer)
+
+            _LOG.info("remote side woke up!")
+            self.found_wakeup_sequence = True
+            time.sleep(0.2)
+
+        return max(0, end_time - time.monotonic())
+
+    def read(self, n, timeout_sec):
+        if not self.found_wakeup_sequence:
+            end_time = None if timeout_sec is None else time.monotonic() + timeout_sec
+            timeout_sec = self._await_wakeup(end_time)
+
+        return self.child_transport.read(n, timeout_sec)
+
+    def write(self, data, timeout_sec):
+        if not self.found_wakeup_sequence:
+            end_time = None if timeout_sec is None else time.monotonic() + timeout_sec
+            timeout_sec = self._await_wakeup(end_time)
+
+        return self.child_transport.write(data, timeout_sec)
diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc
index dcca305b8b65..5623b2515585 100644
--- a/src/runtime/crt/host/main.cc
+++ b/src/runtime/crt/host/main.cc
@@ -109,13 +109,18 @@ int main(int argc, char** argv) {
       fprintf(stderr, "utvm runtime: 0-length read, exiting!\n");
       return 2;
     }
-    if (UTvmRpcServerReceiveByte(rpc_server, c) != 1) {
-      abort();
-    }
-    if (!UTvmRpcServerLoop(rpc_server)) {
-      execvp(argv[0], argv);
-      perror("utvm runtime: error restarting");
-      return 2;
+    uint8_t* cursor = &c;
+    size_t bytes_to_process = 1;
+    while (bytes_to_process > 0) {
+      tvm_crt_error_t err = UTvmRpcServerLoop(rpc_server, &cursor, &bytes_to_process);
+      if (err == kTvmErrorPlatformShutdown) {
+        break;
+      } else if (err != kTvmErrorNoError) {
+        char buf[1024];
+        snprintf(buf, sizeof(buf), "utvm runtime: UTvmRpcServerLoop error: %08x", err);
+        perror(buf);
+        return 2;
+      }
     }
   }
   return 0;
diff --git a/src/runtime/crt/utvm_rpc_server/rpc_server.cc b/src/runtime/crt/utvm_rpc_server/rpc_server.cc
index f36e67223c98..34eff6a3270d 100644
--- a/src/runtime/crt/utvm_rpc_server/rpc_server.cc
+++ b/src/runtime/crt/utvm_rpc_server/rpc_server.cc
@@ -117,7 +117,6 @@ class MicroRPCServer {
         io_{&session_, &receive_buffer_},
         unframer_{session_.Receiver()},
         rpc_server_{&io_},
-        has_pending_byte_{false},
         is_running_{true} {}
 
   void* operator new(size_t count, void* ptr) { return ptr; }
@@ -126,25 +125,30 @@ class MicroRPCServer {
   /*! \brief Process one message from the receive buffer, if possible.
    *
-   * \return true if additional messages could be processed. false if the server shutdown request
-   * has been received.
+   * \param new_data If not nullptr, a pointer to a buffer pointer, which should point at new input
+   * data to process. On return, updated to point past data that has been consumed.
+ * \param new_data_size_bytes Points to the number of valid bytes in `new_data`. On return, + * updated to the number of unprocessed bytes remaining in `new_data` (usually 0). + * \return an error code indicating the outcome of the processing loop. */ - bool Loop() { - if (has_pending_byte_) { + tvm_crt_error_t Loop(uint8_t** new_data, size_t* new_data_size_bytes) { + if (!is_running_) { + return kTvmErrorPlatformShutdown; + } + + tvm_crt_error_t err = kTvmErrorNoError; + if (new_data != nullptr && new_data_size_bytes != nullptr && *new_data_size_bytes > 0) { size_t bytes_consumed; - CHECK_EQ(unframer_.Write(&pending_byte_, 1, &bytes_consumed), kTvmErrorNoError, - "unframer_.Write"); - CHECK_EQ(bytes_consumed, 1, "bytes_consumed"); - has_pending_byte_ = false; + err = unframer_.Write(*new_data, *new_data_size_bytes, &bytes_consumed); + *new_data += bytes_consumed; + *new_data_size_bytes -= bytes_consumed; } - return is_running_; - } + if (err == kTvmErrorNoError && !is_running_) { + err = kTvmErrorPlatformShutdown; + } - void HandleReceivedByte(uint8_t byte) { - CHECK(!has_pending_byte_); - has_pending_byte_ = true; - pending_byte_ = byte; + return err; } void Log(const uint8_t* message, size_t message_size_bytes) { @@ -164,8 +168,6 @@ class MicroRPCServer { Unframer unframer_; MinRPCServer rpc_server_; - bool has_pending_byte_; - uint8_t pending_byte_; bool is_running_; void HandleCompleteMessage(MessageType message_type, FrameBuffer* buf) { @@ -243,19 +245,11 @@ void TVMLogf(const char* format, ...) { } } -size_t UTvmRpcServerReceiveByte(utvm_rpc_server_t server_ptr, uint8_t byte) { - // NOTE(areusch): In the future, this function is intended to work from an IRQ context. That's not - // needed at present. - tvm::runtime::micro_rpc::MicroRPCServer* server = - static_cast(server_ptr); - server->HandleReceivedByte(byte); - return 1; -} - -bool UTvmRpcServerLoop(utvm_rpc_server_t server_ptr) { +tvm_crt_error_t UTvmRpcServerLoop(utvm_rpc_server_t server_ptr, uint8_t** new_data, + size_t* new_data_size_bytes) { tvm::runtime::micro_rpc::MicroRPCServer* server = static_cast(server_ptr); - return server->Loop(); + return server->Loop(new_data, new_data_size_bytes); } } // extern "C" diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 0ac2a014d858..1b12d8f341ff 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -28,8 +28,11 @@ #include #include +#include +#include #include #include +#include #include #include @@ -45,75 +48,135 @@ namespace micro_rpc { class CallbackWriteStream : public WriteStream { public: - explicit CallbackWriteStream(PackedFunc fsend) : fsend_{fsend} {} + explicit CallbackWriteStream(PackedFunc fsend, ::std::chrono::microseconds write_timeout) + : fsend_{fsend}, write_timeout_{write_timeout} {} ssize_t Write(const uint8_t* data, size_t data_size_bytes) override { TVMByteArray bytes; bytes.data = (const char*)data; bytes.size = data_size_bytes; - int64_t n = fsend_(bytes); + int64_t n = fsend_(bytes, write_timeout_.count()); return n; } void PacketDone(bool is_valid) override {} + void SetWriteTimeout(::std::chrono::microseconds timeout) { write_timeout_ = timeout; } + private: PackedFunc fsend_; + ::std::chrono::microseconds write_timeout_; }; class MicroTransportChannel : public RPCChannel { public: - MicroTransportChannel(PackedFunc fsend, PackedFunc frecv) - : write_stream_{fsend}, + enum class State : uint8_t { + kReset = 0, // state entered before the transport has been read or written to. 
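+    // (Transition sketch, inferred from HandleMessageReceivedCb below rather than stated in
+    // this change: kReset -> kSessionTerminated on a terminate message seen before the session
+    // comes up; kReset or kSessionTerminated -> kSessionEstablished on kStartSessionReply.)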
+ kSessionTerminated = 1, // session is terminated, but transport is alive. + kSessionEstablished = 2, // session is alive. + }; + + MicroTransportChannel(PackedFunc fsend, PackedFunc frecv, + ::std::chrono::microseconds session_start_retry_timeout, + ::std::chrono::microseconds session_start_timeout, + ::std::chrono::microseconds session_established_timeout) + : state_{State::kReset}, + session_start_retry_timeout_{session_start_retry_timeout}, + session_start_timeout_{session_start_timeout}, + session_established_timeout_{session_established_timeout}, + write_stream_{fsend, session_start_timeout}, framer_{&write_stream_}, receive_buffer_{new uint8_t[TVM_CRT_MAX_PACKET_SIZE_BYTES], TVM_CRT_MAX_PACKET_SIZE_BYTES}, - session_{0x5b, &framer_, &receive_buffer_, &HandleMessageReceivedCb, this}, + session_{0x5c, &framer_, &receive_buffer_, &HandleMessageReceivedCb, this}, unframer_{session_.Receiver()}, did_receive_message_{false}, frecv_{frecv}, message_buffer_{nullptr} {} - size_t ReceiveUntil(TypedPackedFunc pf) { + bool ReceiveUntil(TypedPackedFunc pf, ::std::chrono::microseconds timeout) { size_t bytes_received = 0; if (pf()) { - return 0; + return true; } + auto end_time = ::std::chrono::steady_clock::now() + timeout; for (;;) { while (pending_chunk_.size() > 0) { size_t bytes_consumed = 0; int unframer_error = unframer_.Write((const uint8_t*)pending_chunk_.data(), pending_chunk_.size(), &bytes_consumed); - CHECK(bytes_consumed <= pending_chunk_.size()); + CHECK(bytes_consumed <= pending_chunk_.size()) + << "consumed " << bytes_consumed << " want <= " << pending_chunk_.size(); pending_chunk_ = pending_chunk_.substr(bytes_consumed); bytes_received += bytes_consumed; if (unframer_error < 0) { LOG(ERROR) << "unframer got error code: " << unframer_error; } else { if (pf()) { - return bytes_received; + return true; } } } - std::string chunk = frecv_(128); + ::std::string chunk; + if (timeout != ::std::chrono::microseconds::zero()) { + ::std::chrono::microseconds iter_timeout{ + ::std::max(::std::chrono::microseconds{0}, + ::std::chrono::duration_cast<::std::chrono::microseconds>( + end_time - ::std::chrono::steady_clock::now()))}; + chunk = frecv_(128, iter_timeout.count()).operator std::string(); + } else { + chunk = frecv_(128, nullptr).operator std::string(); + } pending_chunk_ = chunk; - CHECK(pending_chunk_.size() != 0) << "zero-size chunk encountered"; - CHECK_GT(pending_chunk_.size(), 0); + if (pending_chunk_.size() == 0) { + // Timeout occurred + return false; + } } } - void StartSession() { - CHECK_EQ(kTvmErrorNoError, session_.Initialize()); - CHECK_EQ(kTvmErrorNoError, session_.StartSession()); - ReceiveUntil([this]() -> bool { return session_.IsEstablished(); }); + bool StartSession() { + CHECK(state_ == State::kReset) + << "MicroSession: state_: expected kReset, got " << uint8_t(state_); + + ::std::chrono::steady_clock::time_point start_time = ::std::chrono::steady_clock::now(); + auto session_start_end_time = start_time + session_start_timeout_; + + ::std::chrono::steady_clock::time_point end_time; + if (session_start_retry_timeout_ != ::std::chrono::microseconds::zero()) { + end_time = start_time + session_start_retry_timeout_; + } else { + end_time = session_start_end_time; + } + while (!session_.IsEstablished()) { + CHECK_EQ(kTvmErrorNoError, session_.Initialize()); + CHECK_EQ(kTvmErrorNoError, session_.StartSession()); + + ::std::chrono::microseconds time_remaining = ::std::max( + ::std::chrono::microseconds{0}, ::std::chrono::duration_cast<::std::chrono::microseconds>( + 
end_time - ::std::chrono::steady_clock::now())); + + if (!ReceiveUntil([this]() -> bool { return session_.IsEstablished(); }, time_remaining)) { + if (end_time >= session_start_end_time) { + break; + } + end_time += session_start_retry_timeout_; + } + } + + if (session_.IsEstablished()) { + write_stream_.SetWriteTimeout(session_established_timeout_); + } + + return session_.IsEstablished(); } size_t Send(const void* data, size_t size) override { const uint8_t* data_bytes = static_cast(data); - ssize_t ret = session_.SendMessage(MessageType::kNormal, data_bytes, size); - CHECK(ret == 0) << "SendMessage returned " << ret; + tvm_crt_error_t err = session_.SendMessage(MessageType::kNormal, data_bytes, size); + CHECK(err == kTvmErrorNoError) << "SendMessage returned " << err; return size; } @@ -134,7 +197,14 @@ class MicroTransportChannel : public RPCChannel { } did_receive_message_ = false; - ReceiveUntil([this]() -> bool { return did_receive_message_; }); + if (!ReceiveUntil([this]() -> bool { return did_receive_message_; }, + session_established_timeout_)) { + std::stringstream ss; + ss << "MicroSessionTimeoutError: failed to read reply message after timeout " + << session_established_timeout_.count() / 1e6 << "s"; + + throw std::runtime_error(ss.str()); + } } return num_bytes_recv; @@ -158,11 +228,21 @@ class MicroTransportChannel : public RPCChannel { size_t message_size_bytes; switch (message_type) { case MessageType::kStartSessionInit: + break; + case MessageType::kStartSessionReply: + state_ = State::kSessionEstablished; break; case MessageType::kTerminateSession: - LOG(FATAL) << "SessionTerminatedError: remote side has probably reset"; + if (state_ == State::kReset) { + state_ = State::kSessionTerminated; + } else if (state_ == State::kSessionTerminated) { + LOG(FATAL) << "SessionTerminatedError: multiple session-terminated messages received; " + "device in reboot loop?"; + } else if (state_ == State::kSessionEstablished) { + LOG(FATAL) << "SessionTerminatedError: remote device terminated connection"; + } break; case MessageType::kLog: @@ -189,6 +269,10 @@ class MicroTransportChannel : public RPCChannel { } } + State state_; + ::std::chrono::microseconds session_start_retry_timeout_; + ::std::chrono::microseconds session_start_timeout_; + ::std::chrono::microseconds session_established_timeout_; CallbackWriteStream write_stream_; Framer framer_; FrameBuffer receive_buffer_; @@ -201,8 +285,16 @@ class MicroTransportChannel : public RPCChannel { }; TVM_REGISTER_GLOBAL("micro._rpc_connect").set_body([](TVMArgs args, TVMRetValue* rv) { - MicroTransportChannel* micro_channel = new MicroTransportChannel(args[1], args[2]); - micro_channel->StartSession(); + MicroTransportChannel* micro_channel = + new MicroTransportChannel(args[1], args[2], ::std::chrono::microseconds(uint64_t(args[3])), + ::std::chrono::microseconds(uint64_t(args[4])), + ::std::chrono::microseconds(uint64_t(args[5]))); + if (!micro_channel->StartSession()) { + std::stringstream ss; + ss << "MicroSessionTimeoutError: session start handshake failed after " << double(args[4]) / 1e6 + << "s"; + throw std::runtime_error(ss.str()); + } std::unique_ptr channel(micro_channel); auto ep = RPCEndpoint::Create(std::move(channel), args[0], ""); auto sess = CreateClientSession(ep); diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index 60aa732fa6a0..8a90bb3745ca 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -124,6 +124,9 @@ "docs/_static/img/tvm-logo-square.png", # pytest 
config "pytest.ini", + # Zephyr tests + "tests/micro/qemu/zephyr-runtime/prj.conf", + "tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386", } diff --git a/tests/micro/qemu/.gitignore b/tests/micro/qemu/.gitignore new file mode 100644 index 000000000000..1066e164f0eb --- /dev/null +++ b/tests/micro/qemu/.gitignore @@ -0,0 +1,2 @@ +/test_zephyr-workspace +/*.micro-binary diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py new file mode 100644 index 000000000000..67943ce2896f --- /dev/null +++ b/tests/micro/qemu/test_zephyr.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import contextlib +import copy +import datetime +import glob +import os +import subprocess + +import numpy as np + +import tvm +import tvm.rpc +import tvm.micro +import tvm.relay + +from tvm.micro.contrib import zephyr +from tvm.contrib import util + +BUILD = True +DEBUG = False + + +TARGET = tvm.target.target.micro("host") + + +def _make_sess_from_op(op_name, sched, arg_bufs): + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + mod = tvm.build(sched, arg_bufs, TARGET, target_host=TARGET, name=op_name) + + return _make_session(mod) + + +def _make_session(mod): + prev_build = f"{os.path.splitext(__file__)[0]}-last-build.micro-binary" + test_name = os.path.splitext(os.path.abspath(__file__))[0] + workspace_root = ( + f'{test_name}-workspace/{datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")}' + ) + workspace_parent = os.path.dirname(workspace_root) + if not os.path.exists(workspace_parent): + os.makedirs(workspace_parent) + workspace = tvm.micro.Workspace(debug=True, root=workspace_root) + + project_dir = os.path.join(os.path.dirname(__file__) or ".", "zephyr-runtime") + compiler = zephyr.ZephyrCompiler( + project_dir=project_dir, + board="qemu_x86", + zephyr_toolchain_variant="zephyr", + ) + + opts = tvm.micro.default_options(f"{project_dir}/crt") + # TODO(weberlo) verify this is necessary + opts["bin_opts"]["ccflags"] = ["-std=gnu++14"] + opts["lib_opts"]["ccflags"] = ["-std=gnu++14"] + + flasher_kw = {} + if DEBUG: + flasher_kw["debug_rpc_session"] = tvm.rpc.connect("127.0.0.1", 9090) + + session_kw = { + "flasher": compiler.flasher(**flasher_kw), + } + + if BUILD: + session_kw["binary"] = tvm.micro.build_static_runtime( + # the x86 compiler *expects* you to give the exact same dictionary for both + # lib_opts and bin_opts. so the library compiler is mutating lib_opts and + # the binary compiler is expecting those mutations to be in bin_opts. 
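+            # (Descriptive aside: opts["lib_opts"] and opts["bin_opts"] are distinct dicts
+            # here, so mutations made by the library compiler are not visible to the binary
+            # compiler; that mismatch is what the TODO that follows refers to.)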
+ # TODO(weberlo) fix this very bizarre behavior + workspace, + compiler, + mod, + lib_opts=opts["lib_opts"], + bin_opts=opts["bin_opts"], + ) + if os.path.exists(prev_build): + os.unlink(prev_build) + session_kw["binary"].archive(prev_build, metadata_only=True) + else: + unarchive_dir = util.tempdir() + session_kw["binary"] = tvm.micro.MicroBinary.unarchive( + prev_build, unarchive_dir.relpath("binary") + ) + + return tvm.micro.Session(**session_kw) + + +def _make_add_sess(): + A = tvm.te.placeholder((2,), dtype="int8") + B = tvm.te.placeholder((1,), dtype="int8") + C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C") + sched = tvm.te.create_schedule(C.op) + return _make_sess_from_op("add", sched, [A, B, C]) + + +def _make_ident_sess(): + A = tvm.te.placeholder((2,), dtype="int8") + B = tvm.te.compute(A.shape, lambda i: A[i], name="B") + sched = tvm.te.create_schedule(B.op) + return _make_sess_from_op("ident", sched, [A, B]) + + +def test_compile_runtime(): + """Test compiling the on-device runtime.""" + + # NOTE: run test in a nested function so cPython will delete arrays before closing the session. + def test_basic_add(sess): + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + assert (A_data.asnumpy() == np.array([2, 3])).all() + B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + assert (B_data.asnumpy() == np.array([4])).all() + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + assert (C_data.asnumpy() == np.array([0, 0])).all() + + system_lib = sess.get_system_lib() + system_lib.get_function("add")(A_data, B_data, C_data) + assert (C_data.asnumpy() == np.array([6, 7])).all() + + with _make_add_sess() as sess: + test_basic_add(sess) + + +if __name__ == "__main__": + import logging + + logging.basicConfig(level=logging.DEBUG) + test_compile_runtime() diff --git a/tests/micro/qemu/zephyr-runtime/.gitignore b/tests/micro/qemu/zephyr-runtime/.gitignore new file mode 100644 index 000000000000..64be5d3a487c --- /dev/null +++ b/tests/micro/qemu/zephyr-runtime/.gitignore @@ -0,0 +1,3 @@ +__tvm* +libtvm__* +/build diff --git a/tests/micro/qemu/zephyr-runtime/CMakeLists.txt b/tests/micro/qemu/zephyr-runtime/CMakeLists.txt new file mode 100644 index 000000000000..ce5605469fcb --- /dev/null +++ b/tests/micro/qemu/zephyr-runtime/CMakeLists.txt @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.13.1) + +set(ENV{QEMU_BIN_PATH} "${CMAKE_SOURCE_DIR}/qemu-hack") + +set(QEMU_PIPE "\${QEMU_PIPE}") # QEMU_PIPE is set by the calling TVM instance. 
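+# (The qemu-hack entry placed on QEMU_BIN_PATH above points at a wrapper script, included
+# later in this patch, that strips the -pidfile option before invoking the real
+# qemu-system-i386.)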
+ +find_package(Zephyr HINTS $ENV{ZEPHYR_BASE}) +project(microtvm_zephyr_runtime) + + +set(CMAKE_VERBOSE_MAKEFILE ON) +file(GLOB TVM_SOURCES ${CMAKE_SOURCE_DIR}/__tvm*.c) +target_sources(app PRIVATE src/main.c ${TVM_SOURCES}) + +foreach(tvm_lib ${TVM_LIBS}) + string(LENGTH ${tvm_lib} tvm_lib_length) + math(EXPR tvm_lib_cut "${tvm_lib_length} - 2") + string(SUBSTRING ${tvm_lib} 3 ${tvm_lib_cut} tvm_lib_name) + add_library(${tvm_lib_name} STATIC IMPORTED) + set_target_properties(${tvm_lib_name} PROPERTIES + IMPORTED_LOCATION ${CMAKE_SOURCE_DIR}/${tvm_lib}) + target_link_libraries(app PRIVATE ${tvm_lib_name}) +endforeach(tvm_lib ${TVM_LIBS}) + +target_include_directories(app PRIVATE ${TVM_INCLUDE_DIRS}) diff --git a/tests/micro/qemu/zephyr-runtime/crt/crt_config.h b/tests/micro/qemu/zephyr-runtime/crt/crt_config.h new file mode 100644 index 000000000000..a7f4f90b0538 --- /dev/null +++ b/tests/micro/qemu/zephyr-runtime/crt/crt_config.h @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/runtime/crt_config.h.template + * \brief Template for CRT configuration, to be modified on each target. + */ +#ifndef TVM_RUNTIME_CRT_CONFIG_H_ +#define TVM_RUNTIME_CRT_CONFIG_H_ + +#include + +/*! Log level of the CRT runtime */ +#define TVM_CRT_LOG_LEVEL TVM_CRT_LOG_LEVEL_DEBUG + +/*! Maximum supported dimension in NDArray */ +#define TVM_CRT_MAX_NDIM 6 + +/*! Maximum supported arguments in generated functions */ +#define TVM_CRT_MAX_ARGS 10 + +/*! Size of the global function registry, in bytes. */ +#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 200 + +/*! Maximum number of registered modules. */ +#define TVM_CRT_MAX_REGISTERED_MODULES 2 + +/*! Maximum packet size, in bytes, including the length header. */ +#define TVM_CRT_MAX_PACKET_SIZE_BYTES 8192 + +/*! Maximum supported string length in dltype, e.g. "int8", "int16", "float32" */ +#define TVM_CRT_MAX_STRLEN_DLTYPE 10 + +/*! Maximum supported string length in function names */ +#define TVM_CRT_MAX_STRLEN_FUNCTION_NAME 80 + +/*! \brief Maximum length of a PackedFunc function name. */ +#define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30 + +/*! \brief Log2 of the page size (bytes) for a virtual memory page. */ +#define TVM_CRT_PAGE_BITS 10 // 1 kB + +/*! \brief Number of pages on device. */ +#define TVM_CRT_MAX_PAGES 300 + +//#define TVM_CRT_FRAMER_ENABLE_LOGS + +#endif // TVM_RUNTIME_CRT_CONFIG_H_ diff --git a/tests/micro/qemu/zephyr-runtime/prj.conf b/tests/micro/qemu/zephyr-runtime/prj.conf new file mode 100644 index 000000000000..cebb55756e8c --- /dev/null +++ b/tests/micro/qemu/zephyr-runtime/prj.conf @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# For UART implementation in main().
+CONFIG_RING_BUFFER=y
+CONFIG_UART_CONSOLE=n
+CONFIG_UART_INTERRUPT_DRIVEN=y
+
+# For RPC server C++ bindings.
+CONFIG_CPLUSPLUS=y
+CONFIG_NEWLIB_LIBC=y
+
+# For models with floating point.
+CONFIG_FPU=y
+
+# For TVMPlatformAbort().
+CONFIG_REBOOT=y
diff --git a/tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386 b/tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386
new file mode 100755
index 000000000000..a0bf0f2c4dee
--- /dev/null
+++ b/tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386
@@ -0,0 +1,33 @@
+#!/bin/bash -e
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Zephyr insists on running qemu with a -pidfile option, but that option doesn't appear to
+# work given the way we've configured docker (the underlying filesystem doesn't support the
+# file locking it needs to). This script strips any -pidfile option, then invokes qemu.
+
+ARGS=( "$(basename $0)" )
+while [ "$#" -gt 0 ]; do
+    if [ "$1" == "-pidfile" ]; then
+        shift
+    else
+        ARGS=( "${ARGS[@]}" "$1" )
+    fi
+    shift
+done
+
+"${ARGS[@]}"
diff --git a/tests/micro/qemu/zephyr-runtime/sample.yaml b/tests/micro/qemu/zephyr-runtime/sample.yaml
new file mode 100644
index 000000000000..88616b4acc40
--- /dev/null
+++ b/tests/micro/qemu/zephyr-runtime/sample.yaml
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +sample: + description: uTVM RPC Server unit test + name: utvm rpc server +common: + tags: introduction diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/tests/micro/qemu/zephyr-runtime/src/main.c new file mode 100644 index 000000000000..19e72e1c076d --- /dev/null +++ b/tests/micro/qemu/zephyr-runtime/src/main.c @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_ARCH_POSIX +#include "posix_board_if.h" +#endif + +#include "crt_config.h" + +K_SEM_DEFINE(tx_sem, 0, 1); + +static const struct device* tvm_uart; + +int write_hook(int c) { + uart_poll_out(tvm_uart, c); + return 0; +} + +ssize_t write_serial(void* unused_context, const uint8_t* data, size_t size) { + for (size_t i = 0; i < size; i++) { + uart_poll_out(tvm_uart, data[i]); + } + + return size; +} + +void TVMPlatformAbort(tvm_crt_error_t error) { + sys_reboot(SYS_REBOOT_COLD); + for (;;) + ; +} + +uint32_t g_utvm_start_time; + +#define MILLIS_TIL_EXPIRY 200 +#define TIME_TIL_EXPIRY (K_MSEC(MILLIS_TIL_EXPIRY)) +K_TIMER_DEFINE(g_utvm_timer, /* expiry func */ NULL, /* stop func */ NULL); + +int g_utvm_timer_running = 0; + +#ifdef CONFIG_LED +/* The devicetree node identifier for the "led0" alias. */ +#define LED0_NODE DT_ALIAS(led0) + +#define LED0 DT_GPIO_LABEL(LED0_NODE, gpios) +#define PIN DT_GPIO_PIN(LED0_NODE, gpios) +#define FLAGS DT_GPIO_FLAGS(LED0_NODE, gpios) + +static struct device* led_pin; +#endif // CONFIG_LED + +int TVMPlatformTimerStart() { + if (g_utvm_timer_running) { + TVMLogf("timer already running"); + return -1; + } + +#ifdef CONFIG_LED + gpio_pin_set(led_pin, PIN, 1); +#endif + k_timer_start(&g_utvm_timer, TIME_TIL_EXPIRY, TIME_TIL_EXPIRY); + g_utvm_start_time = k_cycle_get_32(); + g_utvm_timer_running = 1; + return 0; +} + +int TVMPlatformTimerStop(double* res_us) { + if (!g_utvm_timer_running) { + TVMLogf("timer not running"); + return -1; + } + + uint32_t stop_time = k_cycle_get_32(); +#ifdef CONFIG_LED + gpio_pin_set(led_pin, PIN, 0); +#endif + + // compute how long the work took + uint32_t cycles_spent = stop_time - g_utvm_start_time; + if (stop_time < g_utvm_start_time) { + // we rolled over *at least* once, so correct the rollover it was *only* + // once, because we might still use this result + cycles_spent = ~((uint32_t)0) - (g_utvm_start_time - stop_time); + } + + uint32_t ns_spent = (uint32_t)k_cyc_to_ns_floor64(cycles_spent); + double hw_clock_res_us = ns_spent / 1000.0; + + // need to grab time remaining *before* stopping. when stopped, this function + // always returns 0. 
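+  // NOTE: the coarse path below reconstructs elapsed time from
+  // k_timer_remaining_get() (milliseconds until the next expiry) plus
+  // k_timer_status_get() (number of expiries since the status was last read),
+  // for the case where the 32-bit cycle counter may have wrapped more than once.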
+ int32_t time_remaining_ms = k_timer_remaining_get(&g_utvm_timer); + k_timer_stop(&g_utvm_timer); + // check *after* stopping to prevent extra expiries on the happy path + if (time_remaining_ms < 0) { + TVMLogf("negative time remaining"); + return -1; + } + uint32_t num_expiries = k_timer_status_get(&g_utvm_timer); + uint32_t timer_res_ms = ((num_expiries * MILLIS_TIL_EXPIRY) + time_remaining_ms); + double approx_num_cycles = + (double)k_ticks_to_cyc_floor32(1) * (double)k_ms_to_ticks_ceil32(timer_res_ms); + // if we approach the limits of the HW clock datatype (uint32_t), use the + // coarse-grained timer result instead + if (approx_num_cycles > (0.5 * (~((uint32_t)0)))) { + *res_us = timer_res_ms * 1000.0; + } else { + *res_us = hw_clock_res_us; + } + + g_utvm_timer_running = 0; + return 0; +} + +#define WORKSPACE_SIZE_BYTES (120 * 1024) +#define WORKSPACE_PAGE_SIZE_BYTES_LOG2 8 + +uint8_t workspace[WORKSPACE_SIZE_BYTES]; + +#define RING_BUF_SIZE 512 +struct uart_rx_buf_t { + struct ring_buf buf; + uint32_t buffer[RING_BUF_SIZE]; +}; + +struct uart_rx_buf_t uart_rx_buf; + +void uart_irq_cb(const struct device* dev, void* user_data) { + while (uart_irq_update(dev) && uart_irq_is_pending(dev)) { + struct uart_rx_buf_t* buf = (struct uart_rx_buf_t*)user_data; + if (uart_irq_rx_ready(dev) == 0) { + continue; + } + + uint8_t data[32]; + for (;;) { + int bytes_read = uart_fifo_read(dev, data, sizeof(data)); + if (bytes_read < 0) { + TVMPlatformAbort(0xbeef); + } else if (bytes_read == 0) { + break; + } + int bytes_written = ring_buf_put(&buf->buf, data, bytes_read); + CHECK_EQ(bytes_read, bytes_written, "bytes_read: %d; bytes_written: %d", bytes_read, + bytes_written); + } + } +} + +void uart_rx_init(struct uart_rx_buf_t* buf, const struct device* dev) { + ring_buf_init(&buf->buf, RING_BUF_SIZE, buf->buffer); + uart_irq_callback_user_data_set(dev, uart_irq_cb, (void*)buf); + uart_irq_rx_enable(dev); +} + +int uart_rx_buf_read(struct uart_rx_buf_t* buf, uint8_t* data, size_t data_size_bytes) { + unsigned int key = irq_lock(); + int bytes_read = ring_buf_get(&buf->buf, data, data_size_bytes); + irq_unlock(key); + return bytes_read; +} + +extern void __stdout_hook_install(int (*hook)(int)); +void main(void) { +#ifdef CONFIG_LED + led_pin = device_get_binding(LED0); + if (led_pin == NULL) { + for (;;) + ; + } + int ret = gpio_pin_configure(led_pin, PIN, GPIO_OUTPUT_ACTIVE | FLAGS); + if (ret < 0) { + for (;;) + ; + } + gpio_pin_set(led_pin, PIN, 0); +#endif + + /* Claim console device */ + tvm_uart = device_get_binding(DT_LABEL(DT_CHOSEN(zephyr_console))); + uart_rx_init(&uart_rx_buf, tvm_uart); + __stdout_hook_install(&write_hook); + + utvm_rpc_server_t server = UTvmRpcServerInit(workspace, WORKSPACE_SIZE_BYTES, + WORKSPACE_PAGE_SIZE_BYTES_LOG2, write_serial, NULL); + TVMLogf("uTVM On-Device Runtime"); + + while (true) { + uint8_t buf[256]; + int bytes_read = uart_rx_buf_read(&uart_rx_buf, buf, sizeof(buf)); + if (bytes_read > 0) { + size_t bytes_remaining = bytes_read; + uint8_t* cursor = buf; + while (bytes_remaining > 0) { + tvm_crt_error_t err = UTvmRpcServerLoop(server, &cursor, &bytes_remaining); + if (err != kTvmErrorNoError && err != kTvmErrorFramingShortPacket) { + TVMPlatformAbort(err); + } + } + } + } + +#ifdef CONFIG_ARCH_POSIX + posix_exit(0); +#endif +} diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 9893a6449d96..502304d92d30 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -29,7 +29,6 @@ import tvm 
import tvm.relay
 import tvm.micro
-from tvm.micro import transport
 from tvm.topi.util import get_const_tuple
 from tvm.topi.testing import conv2d_nchw_python
 
@@ -110,7 +109,7 @@ def test_reset():
     try:
         sess._rpc.get_function("tvm.testing.reset_server")()
         assert False, "expected to raise SessionTerminatedError; did not raise"
-    except transport.SessionTerminatedError:
+    except tvm.micro.SessionTerminatedError:
         pass
 
 
diff --git a/tests/python/unittest/test_micro_artifact.py b/tests/python/unittest/test_micro_artifact.py
new file mode 100644
index 000000000000..39fea16c0a02
--- /dev/null
+++ b/tests/python/unittest/test_micro_artifact.py
@@ -0,0 +1,137 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Unit tests for the artifact module."""
+
+import json
+import os
+import shutil
+
+from tvm.contrib import util
+from tvm.micro import artifact
+
+
+FILE_LIST = ["label1", "label2", "label12", "unlabelled"]
+
+
+TEST_METADATA = {"foo": "bar"}
+
+
+TEST_LABELS = {"label1": ["label1", "label12"], "label2": ["label2", "label12"]}
+
+
+def build_artifact(artifact_path, immobile=False):
+    os.mkdir(artifact_path)
+
+    for f in FILE_LIST:
+        with open(os.path.join(artifact_path, f), "w") as lib_f:
+            lib_f.write(f"{f}\n")
+
+    sub_dir = os.path.join(artifact_path, "sub_dir")
+    os.mkdir(sub_dir)
+    os.symlink("label1", os.path.join(artifact_path, "rel_symlink"))
+    os.symlink(os.path.join(artifact_path, "label2"), os.path.join(artifact_path, "abs_symlink"))
+    os.symlink(
+        os.path.join(artifact_path, "sub_dir"), os.path.join(artifact_path, "abs_dir_symlink")
+    )
+
+    art = artifact.Artifact(artifact_path, TEST_LABELS, TEST_METADATA, immobile=immobile)
+
+    return art
+
+
+def test_basic_functionality():
+    temp_dir = util.tempdir()
+    artifact_path = temp_dir.relpath("foo")
+    art = build_artifact(artifact_path)
+
+    assert art.abspath("bar") == os.path.join(artifact_path, "bar")
+
+    for label, paths in TEST_LABELS.items():
+        assert art.label(label) == paths
+        assert art.label_abspath(label) == [os.path.join(artifact_path, p) for p in paths]
+
+
+def test_archive():
+    temp_dir = util.tempdir()
+    art = build_artifact(temp_dir.relpath("foo"))
+
+    # Create archive
+    archive_path = art.archive(temp_dir.temp_dir)
+    assert archive_path == temp_dir.relpath("foo.tar")
+
+    # Inspect created archive
+    unpack_dir = temp_dir.relpath("unpack")
+    os.mkdir(unpack_dir)
+    shutil.unpack_archive(archive_path, unpack_dir)
+
+    for path in FILE_LIST:
+        with open(os.path.join(unpack_dir, "foo", path)) as f:
+            assert f.read() == f"{path}\n"
+
+    with open(os.path.join(unpack_dir, "foo", "metadata.json")) as metadata_f:
+        metadata = json.load(metadata_f)
+
+    assert metadata["version"] == 2
+    assert metadata["labelled_files"] == TEST_LABELS
+    assert metadata["metadata"] == TEST_METADATA
+
+    # Unarchive and verify basic 
functionality + unarchive_base_dir = temp_dir.relpath("unarchive") + unarch = artifact.Artifact.unarchive(archive_path, unarchive_base_dir) + + assert unarch.metadata == TEST_METADATA + assert unarch.labelled_files == TEST_LABELS + for f in FILE_LIST: + assert os.path.exists(os.path.join(unarchive_base_dir, f)) + + +def test_metadata_only(): + temp_dir = util.tempdir() + base_dir = temp_dir.relpath("foo") + art = build_artifact(base_dir) + + artifact_path = art.archive(temp_dir.relpath("foo.artifact"), metadata_only=True) + unarch_base_dir = temp_dir.relpath("bar") + unarch = artifact.Artifact.unarchive(artifact_path, unarch_base_dir) + assert unarch.base_dir == base_dir + + for p in unarch.label_abspath("label1") + unarch.label_abspath("label2"): + assert os.path.exists(p) + + os.unlink(art.abspath("label1")) + with open(art.abspath("label2"), "w+") as f: + f.write("changed line\n") + + try: + artifact.Artifact.unarchive(artifact_path, os.path.join(temp_dir.temp_dir, "bar2")) + assert False, "unarchive should raise error" + except artifact.ArchiveModifiedError as err: + assert str(err) == ( + "Files in metadata-only archive have been modified:\n" + " * label1: original file not found\n" + " * label2: sha256 mismatch: expected " + "6aa3c5668c8794c791400e19ecd7123949ded1616eafb0395acdd2d896354e83, got " + "ed87db21670a81819d65eccde87c5ae0243b2b61783bf77e9b27993be9a3eca0" + ) + + +if __name__ == "__main__": + test_basic_functionality() + test_archive() + test_metadata_only() + # TODO: tests for dir symlinks, symlinks out of bounds, loading malformed artifact tars. diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh index f5332ef34606..7fb8d471a53a 100755 --- a/tests/scripts/task_python_microtvm.sh +++ b/tests/scripts/task_python_microtvm.sh @@ -18,3 +18,12 @@ set -e set -u + +source tests/scripts/setup-pytest-env.sh + +# cleanup pycache +find . 
-type f -path "*.pyc" | xargs rm -f + +TVM_FFI=ctypes python3 -m pytest tests/micro/qemu +make cython3 +TVM_FFI=cython python3 -m pytest tests/micro/qemu From 85c1f2a0b309f259daabfbf3737e04c5bee1b7cb Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Thu, 15 Oct 2020 07:10:13 -0600 Subject: [PATCH 008/258] int32 pooling with int64 shapes (#6687) * Failing tests for Int32 avg_pooling with Int64 shapes * fix pooling implementations --- include/tvm/topi/nn/pooling.h | 42 +++++++---- tests/python/relay/test_op_grad_level2.py | 75 +++++++++++-------- tests/python/relay/test_op_level10.py | 22 +++--- tests/python/relay/test_op_level2.py | 87 ++++++++++++----------- 4 files changed, 133 insertions(+), 93 deletions(-) diff --git a/include/tvm/topi/nn/pooling.h b/include/tvm/topi/nn/pooling.h index 935d399a6604..2396fc25c23f 100644 --- a/include/tvm/topi/nn/pooling.h +++ b/include/tvm/topi/nn/pooling.h @@ -75,8 +75,8 @@ inline Tensor pool_impl(const Tensor& x, const Array& kernel_size, auto stride_height = cast(DataType::DataType::Int(32), stride_size[0]); auto stride_width = cast(DataType::DataType::Int(32), stride_size[1]); - auto height = x->shape[height_axis]; - auto width = x->shape[width_axis]; + auto height = cast(DataType::DataType::Int(32), x->shape[height_axis]); + auto width = cast(DataType::DataType::Int(32), x->shape[width_axis]); auto pad_top = cast(DataType::DataType::Int(32), padding_size[0]); auto pad_left = cast(DataType::DataType::Int(32), padding_size[1]); @@ -107,6 +107,9 @@ inline Tensor pool_impl(const Tensor& x, const Array& kernel_size, auto dwidth = tvm::te::reduce_axis(Range(0, kernel_width)); Array out_shape = x->shape; + for (size_t i = 0; i < out_shape.size(); ++i) { + out_shape.Set(i, cast(DataType::DataType::Int(32), out_shape[i])); + } out_shape.Set(height_axis, out_height); out_shape.Set(width_axis, out_width); @@ -189,8 +192,8 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x, auto stride_height = cast(DataType::DataType::Int(32), stride_size[0]); auto stride_width = cast(DataType::DataType::Int(32), stride_size[1]); - auto height = x->shape[height_axis]; - auto width = x->shape[width_axis]; + auto height = cast(DataType::DataType::Int(32), x->shape[height_axis]); + auto width = cast(DataType::DataType::Int(32), x->shape[width_axis]); auto pad_top = cast(DataType::DataType::Int(32), padding_size[0]); auto pad_left = cast(DataType::DataType::Int(32), padding_size[1]); @@ -220,7 +223,12 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x, auto dheight = tvm::te::reduce_axis(Range(0, kernel_height)); auto dwidth = tvm::te::reduce_axis(Range(0, kernel_width)); - Array out_shape = x->shape; + Array data_shape = x->shape; + for (size_t i = 0; i < data_shape.size(); ++i) { + data_shape.Set(i, cast(DataType::DataType::Int(32), data_shape[i])); + } + + Array out_shape = data_shape; out_shape.Set(height_axis, out_height); out_shape.Set(width_axis, out_width); @@ -232,7 +240,7 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x, ((padding_h1 && *padding_h1) || (padding_w1 && *padding_w1)); if (pool_type == kMaxPool) { - Array ravel_shape{x->shape.begin(), x->shape.end()}; + Array ravel_shape{data_shape.begin(), data_shape.end()}; ravel_shape.Set(height_axis, ravel_shape[height_axis] + pad_top + pad_bottom); ravel_shape.Set(width_axis, ravel_shape[width_axis] + pad_left + pad_right); @@ -257,7 +265,7 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x, auto mp_inds = mp_argmax[0]; return 
tvm::te::compute( - x->shape, + data_shape, [&](const Array& inds) { Array pad_inds{inds.begin(), inds.end()}; pad_inds.Set(height_axis, pad_inds[height_axis] + pad_top); @@ -288,7 +296,7 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x, tvm::te::reduce_axis(Range(0, (kernel_height + stride_height - 1) / stride_height)); auto windoww = tvm::te::reduce_axis(Range(0, (kernel_width + stride_width - 1) / stride_width)); return tvm::te::compute( - x->shape, + data_shape, [&](const Array& inds) { PrimExpr pad_h_idx = inds[height_axis] + pad_top; PrimExpr pad_w_idx = inds[width_axis] + pad_left; @@ -483,10 +491,14 @@ inline Tensor adaptive_pool_impl(const Tensor& x, const Array& output_ const auto n_dim = output_size.size(); CHECK_EQ(axes.size(), n_dim) << "The number of axes not equal to the in/out dimension"; - Array out_shape = x->shape; + Array data_shape = x->shape; + for (size_t i = 0; i < data_shape.size(); ++i) { + data_shape.Set(i, cast(DataType::DataType::Int(32), data_shape[i])); + } + Array out_shape = data_shape; Array in_size, out_size; for (size_t i = 0; i < n_dim; ++i) { - in_size.push_back(x->shape[axes[i]]); + in_size.push_back(data_shape[axes[i]]); out_size.push_back(cast(DataType::Int(32), output_size[i])); out_shape.Set(axes[i], out_size[i]); } @@ -661,7 +673,11 @@ inline Tensor pool_impl_nd(const Tensor& x, const Array& kernel_size, std::vector pad_tail(k_size); Array pad_before(std::vector(x_size, 0)); Array pad_after(std::vector(x_size, 0)); - Array out_shape = x->shape; + Array data_shape = x->shape; + for (size_t i = 0; i < data_shape.size(); ++i) { + data_shape.Set(i, cast(DataType::DataType::Int(32), data_shape[i])); + } + Array out_shape = data_shape; bool do_pad = false; for (int i = 0; i < k_size; i++) { @@ -687,7 +703,7 @@ inline Tensor pool_impl_nd(const Tensor& x, const Array& kernel_size, arith::Analyzer analyzer; auto out_dim = analyzer.Simplify( - indexdiv(x->shape[ii] - kernel[i] + pad_head[i] + pad_tail[i], stride[i]) + 1); + indexdiv(data_shape[ii] - kernel[i] + pad_head[i] + pad_tail[i], stride[i]) + 1); out_shape.Set(ii, out_dim); } @@ -746,7 +762,7 @@ inline Tensor pool_impl_nd(const Tensor& x, const Array& kernel_size, for (int i = 0; i < k_size; i++) { int ii = axis[i]; start[i] = output[ii] * stride[i] - pad_head[i]; - end[i] = min(start[i] + kernel[i], x->shape[ii]); + end[i] = min(start[i] + kernel[i], data_shape[ii]); start[i] = max(start[i], make_const(DataType::Int(32), 0)); kernel_size *= (end[i] - start[i]); } diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index 34bbf9e60b3a..85332da64221 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -66,39 +66,43 @@ def test_max_pool2d_grad(): ) -def verify_avg_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode, count_include_pad): - x = relay.var("x", relay.TensorType(x_shape, "float32")) - y = tvm.relay.nn.avg_pool2d( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ) - - fwd_func = relay.Function([x], y) - fwd_func = run_infer_type(fwd_func) - bwd_func = run_infer_type(gradient(fwd_func)) +def verify_avg_pool2d_grad( + x_shape, pool_size, strides, padding, ceil_mode, count_include_pad, dtype="float32" +): + + for shape_dtype in ["int32", "int64"]: + x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in x_shape], dtype=dtype) + y = tvm.relay.nn.avg_pool2d( + x, + 
pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) - data = np.random.rand(*x_shape).astype("float32") - ph, pw = padding - y_shape = topi.util.get_const_tuple(fwd_func.ret_type.shape) - out_grad = np.ones(shape=y_shape) - ref_grad = tvm.topi.testing.pool_grad_nchw( - data, - out_grad, - pool_size=pool_size, - strides=strides, - padding=[ph, pw, ph, pw], - pool_type="avg", - ceil_mode=ceil_mode, - ) + fwd_func = relay.Function([x], y) + fwd_func = run_infer_type(fwd_func) + bwd_func = run_infer_type(gradient(fwd_func)) + + data = np.random.rand(*x_shape).astype(dtype) + ph, pw = padding + y_shape = topi.util.get_const_tuple(fwd_func.ret_type.shape) + out_grad = np.ones(shape=y_shape) + ref_grad = tvm.topi.testing.pool_grad_nchw( + data, + out_grad, + pool_size=pool_size, + strides=strides, + padding=[ph, pw, ph, pw], + pool_type="avg", + ceil_mode=ceil_mode, + ) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) - op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) - np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) + for target, ctx in tvm.testing.enabled_targets(): + intrp = relay.create_executor(ctx=ctx, target=target) + op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) + np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @tvm.testing.uses_gpu @@ -119,6 +123,15 @@ def test_avg_pool2d_grad(): ceil_mode=False, count_include_pad=False, ) + verify_avg_pool2d_grad( + (1, 4, 16, 16), + pool_size=(1, 1), + strides=(1, 1), + padding=(1, 1), + ceil_mode=False, + count_include_pad=False, + dtype="int32", + ) def verify_global_avg_pool2d_grad(x_shape): diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index bc565682d932..3ec1a5bb6129 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -425,17 +425,18 @@ def verify_ndarray_size(shape): def verify_adaptive_pool(dshape, out_size, pool_type, layout, dtype, opfunc): - x = relay.var("x", relay.TensorType(dshape, "float32")) - y = opfunc(x, out_size, layout) - func = relay.Function([x], y) + for shape_dtype in ["int32", "int64"]: + x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in dshape], dtype=dtype) + y = opfunc(x, out_size, layout) + func = relay.Function([x], y) - np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype) - np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type, layout) + np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype) + np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type, layout) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - relay_out = intrp1.evaluate(func)(np_data) - tvm.testing.assert_allclose(relay_out.asnumpy(), np_out, rtol=1e-5, atol=1e-5) + for target, ctx in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + relay_out = intrp1.evaluate(func)(np_data) + tvm.testing.assert_allclose(relay_out.asnumpy(), np_out, rtol=1e-5, atol=1e-5) def verify_adaptive_pool2d(dshape, out_size, pool_type, layout="NCHW", dtype="float32"): @@ -452,6 +453,7 @@ def verify_adaptive_pool3d(dshape, out_size, pool_type, layout="NCHW", dtype="fl def test_adaptive_pool(): verify_adaptive_pool2d((1, 9, 224, 224), (1, 1), "max") verify_adaptive_pool2d((1, 3, 224, 224), (2, 3), "avg") + 
verify_adaptive_pool2d((1, 3, 224, 224), (2, 3), "avg", dtype="int32") verify_adaptive_pool2d((1, 14, 56, 78), (34, 13), "max") verify_adaptive_pool2d((1, 5, 46, 97), (4, 96), "avg") verify_adaptive_pool2d((1, 224, 224, 3), (1, 1), "max", layout="NHWC") @@ -459,6 +461,8 @@ def test_adaptive_pool(): verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "max", layout="NCDHW") verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NCDHW") verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NDHWC") + verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NCDHW", dtype="int32") + verify_adaptive_pool3d((1, 16, 32, 32, 32), (1, 1, 1), "avg", layout="NDHWC", dtype="int32") verify_adaptive_pool3d((1, 16, 32, 32, 32), (2, 4, 4), "max", layout="NDHWC") diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index c25c2bf48ca7..546ea6019e56 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -959,15 +959,16 @@ def _test_pool2d_int(opfunc, reffunc, dtype): # test execution dtype = "int32" dshape = (1, 3, 28, 28) - x = relay.var("x", shape=dshape, dtype=dtype) - y = opfunc(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0)) - func = relay.Function([x], y) - data = np.random.randint(low=-128, high=128, size=dshape) - ref_res = reffunc(data.reshape(1, 3, 14, 2, 14, 2), axis=(3, 5)).astype(dtype) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - op_res1 = intrp1.evaluate(func)(data) - tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) + for shape_dtype in ["int32", "int64"]: + x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in dshape], dtype=dtype) + y = opfunc(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0)) + func = relay.Function([x], y) + data = np.random.randint(low=-128, high=128, size=dshape) + ref_res = reffunc(data.reshape(1, 3, 14, 2, 14, 2), axis=(3, 5)).astype(dtype) + for target, ctx in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) def _test_global_pool2d(opfunc, reffunc): @@ -1010,7 +1011,7 @@ def test_pool2d(): @tvm.testing.uses_gpu def test_pool1d(): - def _test_pool1d(opfunc, pool_size=(2,), strides=(2,), padding=(0, 0)): + def _test_pool1d(opfunc, pool_size=(2,), strides=(2,), padding=(0, 0), dtype="float32"): n, c, w = te.var("n"), 10, 224 x = relay.var("x", relay.TensorType((n, c, w), "float32")) y = opfunc(x, pool_size=(1,)) @@ -1018,24 +1019,26 @@ def _test_pool1d(opfunc, pool_size=(2,), strides=(2,), padding=(0, 0)): yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, 10, 224), "float32") # test execution - dtype = "float32" dshape = (1, 3, 32) - x = relay.var("x", shape=dshape) - pool_type = "max" if "max" in str(opfunc) else "avg" - y = opfunc(x, pool_size=pool_size, strides=strides, padding=padding) - func = relay.Function([x], y) - data = np.random.uniform(size=dshape).astype(dtype) - ref_res = tvm.topi.testing.pool1d_ncw_python( - data, (2,), (2,), (0, 0), (1, 3, 16), pool_type, False - ) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - op_res1 = intrp1.evaluate(func)(data) - tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) + for shape_dtype in ["int32", 
"int64"]: + x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in dshape], dtype=dtype) + pool_type = "max" if "max" in str(opfunc) else "avg" + y = opfunc(x, pool_size=pool_size, strides=strides, padding=padding) + func = relay.Function([x], y) + data = np.random.uniform(size=dshape).astype(dtype) + ref_res = tvm.topi.testing.pool1d_ncw_python( + data, (2,), (2,), (0, 0), (1, 3, 16), pool_type, False + ) + for target, ctx in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) _test_pool1d(relay.nn.max_pool1d) + _test_pool1d(relay.nn.max_pool1d, dtype="int32") _test_pool1d(relay.nn.max_pool1d, pool_size=2, strides=2, padding=0) _test_pool1d(relay.nn.avg_pool1d) + _test_pool1d(relay.nn.avg_pool1d, dtype="int32") _test_pool1d(relay.nn.avg_pool1d, pool_size=2, strides=2, padding=0) @@ -1047,6 +1050,7 @@ def _test_pool3d( strides=(2, 2, 2), padding=(0, 0, 0, 0, 0, 0), out_shape=(1, 3, 16, 16, 16), + dtype="float32", ): n, c, d, h, w = te.size_var("n"), 10, 5, 224, 224 x = relay.var("x", relay.TensorType((n, c, d, h, w), "float32")) @@ -1057,30 +1061,33 @@ def _test_pool3d( # test execution dtype = "float32" dshape = (1, 3, 32, 32, 32) - x = relay.var("x", shape=dshape) - pool_type = "max" if "max" in str(opfunc) else "avg" - y = opfunc(x, pool_size=pool_size, strides=strides, padding=padding) - func = relay.Function([x], y) - # check output shape - f_out_shape = tuple(map(lambda x: int(x), run_infer_type(func).ret_type.shape)) - assert out_shape == f_out_shape, "Output shape mismatch. expected {}, actual {}".format( - out_shape, f_out_shape - ) - data = np.random.uniform(size=dshape).astype(dtype) - ref_res = tvm.topi.testing.pool3d_ncdhw_python( - data, pool_size, strides, padding, out_shape, pool_type, False - ) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - op_res1 = intrp1.evaluate(func)(data) - tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) + for shape_dtype in ["int32", "int64"]: + x = relay.var("x", shape=[tvm.tir.IntImm(shape_dtype, x) for x in dshape], dtype=dtype) + pool_type = "max" if "max" in str(opfunc) else "avg" + y = opfunc(x, pool_size=pool_size, strides=strides, padding=padding) + func = relay.Function([x], y) + # check output shape + f_out_shape = tuple(map(lambda x: int(x), run_infer_type(func).ret_type.shape)) + assert out_shape == f_out_shape, "Output shape mismatch. 
expected {}, actual {}".format( + out_shape, f_out_shape + ) + data = np.random.uniform(size=dshape).astype(dtype) + ref_res = tvm.topi.testing.pool3d_ncdhw_python( + data, pool_size, strides, padding, out_shape, pool_type, False + ) + for target, ctx in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) _test_pool3d(relay.nn.max_pool3d) + _test_pool3d(relay.nn.max_pool3d, dtype="int32") _test_pool3d(relay.nn.max_pool3d, padding=(2, 0, 0, 2, 0, 0), out_shape=(1, 3, 18, 16, 16)) _test_pool3d(relay.nn.max_pool3d, padding=(0, 3, 0, 0, 3, 0), out_shape=(1, 3, 16, 19, 16)) _test_pool3d(relay.nn.max_pool3d, padding=(0, 0, 4, 0, 0, 4), out_shape=(1, 3, 16, 16, 20)) _test_pool3d(relay.nn.max_pool3d, pool_size=2, padding=0, strides=2) _test_pool3d(relay.nn.avg_pool3d) + _test_pool3d(relay.nn.avg_pool3d, dtype="int32") _test_pool3d(relay.nn.avg_pool3d, padding=(2, 0, 0, 2, 0, 0), out_shape=(1, 3, 18, 16, 16)) _test_pool3d(relay.nn.avg_pool3d, padding=(0, 3, 0, 0, 3, 0), out_shape=(1, 3, 16, 19, 16)) _test_pool3d(relay.nn.avg_pool3d, padding=(0, 0, 4, 0, 0, 4), out_shape=(1, 3, 16, 16, 20)) From 7e431e5defdaf62c0063b941dbe1d93a74f348ef Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Thu, 15 Oct 2020 17:12:12 -0700 Subject: [PATCH 009/258] [Docker] Update CI CPU and GPU images based on new Docker build files. (#6690) * Turn on Rust docs and MxNet based ResNet * Add deps needed for Rust examples and docs * Setup Rust path * Bump Jenkinsfile * Fix broken version setting, which instead redirects stdout and stderr * Update Jenkinsfile * Format * Disable Rust change for now * Completely disable ResNet * Reset test changes * Remove temporary labels * Remove patch needed for docs --- Jenkinsfile | 4 ++-- docker/Dockerfile.ci_cpu | 4 ++++ docker/Dockerfile.ci_gpu | 7 +++++++ docker/install/ubuntu_install_darknet.sh | 7 ++----- docker/install/ubuntu_install_dgl.sh | 0 docker/install/ubuntu_install_sphinx.sh | 2 +- rust/Cargo.toml | 1 - 7 files changed, 16 insertions(+), 9 deletions(-) mode change 100644 => 100755 docker/install/ubuntu_install_dgl.sh diff --git a/Jenkinsfile b/Jenkinsfile index 207d12c21d6d..0f59de70680a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,8 +45,8 @@ // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> ci_lint = "tlcpack/ci-lint:v0.62" -ci_gpu = "tlcpack/ci-gpu:v0.64" -ci_cpu = "tlcpack/ci-cpu:v0.66" +ci_gpu = "tlcpack/ci-gpu:v0.65" +ci_cpu = "tlcpack/ci-cpu:v0.67" ci_wasm = "tlcpack/ci-wasm:v0.60" ci_i386 = "tlcpack/ci-i386:v0.52" ci_qemu = "tlcpack/ci-qemu:v0.01" diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu index 4823488a731a..44eec4d6319c 100644 --- a/docker/Dockerfile.ci_cpu +++ b/docker/Dockerfile.ci_cpu @@ -36,6 +36,10 @@ RUN bash /install/ubuntu1804_install_llvm.sh COPY install/ubuntu_install_dnnl.sh /install/ubuntu_install_dnnl.sh RUN bash /install/ubuntu_install_dnnl.sh +# Install MxNet for access to Gluon Model Zoo. 
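+# (needed for the MxNet-based ResNet work and the docs builds noted in this
+# patch's description)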
+COPY install/ubuntu_install_mxnet.sh /install/ubuntu_install_mxnet.sh
+RUN bash /install/ubuntu_install_mxnet.sh
+
 # Rust env (build early; takes a while)
 COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
 RUN bash /install/ubuntu_install_rust.sh
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 1197d8e4c7b6..bf2e21394f36 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -83,6 +83,13 @@ RUN bash /install/ubuntu_install_dgl.sh
 COPY install/ubuntu_install_vulkan.sh /install/ubuntu_install_vulkan.sh
 RUN bash /install/ubuntu_install_vulkan.sh
 
+# Rust env (build early; takes a while)
+COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
+RUN bash /install/ubuntu_install_rust.sh
+ENV RUSTUP_HOME /opt/rust
+ENV CARGO_HOME /opt/rust
+ENV PATH $PATH:$CARGO_HOME/bin
+
 # AutoTVM deps
 COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
 RUN bash /install/ubuntu_install_redis.sh
diff --git a/docker/install/ubuntu_install_darknet.sh b/docker/install/ubuntu_install_darknet.sh
index c48724c6065b..37adf4a30270 100755
--- a/docker/install/ubuntu_install_darknet.sh
+++ b/docker/install/ubuntu_install_darknet.sh
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License. You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -23,7 +23,4 @@ set -o pipefail
 #install the necessary dependencies, cffi, opencv
 wget -q 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so
 debian_version=`cat /etc/debian_version`
-if [ "$debian_version" == "stretch/sid" ]; then
-    pip2 install opencv-python cffi
-fi
 pip3 install opencv-python cffi
diff --git a/docker/install/ubuntu_install_dgl.sh b/docker/install/ubuntu_install_dgl.sh
old mode 100644
new mode 100755
diff --git a/docker/install/ubuntu_install_sphinx.sh b/docker/install/ubuntu_install_sphinx.sh
index 2555164e2292..33757a0d4d57 100755
--- a/docker/install/ubuntu_install_sphinx.sh
+++ b/docker/install/ubuntu_install_sphinx.sh
@@ -20,4 +20,4 @@ set -e
 set -u
 set -o pipefail
 
-pip3 install sphinx sphinx-gallery==0.4.0 autodocsumm sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image commonmark>=0.7.3 docutils>=0.11
+pip3 install sphinx sphinx-gallery==0.4.0 autodocsumm sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image "commonmark>=0.7.3" "docutils>=0.11"
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index 28312a5e73dc..9935ce7c8b9f 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -23,7 +23,6 @@ members = [
     "tvm",
     "tvm/tests/basics",
     "tvm/tests/callback",
-    "tvm/examples/resnet",
     "tvm-graph-rt",
     "tvm-graph-rt/tests/test_tvm_basic",
     "tvm-graph-rt/tests/test_tvm_dso",

From 9429c669a31edc7f4fa29f39b9823403fa3eec20 Mon Sep 17 00:00:00 2001
From: Tristan Konolige 
Date: Thu, 15 Oct 2020 18:22:58 -0700
Subject: [PATCH 010/258] [FIX,MICROTVM] Skip microtvm tests if microtvm is not built (#6693)

---
 python/tvm/testing.py             | 17 +++++++++++++++++
 tests/python/unittest/test_crt.py | 14 +++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/python/tvm/testing.py b/python/tvm/testing.py
index 20e968b328f6..51fa2d0d7def 100644
--- a/python/tvm/testing.py
+++ b/python/tvm/testing.py
@@ -617,6 +617,23 @@ 
def requires_llvm(*args): return _compose(args, _requires_llvm) +def requires_micro(*args): + """Mark a test as requiring microTVM to run. + + Parameters + ---------- + f : function + Function to mark + """ + _requires_micro = [ + pytest.mark.skipif( + tvm.support.libinfo().get("USE_MICRO", "OFF") != "ON", + reason="MicroTVM support not enabled. Set USE_MICRO=ON in config.cmake to enable.", + ) + ] + return _compose(args, _requires_micro) + + def _target_to_requirement(target): # mapping from target to decorator if target.startswith("cuda"): diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 502304d92d30..04018daca478 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -28,7 +28,6 @@ import tvm import tvm.relay -import tvm.micro from tvm.topi.util import get_const_tuple from tvm.topi.testing import conv2d_nchw_python @@ -84,8 +83,11 @@ def _make_ident_sess(workspace): return _make_sess_from_op(workspace, "ident", sched, [A, B]) +@tvm.testing.requires_micro def test_compile_runtime(): """Test compiling the on-device runtime.""" + import tvm.micro + workspace = tvm.micro.Workspace() with _make_add_sess(workspace) as sess: @@ -101,8 +103,12 @@ def test_compile_runtime(): assert (C_data.asnumpy() == np.array([6, 7])).all() +@tvm.testing.requires_micro def test_reset(): """Test when the remote end resets during a session.""" + import tvm.micro + from tvm.micro import transport + workspace = tvm.micro.Workspace() with _make_add_sess(workspace) as sess: @@ -113,8 +119,11 @@ def test_reset(): pass +@tvm.testing.requires_micro def test_graph_runtime(): """Test use of the graph runtime with microTVM.""" + import tvm.micro + workspace = tvm.micro.Workspace() relay_mod = tvm.parser.fromtext( """ @@ -143,8 +152,11 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), uint8]) { assert (out.asnumpy() == np.array([6, 10])).all() +@tvm.testing.requires_micro def test_std_math_functions(): """Verify that standard math functions can be used.""" + import tvm.micro + workspace = tvm.micro.Workspace() A = tvm.te.placeholder((2,), dtype="float32", name="A") B = tvm.te.compute(A.shape, lambda i: tvm.te.exp(A[i]), name="B") From 97038345a50b1a3944ff4e1806d9515ef3123748 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Thu, 15 Oct 2020 19:39:25 -0700 Subject: [PATCH 011/258] [TFLite] Fix detection of crop in convert_batch_to_space_nd (#6670) --- python/tvm/relay/frontend/tflite.py | 2 +- tests/python/frontend/tflite/test_forward.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 1b09cf307554..f52c318c8e97 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -2601,7 +2601,7 @@ def convert_batch_to_space_nd(self, op): cropped = reshaped_permuted for axis in range(1, M + 1): crop = crops[axis - 1] - if (crop != [0, 0]).all(): + if (crop != [0, 0]).any(): indices = _op.arange( _expr.const(crop[0]), _expr.const(reshaped_permuted_shape[axis] - crop[1]), diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 27980047e909..caa41806c8aa 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -706,6 +706,8 @@ def test_forward_batch_to_space_nd(): _test_batch_to_space_nd(input_shape=[4, 2, 2, 1], block_shape=[2, 2], crops=[[0, 0], [0, 0]]) + _test_batch_to_space_nd(input_shape=[4, 3, 3, 1], 
block_shape=[2, 2], crops=[[0, 1], [0, 1]]) + ###################################################################### # SpaceToBatchND From 334f97a7f09bd46ac59dea97245f6075af681e08 Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Thu, 15 Oct 2020 22:27:11 -0700 Subject: [PATCH 012/258] Fix tutorial broken by Docker build (#6694) --- tutorials/frontend/build_gcn.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tutorials/frontend/build_gcn.py b/tutorials/frontend/build_gcn.py index 5c571ef1ff25..b832d18f9c3a 100644 --- a/tutorials/frontend/build_gcn.py +++ b/tutorials/frontend/build_gcn.py @@ -242,7 +242,9 @@ def GraphConv(layer_name, input_dim, output_dim, adj, input, norm=None, bias=Tru def prepare_params(g, data): params = {} - params["infeats"] = data.features.astype("float32") # Only support float32 as feature for now + params["infeats"] = data.features.numpy().astype( + "float32" + ) # Only support float32 as feature for now # Generate adjacency matrix adjacency = nx.to_scipy_sparse_matrix(g) @@ -350,5 +352,7 @@ def prepare_params(g, data): acc = evaluate(data, logits_tvm) print("Test accuracy of TVM results: {:.2%}".format(acc)) +import tvm.testing + # Verify the results with the DGL model tvm.testing.assert_allclose(logits_torch, logits_tvm, atol=1e-3) From 708e1fab6be35783df5d3c4639f6870d8c2d9cbe Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 16 Oct 2020 19:07:45 +0900 Subject: [PATCH 013/258] [Torch, Quantization] Necessary workaround to prepare for 1.6 update (#6602) * add support for 1.6 quantized models * fix lint * move version check function to a common utils * fix lint Co-authored-by: masa --- python/tvm/relay/frontend/pytorch.py | 15 +- python/tvm/relay/frontend/pytorch_utils.py | 25 +++ python/tvm/relay/frontend/qnn_torch.py | 186 ++++++++++++++------- 3 files changed, 153 insertions(+), 73 deletions(-) create mode 100644 python/tvm/relay/frontend/pytorch_utils.py diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 8e626f52d528..c8fbd5a5c10c 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -42,18 +42,11 @@ from ..prelude import Prelude, StaticTensorArrayOps from . 
import qnn_torch +from .pytorch_utils import is_version_greater_than __all__ = ["from_pytorch"] -def _is_version_greater_than(ver): - import torch - from packaging import version - - # Torch version > 1.4 changed upsampling API - return version.parse(torch.__version__) > version.parse(ver) - - # List ADT utilities def _infer_type_with_prelude(val, prelude): body = _infer_type(val, prelude.mod) @@ -1882,7 +1875,7 @@ def func(x): if _is_quantized_tensor(data, prelude): # Torch version > 1.4 changed upsampling API - if _is_version_greater_than("1.4.0"): + if is_version_greater_than("1.4.0"): num_inputs = 7 else: num_inputs = 5 @@ -2714,7 +2707,7 @@ def _run_jit_passes(graph): """ The inline pass is necessary to unwrap prim::CallMethod """ import torch - if _is_version_greater_than("1.5.0"): + if is_version_greater_than("1.5.0"): # This is required for torchvision detection models from 1.6 above # It is the same as _jit_pass_inline, except that it has some special # case behaviors for some ops such as aten::__interpolate() @@ -3069,8 +3062,6 @@ def convert_params(graph, state_dict): full_attr_node_name = _get_output_name(getattrs[-1]) if full_attr.endswith("_packed_params"): # for quantized models - err_msg = "parameter %s not found in state dict" % full_attr - assert full_attr in state_dict, err_msg packed_param_map[full_attr_node_name] = full_attr elif full_attr in state_dict: if full_attr in vars_by_name: diff --git a/python/tvm/relay/frontend/pytorch_utils.py b/python/tvm/relay/frontend/pytorch_utils.py new file mode 100644 index 000000000000..e0c8f8da7d62 --- /dev/null +++ b/python/tvm/relay/frontend/pytorch_utils.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=import-outside-toplevel +""" Common utilities used by PyTorch frontend """ + + +def is_version_greater_than(ver): + import torch + from packaging import version + + return version.parse(torch.__version__) > version.parse(ver) diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py index 121307385d7e..ca67391cebc7 100644 --- a/python/tvm/relay/frontend/qnn_torch.py +++ b/python/tvm/relay/frontend/qnn_torch.py @@ -26,6 +26,8 @@ from tvm.relay import op as _op from tvm.relay.frontend.common import infer_shape +from .pytorch_utils import is_version_greater_than + class QNNParam: """ A placeholder for weight quantization parameters """ @@ -46,59 +48,95 @@ def __init__(self, weight, bias, scale, zero_point, param_key): self.zero_point = _expr.const(zero_point, dtype="int32") -def _unpack_quant_params(param_name, packed_params, unpack_func): - # Torch stores quantized params in a custom packed format, - # need to unpack and retrieve them as numpy arrays - qweight, bias = unpack_func(packed_params) - weight_np = qweight.dequantize().numpy() +class ConvPackedParam(QNNParam): + """A placeholder for quantized conv2d op attributes + As of PyTorch 1.6, attributes of quantized conv2d ops, like + stride, padding etc are stored in ConvPackedParams objects, + together with weights and quantization parameters + """ + + def __init__( + self, weight_np, bias, scale, zero_point, param_name, stride, padding, dilation, groups + ): + super().__init__(weight_np, bias, scale, zero_point, param_name) + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + +def _get_quant_params(qweight): import torch + weight_np = qweight.dequantize().numpy() + if qweight.qscheme() == torch.per_tensor_affine: - param = QNNParam( - weight_np, bias, qweight.q_scale(), int(qweight.q_zero_point()), param_name - ) - else: - scales = qweight.q_per_channel_scales().numpy() - zero_points = qweight.q_per_channel_zero_points().numpy() - # This is an assumption posed by QNN - msg = "The values of zero points should be all zero for per channel" - assert np.all(zero_points == 0), msg - param = QNNParam(weight_np, bias, scales, 0, param_name) + return weight_np, qweight.q_scale(), int(qweight.q_zero_point()) + + scales = qweight.q_per_channel_scales().numpy() + zero_points = qweight.q_per_channel_zero_points().numpy() + # This is an assumption posed by QNN + msg = "The values of zero points should be all zero for per channel" + assert np.all(zero_points == 0), msg + return weight_np, scales, 0 + - return param +def make_qnn_param(param_name, qweight, bias): + weight_np, scale, zero_point = _get_quant_params(qweight) + return QNNParam(weight_np, bias, scale, zero_point, param_name) + + +def make_conv_packed_param(param_name, qweight, bias, packed_params): + weight_np, scale, zero_point = _get_quant_params(qweight) + stride = packed_params.stride() + padding = packed_params.padding() + dilation = packed_params.dilation() + groups = packed_params.groups() + return ConvPackedParam( + weight_np, bias, scale, zero_point, param_name, stride, padding, dilation, groups + ) def get_weight_quant_params(script_module): """ Retrive and unpack weight parameters from quantized modules """ - conv_packed_params = [] - linear_packed_params = [] - import torch - # conv and linear requires different unpacking function - # extract all conv and linear parameters separately to distinguish them - for name, m in script_module.named_modules(): - if isinstance(m, 
torch.jit.RecursiveScriptModule): - if "Conv" in m.original_name: - conv_packed_params.append((name, m.state_dict())) - elif m.original_name == "LinearPackedParams": - linear_packed_params.append((name, m.state_dict())) + param_name = "_packed_params" + quant_params = {} + + def filter_func(named_module): + m = named_module[1] + return isinstance(m, torch.jit.RecursiveScriptModule) and ( + ("Conv" in m.original_name) or (m.original_name == "LinearPackedParams") + ) - pairs = [ - (torch.ops.quantized.conv2d_unpack, conv_packed_params), - (torch.ops.quantized.linear_unpack, linear_packed_params), - ] + for name, m in filter(filter_func, script_module.named_modules()): + key = name + "." + param_name + state_dict = m.state_dict() - quant_params = {} - param_name = "_packed_params" - for unpack_func, params in pairs: - for name, state_dict in params: + if len(state_dict) == 0 and not hasattr(m, param_name): + # for v1.6 and above + # This case seems to happen if a model is serialized + # and loaded back + # This module can be safely ignored + continue + + if len(state_dict) == 0 and hasattr(m, param_name): + # for v1.6 and above + packed_params = m._packed_params + else: assert len(state_dict) == 1 - assert param_name in state_dict - key = name + "." + param_name - packed_param = state_dict[param_name] - quant_params[key] = _unpack_quant_params(key, packed_param, unpack_func) + packed_params = list(state_dict.values())[0] + + if "Conv" in m.original_name and len(state_dict) == 0: + qweight, bias = torch.ops.quantized.conv2d_unpack(packed_params) + quant_params[key] = make_conv_packed_param(key, qweight, bias, packed_params) + elif "Conv" in m.original_name: + qweight, bias = torch.ops.quantized.conv2d_unpack(packed_params) + quant_params[key] = make_qnn_param(key, qweight, bias) + elif m.original_name == "LinearPackedParams": + qweight, bias = torch.ops.quantized.linear_unpack(packed_params) + quant_params[key] = make_qnn_param(key, qweight, bias) return quant_params @@ -113,8 +151,12 @@ def add_quant_params_to_outputs(outputs, packed_param_map, quant_params): qweight = relay.qnn.op.quantize( qparam.weight_var, qparam.scale, qparam.zero_point, out_dtype="int8", axis=0 ) - param_tup = (qweight, qparam.scale, qparam.zero_point, qparam.bias_var) - outputs[node_name] = param_tup + params = [qweight, qparam.scale, qparam.zero_point, qparam.bias_var] + + if isinstance(quant_params[packed_param_name], ConvPackedParam): + params += [qparam.stride, qparam.padding, qparam.dilation, qparam.groups] + + outputs[node_name] = params def _get_quant_param_for_input(input_value): @@ -129,10 +171,17 @@ def _get_quant_param_for_input(input_value): # Indices for output scale and zp # For example, in quantized::conv2d(%input, %1, %2, %3, %4, %5, %6, %7), # 6th and 7th arg are output scale and zp respectively. 
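+    # In 1.6+ the signature collapses to
+    # quantized::conv2d(%input, %packed_params, %output_scale, %output_zero_point),
+    # because stride/padding/dilation/groups live inside the packed params, so
+    # the output scale and zp move to argument positions 2 and 3.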
+ + # PyTorch 1.6 changed qconv API + if is_version_greater_than("1.5.0"): + qconv_indices = (2, 3) + else: + qconv_indices = (6, 7) + output_quant_param_indices = { "aten::quantize_per_tensor": (1, 2), - "quantized::conv2d": (6, 7), - "quantized::conv2d_relu": (6, 7), + "quantized::conv2d": qconv_indices, + "quantized::conv2d_relu": qconv_indices, "quantized::linear": (2, 3), "quantized::linear_relu": (2, 3), "quantized::add_relu": (2, 3), @@ -458,24 +507,40 @@ def _impl(inputs, _): # inputs[7]: output_zero_point # inputs[8]: input_scale (added manually by frontend) # inputs[9]: input_zero_point (added manually by frontend) - weight = inputs[1][0] - weight_scale = inputs[1][1] - weight_zero_point = inputs[1][2] - - output_scale = _expr.const(inputs[6]) - output_zero_point = _expr.const(inputs[7]) + conv_params = inputs[1] + weight = conv_params[0] + weight_scale = conv_params[1] + weight_zero_point = conv_params[2] + bias = conv_params[3] + + if len(conv_params) > 4: + # Torch 1.6 or newer case + strides = conv_params[4] + padding = conv_params[5] + dilation = conv_params[6] + groups = conv_params[7] + + output_scale = _expr.const(inputs[2]) + output_zero_point = _expr.const(inputs[3]) + + assert len(inputs) == 6, "Input quant params not found in op inputs" + + # These are manually added by add_input_quant_params_to_op_inputs above + # In torch, they are retrieved from QTensor data structure at runtime + input_scale = _expr.const(inputs[4]) + input_zero_point = _expr.const(inputs[5]) + else: + strides = inputs[2] + padding = inputs[3] + dilation = inputs[4] + groups = inputs[5] + output_scale = _expr.const(inputs[6]) + output_zero_point = _expr.const(inputs[7]) - assert len(inputs) == 10, "Input quant params not found in op inputs" - # These are manually added by add_input_quant_params_to_op_inputs above - # In torch, they are retrieved from QTensor data structure at runtime - input_scale = _expr.const(inputs[8]) - input_zero_point = _expr.const(inputs[9]) + assert len(inputs) == 10, "Input quant params not found in op inputs" - strides, padding, dilation = inputs[2], inputs[3], inputs[4] - strides = inputs[2] - padding = inputs[3] - dilation = inputs[4] - groups = inputs[5] + input_scale = _expr.const(inputs[8]) + input_zero_point = _expr.const(inputs[9]) weight_shape = infer_shape(weight) kernel_size = (weight_shape[2], weight_shape[3]) @@ -507,11 +572,10 @@ def _impl(inputs, _): groups=groups, channels=out_channels, ) - bias_var = inputs[1][3] return _do_bias_and_requantize( conv_out, - bias_var, + bias, input_scale, weight_scale, output_scale, From 72e79f4ad7c1fe4289f2ae8db201b3b15be6fc98 Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Fri, 16 Oct 2020 23:47:27 +0800 Subject: [PATCH 014/258] [Relay] Change some passes to mix mode (#6695) --- src/relay/analysis/util.cc | 8 +++++-- src/relay/analysis/well_formed.cc | 16 ++++++-------- src/relay/ir/expr_functor.cc | 4 +++- src/relay/transforms/de_duplicate.cc | 6 +++-- src/relay/transforms/fold_constant.cc | 32 +++++++++++++-------------- 5 files changed, 36 insertions(+), 30 deletions(-) diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc index 59ce01ce4227..edf8fb644c57 100644 --- a/src/relay/analysis/util.cc +++ b/src/relay/analysis/util.cc @@ -71,7 +71,7 @@ class TypeVarTVisitor : public TypeVisitor { InsertionSet* bound_type_vars_; }; -class TypeVarEVisitor : private ExprVisitor { +class TypeVarEVisitor : private MixedModeVisitor { public: explicit TypeVarEVisitor(const IRModule& mod) : mod_(mod) {} @@ -131,6 +131,8 
@@ class TypeVarEVisitor : private ExprVisitor { return CollectAll(); } + using MixedModeVisitor::VisitExpr_; + void VisitExpr_(const FunctionNode* f) final { for (const auto& tp : f->type_params) { type_vars_.Insert(tp); @@ -159,7 +161,7 @@ class TypeVarEVisitor : private ExprVisitor { const IRModule& mod_; }; -class VarVisitor : protected ExprVisitor, protected PatternVisitor { +class VarVisitor : protected MixedModeVisitor, protected PatternVisitor { public: Array Free(const Expr& expr) { this->VisitExpr(expr); @@ -204,6 +206,8 @@ class VarVisitor : protected ExprVisitor, protected PatternVisitor { vars_.Insert(v); } + using MixedModeVisitor::VisitExpr_; + void VisitExpr_(const VarNode* var) final { vars_.Insert(GetRef(var)); } void VisitExpr_(const FunctionNode* op) final { diff --git a/src/relay/analysis/well_formed.cc b/src/relay/analysis/well_formed.cc index 3e409d10b885..5abbbc94fb36 100644 --- a/src/relay/analysis/well_formed.cc +++ b/src/relay/analysis/well_formed.cc @@ -32,7 +32,7 @@ namespace tvm { namespace relay { //! brief make sure each Var is bound at most once in a scope. -class WellFormedChecker : private ExprVisitor, PatternVisitor { +class WellFormedChecker : private MixedModeVisitor, PatternVisitor { public: Optional diag_ctx; Span occurs_in; @@ -79,6 +79,8 @@ class WellFormedChecker : private ExprVisitor, PatternVisitor { total_bound.insert(v); } + using MixedModeVisitor::VisitExpr_; + void VisitExpr_(const VarNode* op) final { Var v = GetRef(op); if (current_bound.count(v) == 0) { @@ -126,7 +128,7 @@ class WellFormedChecker : private ExprVisitor, PatternVisitor { // CHECK(call->attrs.defined()); CHECK(call->type_args.defined()); - ExprVisitor::VisitExpr_(call); + MixedModeVisitor::VisitExpr_(call); } void VisitClause(const Clause& c) final { @@ -139,18 +141,14 @@ class WellFormedChecker : private ExprVisitor, PatternVisitor { void VisitVar(const Var& v) final { Bound(v); } - void VisitExpr(const Expr& e) final { + public: + bool CheckWellFormed(const Expr& e) { if (auto v = e.as()) { VisitExpr_(v); } else { // this->occurs_in = e->span; - ExprVisitor::VisitExpr(e); + VisitExpr(e); } - } - - public: - bool CheckWellFormed(const Expr& e) { - this->VisitExpr(e); return well_formed; } }; diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc index cbc41d225d4b..a09179bcc585 100644 --- a/src/relay/ir/expr_functor.cc +++ b/src/relay/ir/expr_functor.cc @@ -517,10 +517,12 @@ TVM_REGISTER_GLOBAL("relay.analysis.post_order_visit").set_body_typed([](Expr ex }); // Implement bind. 
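+// Note: ExprBinder is reworked below into a MixedModeMutator, which rewrites
+// dataflow regions iteratively (post-order) rather than recursively, so deeply
+// nested expressions do not overflow the stack.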
-class ExprBinder : public ExprMutator, PatternMutator { +class ExprBinder : public MixedModeMutator, PatternMutator { public: explicit ExprBinder(const tvm::Map& args_map) : args_map_(args_map) {} + using MixedModeMutator::VisitExpr_; + Expr VisitExpr_(const LetNode* op) final { CHECK(!args_map_.count(op->var)) << "Cannot bind an internel variable in let"; return ExprMutator::VisitExpr_(op); diff --git a/src/relay/transforms/de_duplicate.cc b/src/relay/transforms/de_duplicate.cc index d90e5c584df3..8c62fe6100c3 100644 --- a/src/relay/transforms/de_duplicate.cc +++ b/src/relay/transforms/de_duplicate.cc @@ -31,7 +31,7 @@ namespace tvm { namespace relay { Expr DeDup(const Expr& e) { - class DeDupMutator : public TypeMutator, public ExprMutator, public PatternMutator { + class DeDupMutator : public TypeMutator, public MixedModeMutator, public PatternMutator { public: TypeVar Fresh(const TypeVar& tv) { TypeVar ret = TypeVar(tv->name_hint, tv->kind); @@ -47,12 +47,14 @@ Expr DeDup(const Expr& e) { return ret; } - Expr VisitExpr(const Expr& e) final { + Expr DispatchVisitExpr(const Expr& e) final { auto ret = ExprMutator::VisitExpr(e); ret->checked_type_ = e->checked_type_; return ret; } + using MixedModeMutator::VisitExpr_; + Expr VisitExpr_(const VarNode* op) final { Var v = GetRef(op); return rename_.count(v) != 0 ? rename_.at(v) : v; diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index 660aff2eed9a..8d2cba05be49 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -75,7 +75,7 @@ TVM_REGISTER_GLOBAL("relay.analysis.check_constant").set_body_typed(ConstantChec // TODO(tvm-team) consider combine dead-code with constant folder. // or make a more powerful partial evaluator. -class ConstantFolder : public ExprMutator { +class ConstantFolder : public MixedModeMutator { public: explicit ConstantFolder(IRModule module) : module_(module), @@ -89,6 +89,8 @@ class ConstantFolder : public ExprMutator { cast_op_(Op::Get("cast")), ndarray_size_op_(Op::Get("ndarray_size")) {} + using MixedModeMutator::VisitExpr_; + Expr VisitExpr_(const LetNode* op) final { Expr value = this->Mutate(op->value); if (value.as()) { @@ -118,7 +120,7 @@ class ConstantFolder : public ExprMutator { } } - Expr VisitExpr_(const CallNode* call) final { + Expr Rewrite_(const CallNode* call, const Expr& post) final { if (inside_primitive) { return GetRef(call); } @@ -127,26 +129,25 @@ class ConstantFolder : public ExprMutator { std::unordered_set skip_list{"zeros_like", "ones_like", "full_like", "full"}; auto origin_args = call->args; - Expr res = ExprMutator::VisitExpr_(call); - call = res.as(); + call = post.as(); // We don't constant fold function with zero arguments. // This is a heuristic that is useful. // For example it is harmful to fold ones(shape=(4, 5)). - if (call->args.size() == 0) return res; + if (call->args.size() == 0) return post; const OpNode* op = call->op.as(); - if (op == nullptr) return res; + if (op == nullptr) return post; if (skip_list.count(op->name)) { - return res; + return post; } // skip stateful ops. 
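+    // (a stateful op cannot be constant-folded: evaluating it at compile time
+    // would duplicate or drop its side effects)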
- if (op_stateful.get(GetRef(op), false)) return res; + if (op_stateful.get(GetRef(op), false)) return post; // Try to evaluate shape_of op if (call->op == shape_of_op_ || call->op == vm_shape_of_op_) { - return EvaluateShapeOf(res, origin_args, call->attrs); + return EvaluateShapeOf(post, origin_args, call->attrs); } if (call->op == ndarray_size_op_) { - return EvaluateNdarraySize(res, origin_args, call->attrs); + return EvaluateNdarraySize(post, origin_args, call->attrs); } // We should think about potentially constant evaluation over these ops too. @@ -162,19 +163,18 @@ class ConstantFolder : public ExprMutator { } } if (all_const_args) { - return ConstEvaluate(res); + return ConstEvaluate(post); } else { - return res; + return post; } } - Expr VisitExpr_(const TupleGetItemNode* op) final { - Expr res = ExprMutator::VisitExpr_(op); - op = res.as(); + Expr Rewrite_(const TupleGetItemNode* op, const Expr& post) final { + op = post.as(); if (const auto* tuple = op->tuple.as()) { return tuple->fields[op->index]; } else { - return res; + return post; } } From 5121ec68068e5e2060ab7cc9614e58e4c06519f7 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 16 Oct 2020 14:24:23 -0700 Subject: [PATCH 015/258] [LLVM][WINDOWS] Recover windows support for the latest LLVM (#6698) Windows COFF requires comdat information to support weak-like linkage(via any). This patch fixes the windows LLVM support after LLVM-8. --- CMakeLists.txt | 2 ++ apps/cpp_rpc/rpc_env.cc | 11 +++++++++-- apps/cpp_rpc/win32_process.h | 6 +++++- src/target/llvm/codegen_cpu.cc | 15 +++++++++++++++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7aacfcd4d9ef..33c720c4cce4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,6 +100,8 @@ if(MSVC) add_definitions(-D_SCL_SECURE_NO_WARNINGS) add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE) add_definitions(-DNOMINMAX) + # regeneration does not work well with msbuild custom rules. + set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /bigobj") diff --git a/apps/cpp_rpc/rpc_env.cc b/apps/cpp_rpc/rpc_env.cc index b76a1abbdadd..967274fd88a2 100644 --- a/apps/cpp_rpc/rpc_env.cc +++ b/apps/cpp_rpc/rpc_env.cc @@ -40,7 +40,6 @@ int mkdir(const char* path, int /* ignored */) { return _mkdir(path); } #include #include -#include "../../src/runtime/file_utils.h" #include "../../src/support/utils.h" #include "rpc_env.h" @@ -115,7 +114,15 @@ RPCEnv::RPCEnv() { std::string file_name = this->GetPath(args[0]); file_name = BuildSharedLibrary(file_name); std::string bin; - LoadBinaryFromFile(file_name, &bin); + + std::ifstream fs(file_name, std::ios::in | std::ios::binary); + CHECK(!fs.fail()) << "Cannot open " << file_name; + fs.seekg(0, std::ios::end); + size_t size = static_cast(fs.tellg()); + fs.seekg(0, std::ios::beg); + bin.resize(size); + fs.read(dmlc::BeginPtr(bin), size); + TVMByteArray binarr; binarr.data = bin.data(); binarr.size = bin.length(); diff --git a/apps/cpp_rpc/win32_process.h b/apps/cpp_rpc/win32_process.h index 621444e18764..0f784681f209 100644 --- a/apps/cpp_rpc/win32_process.h +++ b/apps/cpp_rpc/win32_process.h @@ -23,8 +23,12 @@ */ #ifndef TVM_APPS_CPP_RPC_WIN32_PROCESS_H_ #define TVM_APPS_CPP_RPC_WIN32_PROCESS_H_ + #include #include + +#include "../../src/support/socket.h" + namespace tvm { namespace runtime { /*! 
@@ -41,4 +45,4 @@ void SpawnRPCChild(SOCKET fd, std::chrono::seconds timeout); void ChildProcSocketHandler(const std::string& mmap_path); } // namespace runtime } // namespace tvm -#endif // TVM_APPS_CPP_RPC_WIN32_PROCESS_H_ \ No newline at end of file +#endif // TVM_APPS_CPP_RPC_WIN32_PROCESS_H_ diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index 53104542417e..d15c6151edc5 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -233,6 +233,14 @@ void CodeGenCPU::AddMainFunction(const std::string& entry_func_name) { #else global->setAlignment(1); #endif + // comdat is needed for windows select any linking to work + // set comdat to Any(weak linking) + if (target_machine_->getTargetTriple().isOSWindows()) { + llvm::Comdat* comdat = module_->getOrInsertComdat(runtime::symbol::tvm_module_main); + comdat->setSelectionKind(llvm::Comdat::Any); + global->setComdat(comdat); + } + global->setInitializer(llvm::ConstantDataArray::getString(*ctx_, entry_func_name)); global->setDLLStorageClass(llvm::GlobalVariable::DLLExportStorageClass); } @@ -358,6 +366,13 @@ llvm::GlobalVariable* CodeGenCPU::InitContextPtr(llvm::Type* p_type, std::string #endif gv->setInitializer(llvm::Constant::getNullValue(p_type)); gv->setDLLStorageClass(llvm::GlobalValue::DLLStorageClassTypes::DLLExportStorageClass); + // comdat is needed for windows select any linking to work + // set comdat to Any(weak linking) + if (target_machine_->getTargetTriple().isOSWindows()) { + llvm::Comdat* comdat = module_->getOrInsertComdat(name); + comdat->setSelectionKind(llvm::Comdat::Any); + gv->setComdat(comdat); + } return gv; } From 768726926e857680df7115e3108773756046d7d0 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 17 Oct 2020 05:11:37 -0700 Subject: [PATCH 016/258] Resolve more warnings in msvc (#6702) --- src/support/socket.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/support/socket.h b/src/support/socket.h index f38918feb8cb..571b1503072a 100644 --- a/src/support/socket.h +++ b/src/support/socket.h @@ -26,10 +26,14 @@ #define TVM_SUPPORT_SOCKET_H_ #if defined(_WIN32) + +#ifndef NOMINMAX #define NOMINMAX +#endif + #include #include -#undef NOMINMAX + using ssize_t = int; #ifdef _MSC_VER #pragma comment(lib, "Ws2_32.lib") From 9b35e3bacb91094b4de46960c7e834b81b479bb3 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Sat, 17 Oct 2020 17:26:50 -0700 Subject: [PATCH 017/258] Add cloudpickle dependency to docker images (#6701) --- docker/Dockerfile.ci_i386 | 4 +++ docker/install/ubuntu_install_cmake_source.sh | 32 +++++++++++++++++++ .../install/ubuntu_install_python_package.sh | 2 +- 3 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 docker/install/ubuntu_install_cmake_source.sh diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386 index a7d8308d4810..73ca50e3eb1e 100644 --- a/docker/Dockerfile.ci_i386 +++ b/docker/Dockerfile.ci_i386 @@ -21,6 +21,7 @@ FROM ioft/i386-ubuntu:16.04 RUN apt-get update --fix-missing +RUN apt-get install -y ca-certificates COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh @@ -31,6 +32,9 @@ RUN bash /install/ubuntu_install_llvm.sh COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh RUN bash /install/ubuntu_install_python.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu_install_python_package.sh 
/install/ubuntu_install_python_package.sh RUN bash /install/ubuntu_install_python_package.sh diff --git a/docker/install/ubuntu_install_cmake_source.sh b/docker/install/ubuntu_install_cmake_source.sh new file mode 100644 index 000000000000..f818fba9721b --- /dev/null +++ b/docker/install/ubuntu_install_cmake_source.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +v=3.13 +version=3.13.5 +wget https://cmake.org/files/v${v}/cmake-${version}.tar.gz +tar xvf cmake-${version}.tar.gz +cd cmake-${version} +./bootstrap +make -j$(nproc) +make install +cd .. +rm -rf cmake-${version} cmake-${version}.tar.gz diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index 2b8df74dab7b..2ed14c273678 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -21,4 +21,4 @@ set -u set -o pipefail # install libraries for python package on ubuntu -pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest mypy orderedset attrs requests Pillow packaging +pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest mypy orderedset attrs requests Pillow packaging cloudpickle From acc7b73755ee3438cc55346b09f605e9c5862f61 Mon Sep 17 00:00:00 2001 From: Robert Kimball Date: Sat, 17 Oct 2020 17:42:38 -0700 Subject: [PATCH 018/258] Refactor diagnostic to avoid circular dependencies (#6692) --- include/tvm/ir/diagnostic.h | 48 +------------------------------ include/tvm/ir/type_relation.h | 1 + include/tvm/relay/analysis.h | 2 +- include/tvm/runtime/object.h | 16 +++++------ include/tvm/support/logging.h | 46 +++++++++++++++++++++++++++++ src/ir/diagnostic.cc | 11 ++----- src/parser/parser.cc | 2 +- src/parser/span_check.h | 2 +- src/relay/analysis/well_formed.cc | 2 +- src/relay/op/nn/convolution.h | 2 +- 10 files changed, 63 insertions(+), 69 deletions(-) diff --git a/include/tvm/ir/diagnostic.h b/include/tvm/ir/diagnostic.h index 6b9807487bae..2a2a6cd4e867 100644 --- a/include/tvm/ir/diagnostic.h +++ b/include/tvm/ir/diagnostic.h @@ -21,68 +21,22 @@ * \file diagnostic.h * \brief A new diagnostic interface for TVM error reporting. * - * A prototype of the new diagnostic reporting interface for TVM. - * - * Eventually we hope to promote this file to the top-level and - * replace the existing errors.h. 
*/ #ifndef TVM_IR_DIAGNOSTIC_H_ #define TVM_IR_DIAGNOSTIC_H_ #include -#include #include -#include -#include -#include -#include +#include #include -#include -#include namespace tvm { using tvm::parser::SourceMap; using tvm::runtime::TypedPackedFunc; -extern const char* kTVM_INTERNAL_ERROR_MESSAGE; - -#define ICHECK_INDENT " " - -#define ICHECK_BINARY_OP(name, op, x, y) \ - if (dmlc::LogCheckError _check_err = dmlc::LogCheck##name(x, y)) \ - dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ - << ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *(_check_err.str) << ": " - -#define ICHECK(x) \ - if (!(x)) \ - dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT << "Check failed: " #x << " == false: " - -#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y) -#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y) -#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y) -#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y) -#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y) -#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y) -#define ICHECK_NOTNULL(x) \ - ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << kTVM_INTERNAL_ERROR_MESSAGE << __INDENT << "Check not null: " #x \ - << ' ', \ - (x) : (x)) // NOLINT(*) - -/*! \brief The diagnostic level, controls the printing of the message. */ -enum class DiagnosticLevel : int { - kBug = 10, - kError = 20, - kWarning = 30, - kNote = 40, - kHelp = 50, -}; - class DiagnosticBuilder; /*! \brief A compiler diagnostic. */ diff --git a/include/tvm/ir/type_relation.h b/include/tvm/ir/type_relation.h index 83323b01e419..462588006c9b 100644 --- a/include/tvm/ir/type_relation.h +++ b/include/tvm/ir/type_relation.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace tvm { diff --git a/include/tvm/relay/analysis.h b/include/tvm/relay/analysis.h index 26e5a65ddb5e..5dd837038731 100644 --- a/include/tvm/relay/analysis.h +++ b/include/tvm/relay/analysis.h @@ -24,12 +24,12 @@ #ifndef TVM_RELAY_ANALYSIS_H_ #define TVM_RELAY_ANALYSIS_H_ -#include #include #include #include #include #include +#include #include #include diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h index e6ca832c70c2..b5cf77d590f6 100644 --- a/include/tvm/runtime/object.h +++ b/include/tvm/runtime/object.h @@ -23,8 +23,8 @@ #ifndef TVM_RUNTIME_OBJECT_H_ #define TVM_RUNTIME_OBJECT_H_ -#include #include +#include #include #include @@ -153,9 +153,9 @@ struct TypeIndex { * ObjectRef leaf_ref(make_object()); * // cast to a specific instance * const LeafObj* leaf_ptr = leaf_ref.as(); - * CHECK(leaf_ptr != nullptr); + * ICHECK(leaf_ptr != nullptr); * // can also cast to the base class. 
- * CHECK(leaf_ref.as() != nullptr); + * ICHECK(leaf_ref.as() != nullptr); * } * * \endcode @@ -756,7 +756,7 @@ struct ObjectPtrEqual { */ #define TVM_DEFINE_OBJECT_REF_COW_METHOD(ObjectName) \ ObjectName* CopyOnWrite() { \ - CHECK(data_ != nullptr); \ + ICHECK(data_ != nullptr); \ if (!data_.unique()) { \ auto n = make_object(*(operator->())); \ ObjectPtr(std::move(n)).swap(data_); \ @@ -845,7 +845,7 @@ inline RefType GetRef(const ObjType* ptr) { static_assert(std::is_base_of::value, "Can only cast to the ref of same container type"); if (!RefType::_type_is_nullable) { - CHECK(ptr != nullptr); + ICHECK(ptr != nullptr); } return RefType(ObjectPtr(const_cast(static_cast(ptr)))); } @@ -860,12 +860,12 @@ inline ObjectPtr GetObjectPtr(ObjType* ptr) { template inline SubRef Downcast(BaseRef ref) { if (ref.defined()) { - CHECK(ref->template IsInstance()) + ICHECK(ref->template IsInstance()) << "Downcast from " << ref->GetTypeKey() << " to " << SubRef::ContainerType::_type_key << " failed."; } else { - CHECK(SubRef::_type_is_nullable) << "Downcast from nullptr to not nullable reference of " - << SubRef::ContainerType::_type_key; + ICHECK(SubRef::_type_is_nullable) << "Downcast from nullptr to not nullable reference of " + << SubRef::ContainerType::_type_key; } return SubRef(std::move(ref.data_)); } diff --git a/include/tvm/support/logging.h b/include/tvm/support/logging.h index c318b89e5c51..4322435c06b0 100644 --- a/include/tvm/support/logging.h +++ b/include/tvm/support/logging.h @@ -24,6 +24,8 @@ #ifndef TVM_SUPPORT_LOGGING_H_ #define TVM_SUPPORT_LOGGING_H_ +#include + // a technique that enables overriding macro names on the number of parameters. This is used // to define other macros below #define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME @@ -109,4 +111,48 @@ #define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false) #define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false) +namespace tvm { + +constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE = + "\n---------------------------------------------------------------\n" + "An internal invariant was violated during the execution of TVM.\n" + "Please read TVM's error reporting guidelines.\n" + "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n" + "---------------------------------------------------------------\n"; + +#define ICHECK_INDENT " " + +#define ICHECK_BINARY_OP(name, op, x, y) \ + if (dmlc::LogCheckError _check_err = dmlc::LogCheck##name(x, y)) \ + dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ + << kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ + << ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *(_check_err.str) << ": " + +#define ICHECK(x) \ + if (!(x)) \ + dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ + << kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT << "Check failed: " #x << " == false: " + +#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y) +#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y) +#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y) +#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y) +#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y) +#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y) +#define ICHECK_NOTNULL(x) \ + ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ + << kTVM_INTERNAL_ERROR_MESSAGE << __INDENT << "Check not null: " #x \ + << ' ', \ + (x) : (x)) // NOLINT(*) + +/*! \brief The diagnostic level, controls the printing of the message. 
*/ +enum class DiagnosticLevel : int { + kBug = 10, + kError = 20, + kWarning = 30, + kNote = 40, + kHelp = 50, +}; + +} // namespace tvm #endif // TVM_SUPPORT_LOGGING_H_ diff --git a/src/ir/diagnostic.cc b/src/ir/diagnostic.cc index ceadf78e2cfc..148831dc3ab6 100644 --- a/src/ir/diagnostic.cc +++ b/src/ir/diagnostic.cc @@ -18,8 +18,8 @@ */ /*! - * \file src/ir/transform.cc - * \brief Infrastructure for transformation passes. + * \file src/ir/diagnostic.cc + * \brief Implementation of DiagnosticContext and friends. */ #include #include @@ -30,13 +30,6 @@ namespace tvm { using tvm::parser::Source; -const char* kTVM_INTERNAL_ERROR_MESSAGE = - "\n---------------------------------------------------------------\n" - "An internal invariant was violated during the execution of TVM.\n" - "Please read TVM's error reporting guidelines.\n" - "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n" - "---------------------------------------------------------------\n"; - // failed to check to argument arg0.dims[0] != 0 /* Diagnostic */ diff --git a/src/parser/parser.cc b/src/parser/parser.cc index 7dc55b0b519a..9c9965ca588f 100644 --- a/src/parser/parser.cc +++ b/src/parser/parser.cc @@ -21,7 +21,6 @@ * \file parser.cc * \brief A parser for TVM IR. */ -#include #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include diff --git a/src/parser/span_check.h b/src/parser/span_check.h index b9ba76df4b8f..9a887474fe67 100644 --- a/src/parser/span_check.h +++ b/src/parser/span_check.h @@ -25,13 +25,13 @@ #ifndef TVM_PARSER_SPAN_CHECK_H_ #define TVM_PARSER_SPAN_CHECK_H_ -#include #include #include #include #include #include #include +#include #include #include diff --git a/src/relay/analysis/well_formed.cc b/src/relay/analysis/well_formed.cc index 5abbbc94fb36..0b6e043a0d21 100644 --- a/src/relay/analysis/well_formed.cc +++ b/src/relay/analysis/well_formed.cc @@ -21,10 +21,10 @@ * \file well_formed.cc * \brief check that expression is well formed. 
*/ -#include #include #include #include +#include #include diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index cd334d7269ab..935058c1a5b3 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -24,7 +24,7 @@ #ifndef TVM_RELAY_OP_NN_CONVOLUTION_H_ #define TVM_RELAY_OP_NN_CONVOLUTION_H_ -#include +#include #include #include From ce34c4590755db226d2522a7e902902c7025bd5a Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 18 Oct 2020 06:09:17 -0700 Subject: [PATCH 019/258] [TEST] Address flaky error in test_any (#6705) --- tests/python/relay/test_any.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index c445cd194400..872728514c3e 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -58,7 +58,7 @@ def check_result( if flatten: result = result.flatten() expected = expected.flatten() - tvm.testing.assert_allclose(result, expected) + tvm.testing.assert_allclose(result, expected, atol=2e-6) def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op): From fdf5474d8b5d9d32d0b52f21a0c362e585a4ab1a Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sun, 18 Oct 2020 15:27:46 -0700 Subject: [PATCH 020/258] [Frontend][Relay] Fix MXNet frontend to support NLP backbones in GluonNLP (#6699) * update Update type_relations.cc Update transform.cc Update transform.cc Update transform.cc Update transform.cc Update transform.cc Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py Update mxnet.py update Update mxnet.py debug Update generic.py Update topi_integration.py fix bug update Update test_forward.py Update test_forward.py fix test case Update mxnet.py update Update mxnet.py Update mxnet.py Update test_forward.py Update mxnet.py Update mxnet.py Update test_forward.py Update mxnet.py Update mxnet.py Update mxnet.py debug Update mxnet.py Update mxnet.py Update test_forward.py Update mxnet.py * address comments * Update mxnet.py * Update mxnet.py * fix * improve where test * Update test_forward.py * Update test_forward.py * Update test_forward.py * update * Update mxnet.py * Update mxnet.py * Update mxnet.py debug Update common.py update Update mxnet.py update Update test_forward.py Update test_forward.py * update * fix lint * Update mxnet.py * Update test_op_level1.py * fix lint --- python/tvm/relay/frontend/mxnet.py | 133 +++++++++++++++----- python/tvm/topi/x86/batch_matmul.py | 12 +- tests/python/frontend/mxnet/test_forward.py | 30 +++-- tests/python/relay/test_op_level1.py | 2 +- 4 files changed, 134 insertions(+), 43 deletions(-) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 984945f71868..a543f78bd949 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -790,6 +790,16 @@ def _mx_dot(inputs, attrs): def _mx_batch_dot(inputs, attrs): assert len(inputs) == 2 a, b = inputs + a_shape = _infer_type(a).checked_type.shape + batch_shapes = None + if len(a_shape) > 3: + batch_shapes = a_shape[:-2] + a = _op.reverse_reshape(a, newshape=(-1, 0, 0)) + b_shape = _infer_type(b).checked_type.shape + if len(b_shape) > 3: + if batch_shapes is None: + batch_shapes = b_shape[:-2] + b = _op.reverse_reshape(b, newshape=(-1, 0, 
0)) transpose_a = attrs.get_bool("transpose_a", False) transpose_b = attrs.get_bool("transpose_b", False) if transpose_a is True: @@ -797,7 +807,10 @@ def _mx_batch_dot(inputs, attrs): raise tvm.error.OpAttributeInvalid(msg.format(transpose_a)) if transpose_b is False: b = _op.transpose(b, axes=[0, 2, 1]) - return _op.nn.batch_matmul(a, b) + out = _op.nn.batch_matmul(a, b) + if batch_shapes is not None: + out = _op.reverse_reshape(out, newshape=tuple(batch_shapes) + (0, 0)) + return out def _mx_arange(inputs, attrs): @@ -2294,18 +2307,16 @@ def _mx_npi_pad(inputs, attrs): raise tvm.error.OpAttributeRequired('Attribute "mode" not found in operator pad.') if pad_mode not in ["constant", "edge", "reflect"]: raise tvm.error.OpAttributeInvalid("Value " + mode + ' in attribute "mode" is not valid') - pad_width = attrs.get_int_tuple("pad_width", None) - if pad_width is None: + if "pad_width" not in attrs.attrs: raise tvm.error.OpAttributeRequired('Attribute "pad_width" not found in operator pad.') - if None in pad_width: - raise tvm.error.OpAttributeInvalid( - 'Value None in attribute "pad_width" of operator Slice is not valid.' - ) + # Begin to parse tuple of tuple, we cannot use get_int_tuple here because it's a tuple of tuple. + pad_width = attrs.attrs["pad_width"] + pad_width = pad_width.replace("(", "[") + pad_width = pad_width.replace(")", "]") + pad_width = json.loads(pad_width) constant_values = attrs.get_float("constant_values", 0.0) - padding = tuple(tuple((b, a)) for b, a in zip(pad_width[::2], pad_width[1::2])) - return _op.nn.pad( - data=inputs[0], pad_width=padding, pad_value=constant_values, pad_mode=pad_mode + data=inputs[0], pad_width=pad_width, pad_value=constant_values, pad_mode=pad_mode ) @@ -2321,24 +2332,74 @@ def _mx_npx_reshape(inputs, attrs): shape = attrs.get_int_tuple("newshape") reverse = attrs.get_bool("reverse", False) shape_list = list(shape) - new_shape_list = [] - for num in shape_list: - if num > 0 or num == -1: - new_shape_list.append(num) - elif num == -2: - new_shape_list.append(0) - elif num == -4: - new_shape_list.append(-2) - elif num == -5: - new_shape_list.append(-3) - elif num == -6: - new_shape_list.append(-4) + old_shape = get_const_tuple(_infer_type(inputs[0]).checked_type.shape) + new_shape = [] + if reverse: + old_shape = old_shape[::-1] + shape_list = shape_list[::-1] + ptr = 0 + unknown_axis = None + src_ptr = 0 + while src_ptr < len(shape_list): + ele = shape_list[src_ptr] + src_ptr += 1 + if ele > 0: + new_shape.append(ele) + ptr += 1 + elif ele == -1: + new_shape.append(-1) + if unknown_axis is not None: + raise tvm.error.OpAttributeInvalid("Can only have one -1 in the input shape.") + unknown_axis = len(new_shape) + ptr += 1 + elif ele == -2: + new_shape.append(old_shape[ptr]) + ptr += 1 + elif ele == -3: + if old_shape[ptr] != 1: + raise tvm.error.OpAttributeInvalid( + "Dimension of the original shape " + "that corresponds to -3 must be 1. Received" + " {}".format(old_shape[ptr]) + ) + ptr += 1 + elif ele == -4: + new_shape += old_shape[ptr:] + break + elif ele == -5: + new_shape.append(old_shape[ptr] * old_shape[ptr + 1]) + ptr += 2 + elif ele == -6: + # Split axis + lhs = shape_list[src_ptr] + rhs = shape_list[src_ptr + 1] + src_ptr += 2 + if lhs == -1 and rhs == -1: + raise tvm.error.OpAttributeInvalid("The lhs and rhs can not both be -1.") + if lhs == -1: + if old_shape[ptr] % rhs != 0: + raise tvm.error.OpAttributeInvalid( + "When splitting the axis, " + "the dimension of the split axis must " + "be divisible by the splitted values." 
+ ) + lhs = old_shape[ptr] // rhs + if rhs == -1: + if old_shape[ptr] % lhs != 0: + raise tvm.error.OpAttributeInvalid( + "When splitting the axis, " + "the dimension of the split axis must " + "be divisible by the splitted values." + ) + rhs = old_shape[ptr] // lhs + new_shape.append(lhs) + new_shape.append(rhs) + ptr += 1 else: - raise tvm.error.OpAttributeInvalid("Shape dimension %d is not supported" % num) - shape = tuple(new_shape_list) + raise tvm.error.OpAttributeInvalid("Shape dimension %d is not supported" % ele) if reverse: - return _op.reverse_reshape(inputs[0], newshape=shape) - return _op.reshape(inputs[0], newshape=shape) + new_shape = new_shape[::-1] + return _op.reshape(inputs[0], newshape=new_shape) def _mx_split_v2(inputs, attrs): @@ -2356,12 +2417,21 @@ def _mx_split_v2(inputs, attrs): def _mx_npi_where_rscalar(inputs, attrs): + cond, dat = inputs scalar = attrs.get_float("scalar") - dtype = _infer_type(inputs[1]).checked_type.dtype + cond_shape = get_const_tuple(_infer_type(cond).checked_type.shape) + dat_shape = get_const_tuple(_infer_type(dat).checked_type.shape) + dtype = _infer_type(dat).checked_type.dtype + # Check for broadcasting + out_shape = np.broadcast(np.empty(cond_shape), np.empty(dat_shape)).shape + if out_shape != cond_shape: + cond = _op.broadcast_to(cond, out_shape) + if out_shape != dat_shape: + dat = _op.broadcast_to(dat, out_shape) scalar = _expr.const(scalar, dtype=dtype) - ones = _op.ones_like(inputs[1]) + ones = _op.ones_like(dat) scalar = _op.multiply(ones, scalar) - return _op.where(inputs[0], inputs[1], scalar) + return _op.where(cond, dat, scalar) # Note: due to attribute conversion constraint @@ -2382,13 +2452,13 @@ def _mx_npi_where_rscalar(inputs, attrs): "reshape_like", "zeros_like", "ones_like", - "where", "cos", "cosh", "sin", "sinh", "tan", "tanh", + "where", ] _convert_map = { @@ -2609,6 +2679,7 @@ def _mx_npi_where_rscalar(inputs, attrs): "_npi_concatenate": _mx_npi_concatenate, "_npx_reshape": _mx_npx_reshape, "_np_copy": _rename(_op.copy), + "_npi_copy": _rename(_op.copy), "_npi_power": _rename(_op.power), "_npi_power_scalar": _binop_scalar(_op.power), "_npi_multiply": _rename(_op.multiply), @@ -2617,6 +2688,7 @@ def _mx_npi_where_rscalar(inputs, attrs): "_npi_add_scalar": _binop_scalar(_op.add), "_npi_where_rscalar": _mx_npi_where_rscalar, "_npi_less": _rename(_op.less), + "_npi_less_equal": _mx_compare(_op.less_equal, _rename), "_npi_tanh": _rename(_op.tanh), "_npi_true_divide_scalar": _binop_scalar(_op.divide), } @@ -2728,7 +2800,6 @@ def _from_mxnet_impl(symbol, shape_dict, dtype_info, params=None, mod=None): else: raise RuntimeError("unexpected type %s" % type(res)) node_map[nid] = res - outputs = [node_map[e[0]][e[1]] for e in jgraph["heads"]] outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs) func = _function.Function(analysis.free_vars(outputs), outputs) diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index e3f08160509e..4e5f6efc815a 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ b/python/tvm/topi/x86/batch_matmul.py @@ -37,6 +37,9 @@ def batch_matmul(cfg, x, y, out_shape=None): 3-D with shape [batch, M, K] y : tvm.te.Tensor 3-D with shape [batch, N, K] + out_shape : tuple or None + Shape of the outputs + Returns ------- output : tvm.te.Tensor @@ -135,7 +138,7 @@ def _default_batch_matmul_config(cfg, M, N, K): @autotvm.register_topi_compute("batch_matmul_cblas.x86") -def batch_matmul_cblas(cfg, x, y): +def batch_matmul_cblas(cfg, x, y, out_shape=None): """Computes 
batch matrix multiplication of `x` and `y` when `x` and `y` are data in batch. @@ -147,6 +150,9 @@ def batch_matmul_cblas(cfg, x, y): 3-D with shape [batch, M, K] y : tvm.te.Tensor 3-D with shape [batch, N, K] + out_shape : tuple or None + Shape of the output + Returns ------- output : tvm.te.Tensor @@ -157,6 +163,10 @@ def batch_matmul_cblas(cfg, x, y): YB, N, YK = get_const_tuple(y.shape) assert XB == YB, "batch dimension doesn't match" assert XK == YK, "shapes of x and y is inconsistant" + if out_shape is not None: + assert out_shape[0] == XB, "got invalid output shape" + assert out_shape[1] == M, "got invalid output shape" + assert out_shape[2] == N, "got invalid output shape" cfg.add_flop(XB * M * N * XK * 2) return cblas.batch_matmul(x, y, False, True) diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 44307f4e60fe..79c587fc7f9e 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -1932,7 +1932,10 @@ def verify(data_shape, axis, use_length, length): @pytest.mark.skipif(not hasattr(mx.sym.np, "pad"), reason="mx.sym.np.pad hasn't been publish yet") @pytest.mark.parametrize( "data_shape, pad_width", - [((1, 1, 3, 5), (0, 0, 0, 0, 1, 2, 3, 4)), ((1, 1, 3, 5, 7), (0, 0, 0, 0, 1, 2, 3, 4, 5, 6))], + [ + ((1, 1, 3, 5), ((0, 0), (0, 0), (1, 2), (3, 4))), + ((1, 1, 3, 5, 7), ((0, 0), (0, 0), (1, 2), (3, 4), (5, 6))), + ], ) @pytest.mark.parametrize("mode", ["constant", "edge", "reflect"]) @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32"]) @@ -1943,19 +1946,17 @@ def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value, tar data_np = np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") if mode == "constant": - ref_res = mx.ndarray.pad( - mx.nd.array(data_np), mode=mode, pad_width=pad_width, constant_value=constant_value - ) + ref_res = np.pad(data_np, mode=mode, pad_width=pad_width, constant_values=constant_value) mx_sym = mx.sym.np.pad( data.as_np_ndarray(), mode=mode, pad_width=pad_width, constant_values=constant_value ) else: - ref_res = mx.ndarray.pad(mx.nd.array(data_np), mode=mode, pad_width=pad_width) + ref_res = np.pad(data_np, mode=mode, pad_width=pad_width) mx_sym = mx.sym.np.pad(data.as_np_ndarray(), mode=mode, pad_width=pad_width) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(data_np) - tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @pytest.mark.skipif( @@ -2029,8 +2030,12 @@ def test_forward_np_copy(data_shape, dtype, target, ctx, kind): ((2, 3, 8), (-2, -2, 2, -1), False), ((8, 3, 3, 3, 4, 4), (-6, 2, -1, -4), False), ((8, 3, 3, 3, 4, 4), (-5, -4), False), + ((1, 8, 3, 3, 3, 4, 4), (-3, -5, -4), False), + ((8, 1, 3, 4), (-2, -3, -1), False), ((8, 3, 3, 3, 3, 8), (-4, -5), True), ((8, 3, 2, 4, 8), (-4, -1, 2, -6), True), + ((3, 2, 4, 8, 1, 1), (-4, -1, 2, -6, -5, -3), True), + ((2, 4, 1, 8), (-4, -3, -1, 2, -6), True), ], ) def test_forward_npx_reshape(data_shape, out_shape, dtype, target, reverse, ctx, kind): @@ -2117,16 +2122,21 @@ def test_forward_npi_tanh(data_shape, dtype, target, ctx, kind): @pytest.mark.skipif(not hasattr(mx.np, "where"), reason="mx.np.where hasn't been publish yet") -@pytest.mark.parametrize("data_shape", [(2, 2, 2), (2, 7, 2), (1, 8), (2, 2), (1, 3)]) 
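Editor's note (a small illustration, not part of the patch): the shape arithmetic behind the `_mx_npi_where_rscalar` change above relies on `np.broadcast`, which computes the broadcast result shape from two cheaply-constructed dummy arrays without materializing the broadcast result, letting the converter decide whether `cond` or `dat` needs an explicit `broadcast_to`:

    import numpy as np

    cond_shape, dat_shape = (7, 2), (2, 7, 2)
    # np.broadcast builds only an iterator object, never the full output array
    out_shape = np.broadcast(np.empty(cond_shape), np.empty(dat_shape)).shape
    assert out_shape == (2, 7, 2)
    # cond_shape != out_shape, so the converter inserts broadcast_to(cond, out_shape)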
+@pytest.mark.parametrize( + "data_shape,cond_shape", + [[(2, 2, 2), (2, 2, 2)], [(2, 7, 2), (7, 2)], [(2, 2), (1, 2)], [(1, 3), (3, 3)]], +) @pytest.mark.parametrize("data_dtype", ["float64", "float32", "int64", "int32", "bool"]) @pytest.mark.parametrize("cond_dtype", ["float64", "float32", "int64", "int32", "bool"]) @pytest.mark.parametrize("scalar", [1.0, 2.0]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_where_rscalar(data_shape, cond_dtype, data_dtype, scalar, target, ctx, kind): +def test_forward_npi_where_rscalar( + data_shape, cond_shape, data_dtype, cond_dtype, scalar, target, ctx, kind +): if data_dtype == "bool": scalar = scalar == 0.0 - cond_np = np.random.uniform(size=data_shape).astype(cond_dtype) + cond_np = np.random.uniform(size=cond_shape).astype(cond_dtype) data_np = np.random.uniform(size=data_shape).astype(data_dtype) cond = mx.sym.var("condition") data = mx.sym.var("x") @@ -2136,7 +2146,7 @@ def test_forward_npi_where_rscalar(data_shape, cond_dtype, data_dtype, scalar, t dtypeDic["condition"] = cond_dtype dtypeDic["x"] = data_dtype mod, _ = relay.frontend.from_mxnet( - mx_sym, shape={"condition": data_shape, "x": data_shape}, dtype=dtypeDic + mx_sym, shape={"condition": cond_shape, "x": data_shape}, dtype=dtypeDic ) intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(cond_np, data_np) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 8c724daaa9d0..37a59c30f410 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -134,7 +134,7 @@ def check_binary_op(opfunc, ref, dtype): continue intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data, y_data) - np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) + np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01, atol=1e-3) for opfunc, ref in [ (relay.add, np.add), From 19ec8f5b38240987fc351c9bf1e9f17edbee0e72 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 18 Oct 2020 15:48:11 -0700 Subject: [PATCH 021/258] [PYTHON][WINDOWS] More robust dll loading behavior after python3.8 (#6707) The dll search directories need to be manually added by os.add_dll_directory after python3.8. --- python/tvm/_ffi/base.py | 5 +++++ python/tvm/_ffi/libinfo.py | 39 +++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py index df220ae1111f..397090618ade 100644 --- a/python/tvm/_ffi/base.py +++ b/python/tvm/_ffi/base.py @@ -49,6 +49,11 @@ def _py_str(x): def _load_lib(): """Load libary by searching possible path.""" lib_path = libinfo.find_lib_path() + # The dll search path need to be added explicitly in + # windows after python 3.8 + if sys.platform.startswith("win32") and sys.version_info >= (3, 8): + for path in libinfo.get_dll_directories(): + os.add_dll_directory(path) lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_GLOBAL) lib.TVMGetLastError.restype = ctypes.c_char_p return lib, os.path.basename(lib_path[0]) diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py index ae3cba6e3dd7..28614d072f01 100644 --- a/python/tvm/_ffi/libinfo.py +++ b/python/tvm/_ffi/libinfo.py @@ -40,23 +40,8 @@ def split_env_var(env_var, split): return [] -def find_lib_path(name=None, search_path=None, optional=False): - """Find dynamic library files. 
- - Parameters - ---------- - name : list of str - List of names to be found. - - Returns - ------- - lib_path : list(string) - List of all found path to the libraries - """ - use_runtime = os.environ.get("TVM_USE_RUNTIME_LIB", False) - - # See https://github.com/apache/incubator-tvm/issues/281 for some background. - +def get_dll_directories(): + """Get the possible dll directories""" # NB: This will either be the source directory (if TVM is run # inplace) or the install directory (if TVM is installed). # An installed TVM's curr_path will look something like: @@ -94,11 +79,31 @@ def find_lib_path(name=None, search_path=None, optional=False): dll_path.append(os.path.join(source_dir, "web", "dist")) dll_path = [os.path.realpath(x) for x in dll_path] + return [x for x in dll_path if os.path.isdir(x)] + + +def find_lib_path(name=None, search_path=None, optional=False): + """Find dynamic library files. + + Parameters + ---------- + name : list of str + List of names to be found. + + Returns + ------- + lib_path : list(string) + List of all found path to the libraries + """ + use_runtime = os.environ.get("TVM_USE_RUNTIME_LIB", False) + dll_path = get_dll_directories() + if search_path is not None: if isinstance(search_path, list): dll_path = dll_path + search_path else: dll_path.append(search_path) + if name is not None: if isinstance(name, list): lib_dll_path = [] From a82f52e239294778bf0c794123cc88d6fc758ca2 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 18 Oct 2020 23:55:40 -0700 Subject: [PATCH 022/258] [AutoScheduler] Add task scheduler (#6663) * Add task scheduler * fix lint * fix tests * fix tests * fix tests * fix test cases * fix test cases * fix tests * address comments --- include/tvm/auto_scheduler/measure.h | 22 +- include/tvm/auto_scheduler/search_policy.h | 31 +- python/tvm/auto_scheduler/__init__.py | 2 + python/tvm/auto_scheduler/auto_schedule.py | 11 +- .../auto_scheduler/cost_model/xgb_model.py | 6 +- python/tvm/auto_scheduler/measure.py | 27 ++ python/tvm/auto_scheduler/search_policy.py | 51 ++- python/tvm/auto_scheduler/task_scheduler.py | 422 ++++++++++++++++++ python/tvm/auto_scheduler/utils.py | 47 ++ src/auto_scheduler/auto_schedule.cc | 4 +- src/auto_scheduler/feature.cc | 4 +- src/auto_scheduler/measure.cc | 22 +- .../search_policy/empty_policy.cc | 39 +- .../search_policy/empty_policy.h | 14 +- .../search_policy/search_policy.cc | 9 +- .../search_policy/sketch_policy.cc | 67 ++- .../search_policy/sketch_policy.h | 13 +- .../search_policy/sketch_policy_rules.cc | 9 +- .../search_policy/sketch_policy_rules.h | 3 +- .../test_auto_scheduler_task_scheduler.py | 112 +++++ .../auto_scheduler/tune_conv2d_layer_cuda.py | 7 +- tutorials/auto_scheduler/tune_matmul_x86.py | 7 +- 22 files changed, 819 insertions(+), 110 deletions(-) create mode 100644 python/tvm/auto_scheduler/task_scheduler.py create mode 100644 tests/python/unittest/test_auto_scheduler_task_scheduler.py diff --git a/include/tvm/auto_scheduler/measure.h b/include/tvm/auto_scheduler/measure.h index 349f4f8c7d51..339f42896b66 100755 --- a/include/tvm/auto_scheduler/measure.h +++ b/include/tvm/auto_scheduler/measure.h @@ -423,7 +423,7 @@ class RPCRunner : public ProgramRunner { /*! * \brief Measurer that measures the time costs of tvm programs - * This class combines ProgramBuilder and ProgramRunner and provides a simpler API */ + * This class combines ProgramBuilder and ProgramRunner, and provides a simpler API */ class ProgramMeasurerNode : public Object { public: /*! \brief Measured programs counter. 
*/ @@ -444,7 +444,7 @@ class ProgramMeasurerNode : public Object { Optional> callbacks; /*! \brief Verbosity level. 0 for silent, 1 to output information during program measuring. */ int verbose; - /*! \brief The number of max continuous error. */ + /*! \brief The number of allowed maximum continuous error before forcely stopping the tuning */ int max_continuous_error; /*! \brief Reset book keeping variables */ @@ -454,13 +454,12 @@ class ProgramMeasurerNode : public Object { * \brief Do measurement. * \param task The current SearchTask. * \param policy The current SearchPolicy. - * \param inputs The MeasureInputs. - * \param results A pointer to a MeasureResult Array, this is used as output. + * \param inputs The inputs of measurement. * \param batch_size Number of programs to be measured in one batch. + * \return results The results of measurement. */ - void Measure(const SearchTask& task, const SearchPolicy& policy, - const Array& inputs, Array* results, - int batch_size = -1); + Array Measure(const SearchTask& task, const SearchPolicy& policy, + const Array& inputs, int batch_size = -1); /*! * \brief Do measurement silently. * This API will not print the measure results to screen. @@ -486,12 +485,13 @@ class ProgramMeasurer : public ObjectRef { public: /*! * \brief The constructor. - * \param builder The ProgramBuilder to build each program. - * \param runner The ProgramRunner to measure each program. - * \param callbacks MeasureCallback to be called after each measure batch. + * \param builder The ProgramBuilder to build programs. + * \param runner The ProgramRunner to measure programs. + * \param callbacks MeasureCallback to be called after each measurement batch. * \param verbose Verbosity level. 0 for silent, 1 to output information during program * measuring. - * \param max_continuous_error The number of allowed maximum continuous error. + * \param max_continuous_error The number of allowed maximum continuous error before + * forcely stopping the tuning. */ ProgramMeasurer(ProgramBuilder builder, ProgramRunner runner, Optional> callbacks, int verbose, diff --git a/include/tvm/auto_scheduler/search_policy.h b/include/tvm/auto_scheduler/search_policy.h index ddb0dd284875..e433799b7fa5 100755 --- a/include/tvm/auto_scheduler/search_policy.h +++ b/include/tvm/auto_scheduler/search_policy.h @@ -22,26 +22,6 @@ * \brief The base class of search policies, including the abstract definition of search policy and * other supporting data structures. * - * The basic schedule search process for the auto-scheduler is design to be: - * `Program sampling` -> `Performance Tuning`. - * - * In `Program sampling`, we use some predefined precise or heuristic rules to generate several - * initial schedules. Based on these initial starting points, we perform `Performance Tuning` which - * uses cost model based evolutionary search to select schedules with the best performance. - * - * Candidate schedules are measured against the specific hardware target. - * - * We intend to introduce different level of automation on the schedule generation process: - * - Level 0(the default level): For all kinds of ops/subgraphs, the search policy should be able - * to generate schedule automatically. - * - Level 1: For some complicated ops/subgraphs(e.g. conv2d windograd), the default search space - * of level 0 may be too large to find a high performance schedule efficiently. We provide some - * op attributes to help reduce the total search space, see `SearchPolicyKey` below for more - * information. 
- * - Level 2: For some further special ops/subgraphs, users may more likely to write their own - * template(just like AutoTVM). Search policy should be able to provide a flexible approach as - * well. - * * \note How to add a new search policy. * In design, there's no need for users to implement their own search policy, our formal search * policy(will be brought later) should be enough to cover most use cases. Meanwhile, a custom rule @@ -62,11 +42,13 @@ #ifndef TVM_AUTO_SCHEDULER_SEARCH_POLICY_H_ #define TVM_AUTO_SCHEDULER_SEARCH_POLICY_H_ +#include #include #include #include #include +#include #include namespace tvm { @@ -171,6 +153,15 @@ class SearchPolicyNode : public Object { virtual State Search(int num_measure_trials, int early_stopping, int num_measures_per_round, ProgramMeasurer measurer) = 0; + /*! + * \brief Continue the search by doing an additional search round. + * \param num_measure The number of measurements + * \param measurer The measurer to measure programs + * \return The measurement records for measurements in this search round + */ + virtual std::pair, Array> ContinueSearchOneRound( + int num_measure, ProgramMeasurer measurer) = 0; + /*! * \brief Preload measured states from a log file to resume the state of the search policy. * \param log_file The name of the record log file. diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py index 6a395e776257..99d96e893ba0 100644 --- a/python/tvm/auto_scheduler/__init__.py +++ b/python/tvm/auto_scheduler/__init__.py @@ -24,6 +24,7 @@ from . import measure_record from . import search_policy from . import search_task +from . import task_scheduler from . import utils from . import workload_registry @@ -42,4 +43,5 @@ from .measure_record import RecordToFile, RecordReader, load_best, load_records, save_records from .search_task import SearchTask from .search_policy import EmptyPolicy, SketchPolicy, PreloadMeasuredStates +from .task_scheduler import TaskScheduler from .workload_registry import register_workload, make_workload_key diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py index ca069bb0b4e9..a53c29d174d7 100644 --- a/python/tvm/auto_scheduler/auto_schedule.py +++ b/python/tvm/auto_scheduler/auto_schedule.py @@ -16,16 +16,7 @@ # under the License. """ -User interface for TVM Auto-scheduler. - -The basic schedule search process for TVM Auto-scheduler is designed to be: -`Program sampling` -> `Performance Tuning`. - -In `Program sampling`, we use some predefined precise or heuristic rules to generate several -initial schedules. Based on these initial starting points, we perform `Performance Tuning` which -uses cost model based evolutionary search to select schedules with the best performance. - -Candidate schedules are measured against the specific hardware target. +The user interface and tuning options of the TVM auto-scheduler. """ import tvm._ffi diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py index 3eb64df693f2..9a534aa96af5 100644 --- a/python/tvm/auto_scheduler/cost_model/xgb_model.py +++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py @@ -20,6 +20,7 @@ import multiprocessing import logging from collections import defaultdict +import time import numpy as np import xgboost as xgb @@ -76,7 +77,7 @@ def set(self, key, matrix, value): class XGBModel(PythonBasedModel): """Train a XGBoost model to predict the normalized throughputs of programs. 
Let the normalized throughput be the score of a program (higher is better). We predict - the (approximiate) score of a program = the sum of the scores of all stages in this program. + the (approximate) score of a program = the sum of the scores of all stages in this program. i.e. score(P) = score_s0 + score_s1 + ... + score_sn, where score_si is the score of Stage i in Program P. We extract feature for each stage and let the xgboost predict the score for each stage. @@ -128,6 +129,7 @@ def update(self, inputs, results): if len(inputs) <= 0: return assert len(inputs) == len(results) + tic = time.time() self.inputs.extend(inputs) self.results.extend(results) @@ -167,6 +169,8 @@ def update(self, inputs, results): ], ) + logger.info("XGBModel Training time: %.2f s", time.time() - tic) + def predict(self, task, states): """Predict the scores of states Parameters diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 81c314fb332a..8a8b92201d15 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -185,6 +185,33 @@ def run(self, measure_inputs, build_results, verbose=1): return _ffi_api.ProgramRunnerRun(self, measure_inputs, build_results, verbose) +@tvm._ffi.register_object("auto_scheduler.ProgramMeasurer") +class ProgramMeasurer(Object): + """ + Measurer that measures the time costs of tvm programs + This class combines ProgramBuilder and ProgramRunner, and provides a simpler API. + + Parameters + ---------- + builder : ProgramBuilder + The ProgramBuilder to build programs + runner : ProgramRunner + The ProgramRunner to measure programs. + callbacks : List[MeasureCallback] + Callbacks to be called after each measurement batch + verbose : int + The Verbosity level: 0 for silent, 1 to output information during program + max_continuous_error : Optional[int] + The number of allowed maximum continuous error before stop the tuning + """ + + def __init__(self, builder, runner, callbacks, verbose, max_continuous_error=None): + max_continuous_error = max_continuous_error or -1 # -1 means using the default value + self.__init_handle_by_constructor__( + _ffi_api.ProgramMeasurer, builder, runner, callbacks, verbose, max_continuous_error + ) + + @tvm._ffi.register_object("auto_scheduler.LocalBuilder") class LocalBuilder(ProgramBuilder): """LocalBuilder use local CPU cores to build programs in parallel. diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py index 5533aec8a5e9..f3d459e4a7d2 100644 --- a/python/tvm/auto_scheduler/search_policy.py +++ b/python/tvm/auto_scheduler/search_policy.py @@ -16,15 +16,17 @@ # under the License. """ -The search policies for TVM Auto-scheduler. +The search policies of TVM auto-scheduler. -This contains the strategies to generate a schedule automatically. We provide an EmptyPolicy -which always returns an unchanged initial state, and a more advanced SketchPolicy which can -deal with various ops/subgraphs on different target devices. +The auto-scheduler constructs a search space according to the compute declaration. +It then randomly samples programs from the search space and uses evolutionary search with a +learned cost model to fine tune the sampled programs. +The final optimized programs are sent to actual hardware for measurement. +The above process is repeated until the auto-scheduler runs out of time budget. Reference: L. Zheng, C. Jia, M. Sun, Z. Wu, C. Yu, et al. "Ansor : Generating High-Performance Tensor -Programs for Deep Learning." 
arXiv preprint arXiv:2006.06762 (2020). +Programs for Deep Learning." (OSDI 2020). """ import random @@ -63,11 +65,42 @@ def __init__(self, filename="auto_scheduler_tuning.json"): class SearchPolicy(Object): """ The base class of search policies. """ + def continue_search_one_round(self, num_measure, measurer): + """ + Continue the search by doing an additional search round. + + Parameters + ---------- + num_measure: int + The number of programs to measure in this round + measurer: ProgramMeasurer + The program measurer to measure programs + + Returns + ------- + inputs: List[MeasureInput] + The inputs of measurments in this search round + results: List[MeasureResult] + The results of measurments in this search round + """ + return _ffi_api.SearchPolicyContinueSearchOneRound(self, num_measure, measurer) + + def set_verbose(self, verbose): + """ + Set the verbosity level of the search policy. + + Parameters + ---------- + verbose: int + The verbosity level + """ + return _ffi_api.SearchPolicySetVerbose(self, verbose) + @tvm._ffi.register_object("auto_scheduler.EmptyPolicy") class EmptyPolicy(SearchPolicy): - """This is an example empty search policy which will always generate - the init state of ComputeDAG. + """A simple example of the search policy which always returns + the initial naive schedule (state). Parameters ---------- @@ -195,15 +228,17 @@ def sample_initial_population(self, pop_size): return states def evolutionary_search(self, init_populations, out_size): - """Evolutionary search. + """Perform evolutionary search. This python interface is mainly used for debugging and testing. The actual search is all done in c++. + Parameters ---------- init_populations: List[State] The initial population states out_size : int The size of generated states + Returns ------- states: List[State] diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py new file mode 100644 index 000000000000..e45573be61c6 --- /dev/null +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -0,0 +1,422 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name + +""" The task scheduler that allocates the time resources when tuning multiple tasks together + +The details of the "gradient" strategy below can be found in the section 6 of this paper: +L. Zheng, C. Jia, M. Sun, Z. Wu, C. Yu, et al. "Ansor : Generating High-Performance Tensor +Programs for Deep Learning." (OSDI 2020). 
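Editor's note (a numerical sketch, not part of the patch): the "gradient" strategy implemented later in this file treats the objective f as a function of the per-task best latencies g_i and approximates the chain-rule term df/dg_i with a finite difference, mirroring the structure of the code below (which goes through its _compute_score helper):

    # finite-difference approximation of d f / d g_i
    def chain_grad(objective, best_costs, i, delta=1e-7):
        new_costs = list(best_costs)
        new_costs[i] -= delta
        return (objective(best_costs) - objective(new_costs)) / delta

    # with the default objective (sum of latencies) every task gets weight 1.0
    assert abs(chain_grad(sum, [1.0, 2.0, 3.0], 1) - 1.0) < 1e-4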
+""" + +import time +import math +import logging + +import numpy as np + +from .search_policy import SearchPolicy, SketchPolicy +from .cost_model import RandomModel, XGBModel +from .utils import array_mean, to_str_round +from .measure import ProgramMeasurer +from .measure_record import RecordReader + +logger = logging.getLogger("auto_scheduler") + + +def make_search_policies( + search_policy, tasks, num_measures_per_round, verbose, load_model_file=None, load_log_file=None +): + """Make a list of search policies for a list of search tasks. + It creates one policy per task. + + Parameters + ---------- + search_policy: Union[str, List[SearchPolicy]] + The name of search policy. + tasks: List[SearchTask] + The list of all tasks + num_measures_per_round: int + The number of schedules to be measured at each search round. + This should be the same as `TuningOptions.num_measures_per_round` + verbose: int + The verbosity level. 0 for silent. + load_model_file: Optional[str] + Load pre-trained model from this file. If this is None, the cost model will + be trained from scratch. + load_log_file: Optional[str] + Load measurement records from this file. If it is not None, the status of the + task scheduler, search policies and cost models will be restored according to this file. + + Returns + ------- + policies: List[SearchPolicy] + The list of search policies + """ + if search_policy == "default": + search_policy = "sketch.xgb" + + if isinstance(search_policy, str): + policy_type, model_type = search_policy.split(".") + if model_type == "xgb": + cost_model = XGBModel(num_warmup_sample=len(tasks) * num_measures_per_round) + if load_model_file: + logger.info("Load pretrained model...") + cost_model.load(load_model_file) + elif load_log_file: + cost_model.load_log_file(load_log_file) + elif model_type == "random": + cost_model = RandomModel() + else: + raise ValueError("Invalid search policy: " + search_policy) + + if policy_type == "sketch": + search_policies = [SketchPolicy(task, cost_model, verbose=verbose) for task in tasks] + else: + raise ValueError("Invalid search policy: " + search_policy) + else: + # check type + assert isinstance(search_policy, (tuple, list)) + for item in search_policy: + assert isinstance(item, SearchPolicy) + search_policies = search_policy + + return search_policies + + +def derive_similarity_tag(dag, log_base=1.618): + """Derive the tag for similarity check from one computational DAG. + The DAGs with the same tag are considered as similar tasks. + + The tag format is _ ... . + + If the tag is "", then the task is not considered to be similar to any other tasks. + + Parameters + ---------- + dag: ComputeDAG + The input computational DAG + log_base: float = 1.618 + The base of log to normalize FLOPS + + Returns + ------- + tag: str + The tag of this computational DAG. + """ + ret = "" + for op in dag.ops: + tag = op.attrs.get("auto_scheduler_task_scheduler_tag", None) + if tag: + ret += op.attrs["auto_scheduler_task_scheduler_tag"] + "_" + if ret: + ret += "%d" % int(math.log(dag.flop_ct + 1, log_base)) + return ret + + +class TaskScheduler: + """ + Allocate the time resources when tuning multiple tasks together. + This implements two strategies: "round-robin" and "gradient". + + Parameters + ---------- + tasks: List[SearchTask] + All tasks to tune + objective_func: Optional[Callable[List[float] -> float]] + The objective function to be minimized. + The objective function accepts the current latencies of all tasks and returns the + objective. 
If not presented, the objective is the sum of the latencies of all task. + strategy: str = "gradient" + The scheduling strategy. + "round-robin": Tune tasks in round robin order. + "gradient" : Tune tasks with gradient descent. + load_model_file: Optional[str] + Load pre-trained model from this file. If this is None, the cost model will + be trained from scratch. + load_log_file: Optional[str] + Load measurement records from this file. If it is not None, the status of the + task scheduler, search policies and cost models will be restored according to this file. + verbose: int = 1 + The level of verbosity. 0 means silent. + alpha: float = 0.2 + The parameter used for 'gradient' strategy + beta: float = 2 + The parameter used for 'gradient' strategy + backward_window_size: int = 3 + The parameter used for 'gradient' strategy + """ + + def __init__( + self, + tasks, + objective_func=None, + strategy="gradient", + load_model_file: str = None, + load_log_file: str = None, + verbose: int = 1, + alpha: float = 0.2, + beta: float = 2, + gamma: float = 0.5, + backward_window_size: int = 3, + ): + self.tasks = tasks + self.objective_func = objective_func or sum + self.strategy = strategy + self.verbose = verbose + self.load_log_file = load_log_file + self.load_model_file = load_model_file + self.alpha = alpha + self.beta = beta + self.gamma = gamma + self.backward_window_size = backward_window_size + + assert len(self.tasks) != 0, "No tasks" + assert self.strategy in ["round-robin", "gradient"] + + # task_cts[i] saves how many times task i is tuned + self.task_cts = [0 for _ in range(len(self.tasks))] + + # task_costs_history[i] saves the latency history of task i + self.task_costs_history = [[] for _ in range(len(self.tasks))] + + # best_costs[i] saves the best latency of task i + self.best_costs = 1e10 * np.ones(len(self.tasks)) + self.cur_score = self._compute_score(self.best_costs) + + self.tune_option = self.measurer = self.search_policies = self.ct = self.tic = None + self.num_measures_per_round = None + self.dead_tasks = set() + + # Build similarity groups + self.task_tags = [] # task_id -> tag + self.tag_to_group_id = {} # tag -> group_id + self.group_task_ids = [] # group_id -> all task ids in this group + self.flop_cts = [] # task_id -> the number of floating ops + for i, task in enumerate(self.tasks): + tag = derive_similarity_tag(task.compute_dag) + self.task_tags.append(tag) + self.flop_cts.append(task.compute_dag.flop_ct) + if not tag: + continue + + if tag not in self.tag_to_group_id: + self.tag_to_group_id[tag] = len(self.tag_to_group_id) + self.group_task_ids.append([]) + self.group_task_ids[self.tag_to_group_id[tag]].append(i) + + def tune(self, tune_option, search_policy="default"): + """Tune a batch of tasks together. + + Parameters + ---------- + tune_option: TuningOptions + The options of tuning + search_policy: : Union[str, List[SearchPolicy]] + The list of search policies. + If it is str. 
+            "sketch.xgb" for SketchPolicy + XGBModel
+            "sketch.random" for SketchPolicy + RandomModel
+        """
+        # init members
+        self.tune_option = tune_option
+        self.measurer = ProgramMeasurer(
+            tune_option.builder,
+            tune_option.runner,
+            tune_option.measure_callbacks,
+            tune_option.verbose,
+        )
+        self.ct = 0
+        self.tic = time.time()
+        # reset num_measures_per_round to make sure every task is tuned at least once
+        self.num_measures_per_round = min(
+            tune_option.num_measures_per_round, tune_option.num_measure_trials // len(self.tasks)
+        )
+        if self.num_measures_per_round <= 0:
+            raise ValueError("num_measure_trials is too small. Please set it to a higher value.")
+
+        # restore the status of the task scheduler from a log file
+        if self.load_log_file:
+            self._restore_status(self.load_log_file, self.num_measures_per_round)
+
+        # make one search policy for one task
+        self.search_policies = make_search_policies(
+            search_policy,
+            self.tasks,
+            self.num_measures_per_round,
+            tune_option.verbose,
+            self.load_model_file,
+            self.load_log_file,
+        )
+
+        # do a round robin first to warm up
+        for i in range(len(self.tasks)):
+            self._tune_task(i)
+
+        # use the specific strategy to choose workload to tune
+        task_idx = -1
+        while self.ct < tune_option.num_measure_trials and len(self.dead_tasks) < len(self.tasks):
+            if self.strategy == "round-robin":
+                task_idx = (task_idx + 1) % len(self.tasks)
+                while task_idx in self.dead_tasks:
+                    task_idx = (task_idx + 1) % len(self.tasks)
+            elif self.strategy == "gradient":
+                gradients = []
+                for i in range(len(self.tasks)):
+                    if i in self.dead_tasks:
+                        gradients.append(0)
+                        continue
+
+                    # compute gradient from chain rule : (delta f / delta g_i)
+                    delta = 1e-7
+                    new_costs = list(self.best_costs)
+                    new_costs[i] -= delta
+                    chain_grad = (
+                        self._compute_score(self.best_costs) - self._compute_score(new_costs)
+                    ) / delta
+
+                    # compute (g_i(t_i) - g(t_i - \Delta t)) / (\Delta t)
+                    if (
+                        self.task_cts[i] - 1 < len(self.task_costs_history[i])
+                        and self.task_cts[i] - 1 - self.backward_window_size >= 0
+                    ):
+                        backward_grad = (
+                            self.task_costs_history[i][self.task_cts[i] - 1]
+                            - self.task_costs_history[i][
+                                self.task_cts[i] - 1 - self.backward_window_size
+                            ]
+                        ) / self.backward_window_size
+                    else:
+                        backward_grad = 0
+
+                    # compute (g_i(t_i + \Delta t) - g(t_i)) / (\Delta t)
+                    g_next_1 = self.best_costs[i] - (self.best_costs[i] / self.task_cts[i])
+
+                    g_next_2 = self.beta * 1e30
+                    group_id = self.tag_to_group_id.get(self.task_tags[i], None)
+                    if group_id is not None and len(self.group_task_ids[group_id]) > 1:
+                        best_flops = max(
+                            [
+                                self.flop_cts[j] / self.best_costs[j]
+                                for j in self.group_task_ids[group_id]
+                            ]
+                        )
+                        g_next_2 = self.beta * self.flop_cts[i] / best_flops
+
+                    g_next = min(g_next_1, g_next_2)
+                    forward_grad = g_next - self.best_costs[i]
+
+                    # combine all grads
+                    grad = chain_grad * (
+                        self.alpha * backward_grad + (1 - self.alpha) * forward_grad
+                    )
+                    assert grad <= 0
+                    gradients.append(grad)
+
+                if max(gradients) == min(gradients):
+                    task_idx = np.random.choice(len(gradients))
+                else:
+                    task_idx = np.argmin(gradients)
+            else:
+                raise ValueError("Invalid strategy: " + self.strategy)
+
+            self._tune_task(task_idx)
+            self._adjust_similarity_group(task_idx)
+
+    def _tune_task(self, task_idx):
+        """Tune the selected task for one round"""
+        if self.verbose >= 1:
+            logger.info("TaskScheduler: task id:\t%d", task_idx)
+        measure_inputs, measure_results = self.search_policies[task_idx].continue_search_one_round(
+            self.num_measures_per_round, self.measurer
+        )
+
+        for res in measure_results:
+            cost = array_mean(res.costs)
+            if cost < self.best_costs[task_idx]:
+                self.best_costs[task_idx] = cost
+
+        if len(measure_inputs) == 0:
+            self.dead_tasks.add(task_idx)
+
+        self.task_cts[task_idx] += 1
+        self.task_costs_history[task_idx].append(self.best_costs[task_idx])
+
+        self.ct += len(measure_inputs)
+        self.cur_score = self._compute_score(self.best_costs)
+
+        if self.verbose >= 1:
+            logger.info(
+                "TaskScheduler\tct: %d\testimated cost (ms): %.3f\ttime elapsed: %.2f\t"
+                "best_costs (ms): %s\ttask_ct: %s",
+                self.ct,
+                self.cur_score * 1e3,
+                time.time() - self.tic,
+                to_str_round(self.best_costs * 1e3, decimal=3),
+                self.task_cts,
+            )
+
+    def _compute_score(self, costs):
+        """compute the objective function"""
+        return self.objective_func(costs)
+
+    def _adjust_similarity_group(self, task_idx):
+        """adjust the similarity group for the selected task"""
+        group_id = self.tag_to_group_id.get(self.task_tags[task_idx], None)
+        if group_id is None or len(self.group_task_ids[group_id]) <= 1:
+            return
+
+        group_ids = self.group_task_ids[group_id]
+        best_group_flops = max([self.flop_cts[j] / self.best_costs[j] for j in group_ids])
+        cur_flops = self.flop_cts[task_idx] / self.best_costs[task_idx]
+
+        # if we tune a task for many times but it still cannot achieve
+        # a similar speed to the fastest one in its group, this means this task
+        # is actually not similar to other tasks in its group.
+        # So we will remove it from its original group.
+        if cur_flops < best_group_flops / self.beta and self.task_cts[task_idx] > 5 + max(
+            self.task_cts[j] for j in group_ids if j != task_idx
+        ):
+            self.task_tags[task_idx] = None
+            group_ids.remove(task_idx)
+
+    def _restore_status(self, log_file, num_measures_per_round):
+        """restore task_cts and best_costs from a log file"""
+        str_target = str(self.tasks[0].target)
+        workload_key_to_task_id = {t.workload_key: i for i, t in enumerate(self.tasks)}
+        total_ct = -1
+
+        for total_ct, (inp, res) in enumerate(RecordReader(log_file)):
+            if str(inp.task.target) != str_target:
+                continue
+            task_idx = workload_key_to_task_id.get(inp.task.workload_key, None)
+            if task_idx is None:
+                continue
+
+            if res.error_no == 0:
+                self.best_costs[task_idx] = min(self.best_costs[task_idx], array_mean(res.costs))
+
+            self.task_cts[task_idx] += 1
+
+        for i in range(len(self.tasks)):
+            # The computation of task_cts is just an estimation.
+            # The estimation may not be accurate if the log file is changed externally or
+            # `num_measures_per_round` is different from the last tuning.
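+            # For example, with num_measures_per_round = 4, a task that appears in
+            # 9 matching records is counted as tuned int(9 / 4 + 0.5) = 2 times.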
+            self.task_cts[i] = int(self.task_cts[i] / num_measures_per_round + 0.5)
+            self.task_costs_history[i].append(self.best_costs[i])
+
+        logger.info("TaskScheduler: Loaded %d measurement records from %s", total_ct + 1, log_file)
diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py
index ff357c439797..75fec9c891e8 100644
--- a/python/tvm/auto_scheduler/utils.py
+++ b/python/tvm/auto_scheduler/utils.py
@@ -25,6 +25,8 @@
 import threading
 import os
 
+import numpy as np
+
 try:
     import psutil
 except ImportError:
@@ -264,3 +266,48 @@ def _check():
     t.start()
     t.join(timeout)
     return not t.is_alive()
+
+
+def array_mean(arr):
+    """Compute mean of the elements in a TVM Array<PrimExpr>
+
+    Parameters
+    ----------
+    arr: Array
+        A TVM Array<PrimExpr>
+
+    Returns
+    -------
+    mean: float
+        The mean of the elements in the array
+    """
+    return sum(x.value for x in arr) / len(arr)
+
+
+def to_str_round(x, decimal=6):
+    """Convert an object to str and round float numbers
+
+    Parameters
+    ----------
+    x: Union[str, list, int, float, np.ndarray]
+        The input object
+    decimal: int
+        The precision of decimal fraction
+
+    Returns
+    -------
+    ret: str
+        The string format of these objects
+    """
+    if isinstance(x, str):
+        return x
+    if isinstance(x, (list, tuple, np.ndarray)):
+        return "[" + ", ".join([to_str_round(y, decimal=decimal) for y in x]) + "]"
+    if isinstance(x, dict):
+        return str({k: to_str_round(v) for k, v in x.items()})
+    if isinstance(x, int):
+        return str(x)
+    if isinstance(x, (np.float32, np.float64, float)):
+        format_str = "%%.%df" % decimal
+        return format_str % x
+    raise ValueError("Invalid value: " + str(x) + "\ttype: " + str(type(x)))
diff --git a/src/auto_scheduler/auto_schedule.cc b/src/auto_scheduler/auto_schedule.cc
index dd6b70573a3b..747aa01cfa05 100755
--- a/src/auto_scheduler/auto_schedule.cc
+++ b/src/auto_scheduler/auto_schedule.cc
@@ -19,9 +19,7 @@
 /*!
  * \file auto_scheduler/auto_schedule.cc
- * \brief The user interface of the TVM Auto-scheduler. This is the entry structure to get
- * schedule search requirements from upper level (Python API), and returns a high performance
- * schedule after search process.
+ * \brief The user interface and tuning options of the TVM auto-scheduler.
  */
 
 #include <tvm/auto_scheduler/auto_schedule.h>
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 2744e0de4108..15066a98e2bc 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -871,7 +871,9 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
           stride = (i == static_cast<int>(for_loop_stack_.size()) - 1 ? stride : 0);
 
       float n_continuous = ele_bytes;
-      for (int i = static_cast<int>(tmp_region.size()) - 1; i >= 0; i--) {
+      for (int i = std::min(static_cast<int>(tmp_region.size()) - 1,
+                            static_cast<int>(int_shape.size()) - 1);
+           i >= 0; i--) {
         if (tmp_region[i] == int_shape[i]) {
           n_continuous *= tmp_region[i];
           break;
diff --git a/src/auto_scheduler/measure.cc b/src/auto_scheduler/measure.cc
index 70ea7abfae88..c3ee6a1495e3 100755
--- a/src/auto_scheduler/measure.cc
+++ b/src/auto_scheduler/measure.cc
@@ -38,6 +38,7 @@ TVM_REGISTER_NODE_TYPE(MeasureResultNode);
 TVM_REGISTER_OBJECT_TYPE(MeasureCallbackNode);
 TVM_REGISTER_OBJECT_TYPE(ProgramRunnerNode);
 TVM_REGISTER_OBJECT_TYPE(ProgramBuilderNode);
+TVM_REGISTER_OBJECT_TYPE(ProgramMeasurerNode);
 TVM_REGISTER_OBJECT_TYPE(LocalBuilderNode);
 TVM_REGISTER_OBJECT_TYPE(LocalRunnerNode);
 TVM_REGISTER_OBJECT_TYPE(RPCRunnerNode);
@@ -204,11 +205,12 @@ void ProgramMeasurerNode::Reset() {
   best_state.clear();
 }
 
-void ProgramMeasurerNode::Measure(const SearchTask& task, const SearchPolicy& policy,
-                                  const Array<MeasureInput>& inputs, Array<MeasureResult>* results,
-                                  int batch_size) {
-  results->clear();
-  results->reserve(inputs.size());
+Array<MeasureResult> ProgramMeasurerNode::Measure(const SearchTask& task,
+                                                  const SearchPolicy& policy,
+                                                  const Array<MeasureInput>& inputs,
+                                                  int batch_size) {
+  Array<MeasureResult> results;
+  results.reserve(inputs.size());
 
   if (batch_size == -1) {
     // set default batch size
@@ -261,13 +263,15 @@ void ProgramMeasurerNode::Measure(const SearchTask& task, const SearchPolicy& po
 
     // Store result batch
     for (auto& res : result_batch) {
-      results->push_back(res);
+      results.push_back(res);
     }
 
     if (error_ct > max_continuous_error) {
      LOG(FATAL) << "Too many errors happened during tuning";
     }
   }
+
+  return results;
 }
 
 void ProgramMeasurerNode::SilentMeasure(const SearchTask& task, const Array<MeasureInput>& inputs,
@@ -343,6 +347,12 @@ TVM_REGISTER_GLOBAL("auto_scheduler.MeasureResult")
       return MeasureResult(costs, error_no, error_msg, all_cost, timestamp);
     });
 
+TVM_REGISTER_GLOBAL("auto_scheduler.ProgramMeasurer")
+    .set_body_typed([](ProgramBuilder builder, ProgramRunner runner,
+                       Array<MeasureCallback> callbacks, int verbose, int max_continuous_error) {
+      return ProgramMeasurer(builder, runner, callbacks, verbose, max_continuous_error);
+    });
+
 TVM_REGISTER_GLOBAL("auto_scheduler.ProgramBuilderBuild")
     .set_body_typed([](const ProgramBuilder& builder, const Array<MeasureInput>& inputs,
                        int verbose) { return builder->Build(inputs, verbose); });
diff --git a/src/auto_scheduler/search_policy/empty_policy.cc b/src/auto_scheduler/search_policy/empty_policy.cc
index 21a68ac21d91..fba1ac2f42f8 100644
--- a/src/auto_scheduler/search_policy/empty_policy.cc
+++ b/src/auto_scheduler/search_policy/empty_policy.cc
@@ -19,7 +19,8 @@
 /*!
  * \file auto_scheduler/search_policy/empty_policy.cc
- * \brief This is an brief example of search policy.
+ * \brief A simple example of the search policy which always returns the initial naive schedule
+ * (state).
  */
 
 #include "empty_policy.h"
@@ -29,6 +30,8 @@
 
 #include <utility>
 
+#include "utils.h"
+
 namespace tvm {
 namespace auto_scheduler {
 
@@ -64,19 +67,18 @@ State EmptyPolicyNode::Search(int num_measure_trials, int early_stopping,
   measurer->Reset();
 
   int ct = 0;
   // In each round, we call SearchOneRound to get several candidate states,
-  // then use ProgramMeasurer to test their performance
+  // then use ProgramMeasurer to measure their performance.
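+  // Each round generates candidate states, wraps them as MeasureInputs, and hands
+  // them to the measurer, which builds and runs them while tracking the best state.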
   while (ct < num_measure_trials) {
     const auto& res = SearchOneRound();
     ct += res.size();
     // Build MeasureInputs for measuring
     inputs.clear();
     for (const auto& state : res) {
-      // The class members measured_states_set_ provided by SearchPolicy can be used to filter
-      // out the already measured states
       inputs.push_back(MeasureInput(search_task, state));
     }
+    // Perform measurement.
     // ProgramMeasurer will record the state with best performance during measure process
-    measurer->Measure(search_task, GetRef<SearchPolicy>(this), inputs, &results);
+    results = measurer->Measure(search_task, GetRef<SearchPolicy>(this), inputs);
   }
 
   // Return a state with best measured performance
@@ -84,18 +86,33 @@
   }
 }
 
+std::pair<Array<MeasureInput>, Array<MeasureResult>> EmptyPolicyNode::ContinueSearchOneRound(
+    int num_measure, ProgramMeasurer measurer) {
+  Array<State> best_states;
+  Array<MeasureInput> inputs;
+  Array<MeasureResult> results;
+
+  // Search one round to get promising states
+  PrintTitle("Search", verbose);
+  best_states = SearchOneRound();
+
+  // Measure these states
+  PrintTitle("Measure", verbose);
+  for (const auto& state : best_states) {
+    inputs.push_back(MeasureInput(search_task, state));
+  }
+  results = measurer->Measure(search_task, GetRef<SearchPolicy>(this), inputs);
+
+  return std::make_pair(std::move(inputs), std::move(results));
+}
+
 // As an example policy, EmptyPolicy always returns an init state
 Array<State> EmptyPolicyNode::SearchOneRound() {
   Array<State> res;
-  // 1. We will process `Program sampling` first to generate several initial schedules
+  // Simply return the initial naive schedule (state).
   res.push_back(search_task->compute_dag->init_state);
-
-  // 2. Then `Performance Tuning`: use cost model and evolutionary search to seek for the schedule
-  // with best performance
-  // Note: This example policy does not include this part
-
-  // 3. The returned candidate schedules will be measured in hardware
   return res;
 }
diff --git a/src/auto_scheduler/search_policy/empty_policy.h b/src/auto_scheduler/search_policy/empty_policy.h
index 3d138220dc0b..2219ebce83f0 100644
--- a/src/auto_scheduler/search_policy/empty_policy.h
+++ b/src/auto_scheduler/search_policy/empty_policy.h
@@ -19,7 +19,7 @@
 /*!
  * \file auto_scheduler/search_policy/empty_policy.h
- * \brief A brief example of the search policy which always returns the initial naive schedule
+ * \brief A simple example of the search policy which always returns the initial naive schedule
  * (state).
  */
 
@@ -27,14 +27,17 @@
 #define TVM_AUTO_SCHEDULER_SEARCH_POLICY_EMPTY_POLICY_H_
 
 #include <tvm/auto_scheduler/loop_state.h>
+#include <tvm/auto_scheduler/measure.h>
 #include <tvm/auto_scheduler/search_policy.h>
 
+#include <utility>
+
 namespace tvm {
 namespace auto_scheduler {
 
 /*!
- * \brief A brief example of the search policy which always returns the initial naive schedule
- * (state), the formal search policy will continue to follow its design.
+ * \brief A simple example of the search policy which always returns the initial naive schedule
+ * (state).
  * The key implementation for this structure is `Search()`, check `empty_policy.cc` for more
  * details.
  */
@@ -43,13 +46,16 @@ class EmptyPolicyNode : public SearchPolicyNode {
   State Search(int num_measure_trials, int early_stopping, int num_measures_per_round,
                ProgramMeasurer measurer) final;
 
+  std::pair<Array<MeasureInput>, Array<MeasureResult>> ContinueSearchOneRound(
+      int num_measure, ProgramMeasurer measurer) final;
+
   static constexpr const char* _type_key = "auto_scheduler.EmptyPolicy";
   TVM_DECLARE_FINAL_OBJECT_INFO(EmptyPolicyNode, SearchPolicyNode);
 
 private:
  /*!
   * \brief Use a sub function to generate several candidate states in each search round.
-  * \returns Several generated states
+  * \returns The generated states
   */
  Array<State> SearchOneRound();
};
diff --git a/src/auto_scheduler/search_policy/search_policy.cc b/src/auto_scheduler/search_policy/search_policy.cc
index d73bd911a921..8b6d22bb2725 100644
--- a/src/auto_scheduler/search_policy/search_policy.cc
+++ b/src/auto_scheduler/search_policy/search_policy.cc
@@ -104,8 +104,13 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyRunCallbacks")
       }
     });
 
-TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicySetTask")
-    .set_body_typed([](SearchPolicy policy, SearchTask task) { policy->search_task = task; });
+TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyContinueSearchOneRound")
+    .set_body_typed([](SearchPolicy policy, int num_measure, ProgramMeasurer measurer) {
+      Array<MeasureInput> inputs;
+      Array<MeasureResult> results;
+      std::tie(inputs, results) = policy->ContinueSearchOneRound(num_measure, measurer);
+      return Array<ObjectRef>{inputs, results};
+    });
 
 TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicySetVerbose")
     .set_body_typed([](SearchPolicy policy, int verbose) { policy->verbose = verbose; });
diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
index a89fa4b0c77a..8de17a626707 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -157,6 +157,7 @@ State SketchPolicyNode::Search(int n_trials, int early_stopping, int num_measure
 
   int ct = 0;
   int empty_retry_count = GetIntParam(params, SketchParamKey::empty_retry_count);
+  Array<State> best_states, random_states;
  Array<MeasureInput> inputs;
  Array<MeasureResult> results;
  while (ct < n_trials) {
@@ -168,8 +169,7 @@
 
      // Search one round to get promising states
      PrintTitle("Search", verbose);
-      Array<State> random_states;
-      Array<State> best_states = SearchOneRound(num_random, &random_states);
+      best_states = SearchOneRound(num_random * 3, &random_states);
 
      // Infer bound. This is necessary for computing the correct ToStr() for redundancy check
      best_states = search_task->compute_dag.InferBound(best_states);
@@ -196,7 +196,7 @@
 
      // Measure candidate states
      PrintTitle("Measure", verbose);
-      measurer->Measure(search_task, GetRef<SearchPolicy>(this), inputs, &results);
+      results = measurer->Measure(search_task, GetRef<SearchPolicy>(this), inputs);
      ct += inputs.size();
 
      // Check if reach the early stopping condition
@@ -218,15 +218,45 @@
  }
}
 
-Array<State> SketchPolicyNode::SearchOneRound(int num_random_states, Array<State>* random_states) {
-  // Temporal object to be used if the input pointer is nullptr
-  Array<State> temp_random_states;
-  if (random_states == nullptr) {
-    random_states = &temp_random_states;
-  } else {
-    random_states->clear();
+std::pair<Array<MeasureInput>, Array<MeasureResult>> SketchPolicyNode::ContinueSearchOneRound(
+    int num_measure, ProgramMeasurer measurer) {
+  num_measure_per_iter_ = num_measure;
+
+  Array<State> best_states, random_states;
+  Array<MeasureInput> inputs;
+  Array<MeasureResult> results;
+  int num_random = static_cast<int>(GetDoubleParam(params, "eps_greedy") * num_measure);
+
+  // Search one round to get promising states
+  PrintTitle("Search", verbose);
+  best_states = SearchOneRound(num_random * 3, &random_states);
+
+  // Infer bound. This is necessary for computing the correct ToStr() for redundancy check
+  best_states = search_task->compute_dag.InferBound(best_states);
+  random_states = search_task->compute_dag.InferBound(random_states);
+
+  // Pick `num_measure_per_iter` states to measure, check hash to remove already measured state
+  // Also pick some random states to do eps-greedy
+  inputs = PickStatesWithEpsGreedy(best_states, random_states, num_measure);
+
+  // Measure candidate states
+  PrintTitle("Measure", verbose);
+  results = measurer->Measure(search_task, GetRef<SearchPolicy>(this), inputs);
+
+  // Update measured states throughputs. These states will join the EvolutionarySearch in later
+  // search rounds.
+  for (const auto& res : results) {
+    measured_states_throughputs_.push_back(1.0 / FloatArrayMean(res->costs));
  }
 
+  // Update the cost model
+  PrintTitle("Train cost model", verbose);
+  program_cost_model->Update(inputs, results);
+
+  return std::make_pair(std::move(inputs), std::move(results));
+}
+
+Array<State> SketchPolicyNode::SearchOneRound(int num_random_states, Array<State>* random_states) {
  // Get parameters
  int population = GetIntParam(params, SketchParamKey::EvolutionarySearch::population);
  int num_use_measured =
@@ -245,8 +275,8 @@
  Array<State> init_population = SampleInitPopulation(
      sketch_cache_, is_cost_model_reasonable ? population - num_use_measured : population);
 
-  // 3. If the cost model is useless (i.e. RandomCostModel), just random pick some generated
-  // states, else perform evolutionary search
+  // 3. Perform evolutionary search if a cost model is utilized. Otherwise,
+  // just return some random states.
  if (is_cost_model_reasonable) {
    // Also insert already measured good states to the initial population
    std::vector<int> indices = Argsort(measured_states_throughputs_);
@@ -254,11 +284,13 @@ Array<State> SketchPolicyNode::SearchOneRound(int num_random_states, Array<State
 
+    if (num_random_states > 0 && random_states != nullptr) {
+      *random_states = RandomSampleStates(init_population, &rand_gen, num_random_states);
+    }
    return EvolutionarySearch(init_population, num_measure_per_iter_ * 2);
  } else {
    PruneInvalidState(search_task, &init_population);
-    return RandomSampleStates(init_population, &rand_gen, num_measure_per_iter_ * 3);
+    return RandomSampleStates(init_population, &rand_gen, num_measure_per_iter_ * 2);
  }
}
@@ -347,10 +379,7 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches
    support::parallel_for(0, out_size - out_states.size(),
                          [this, &temp_states, &sketches, &rand_gens](int index) {
-                            // Random choose a starting sketch
-                            // TODO(jcf94, merrymercy): Maybe choose sketches in different
-                            // possibility for they may have different potential on generating state
-                            // with better performance
+                            // Randomly choose a sketch
                            State tmp_s = sketches[(rand_gens[index])() % sketches.size()];
                            // Derivation rule based enumeration
                            bool valid = true;
@@ -472,6 +501,8 @@ Array<State> SketchPolicyNode::EvolutionarySearch(const Array<State>& init_popul
    // Compute selection probability
    ComputePrefixSumProb(pop_scores, &pop_selection_probs);
 
+    // TODO(merrymercy, comaniac): add crossover.
+
    // Do mutation
    while (pnext->size() < population) {
      State tmp_s = (*pnow)[RandomChoose(pop_selection_probs, &rand_gen)];
diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h
index 21aaa6ef7b90..edaa89e6cfd6 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.h
+++ b/src/auto_scheduler/search_policy/sketch_policy.h
@@ -19,13 +19,15 @@
 /*!
  * \file auto_scheduler/search_policy/sketch_policy.h
- * \brief The search policy that searches in a hierarchical search space defined by sketches.
- * The policy randomly samples programs from the space defined by sketches and use evolutionary
- * search to fine-tune them.
+ * \brief This search policy constructs a search space according to the compute declaration.
+ * It then randomly samples programs from the search space and uses evolutionary search with a
+ * learned cost model to fine tune the sampled programs.
+ * The final optimized programs are sent to actual hardware for measurement.
+ * The above process is repeated until the auto-scheduler runs out of time budget.
  *
  * Reference:
  * L. Zheng, C. Jia, M. Sun, Z. Wu, C. Yu, et al. "Ansor : Generating High-Performance Tensor
- * Programs for Deep Learning." arXiv preprint arXiv:2006.06762 (2020).
+ * Programs for Deep Learning." (OSDI 2020).
  */
 
 #ifndef TVM_AUTO_SCHEDULER_SEARCH_POLICY_SKETCH_POLICY_H_
@@ -106,6 +108,9 @@ class SketchPolicyNode : public SearchPolicyNode {
   State Search(int num_measure_trials, int early_stopping, int num_measures_per_round,
                ProgramMeasurer measurer) final;
 
+  std::pair<Array<MeasureInput>, Array<MeasureResult>> ContinueSearchOneRound(
+      int num_measure, ProgramMeasurer measurer) final;
+
   /*!
    * \brief Generate sketches.
    * \return The generated sketches (states).
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index 99188d4a6292..b6ad4d32b290 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -19,7 +19,8 @@
 /*!
  * \file auto_scheduler/search_policy/sketch_policy_rules.cc
- * \brief Rules defined to generate the sketches and initial sampled states in SketchPolicy.
+ * \brief Rules for generating the sketches, sampling the initial population, and mutating the
+ * population in SketchPolicy.
  */
 
 #include "sketch_policy_rules.h"
@@ -317,7 +318,7 @@ SketchGenerationRule::ConditionKind RuleCrossThreadReduction::MeetCondition(
     const SketchPolicyNode& policy, const State& state, int stage_id) const {
   CHECK(IsGPUTask(policy.search_task));
 
-  // If it is an intermidiate state created by RuleAddCacheWrite,
+  // If it is an intermediate state created by RuleAddCacheWrite,
   // we just skip it.
   if (HasCacheWriteStage(state, stage_id)) {
     return ConditionKind::kSkip;
@@ -1116,6 +1117,10 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol
     }
   }
 
+  if (max_fusable_iter_id == 0) {
+    return ResultKind::kInvalid;
+  }
+
   // Randomly pick one granularity
   int fuse_to_iter_id = (*rand_gen)() % max_fusable_iter_id + 1;
   Array<Integer> fused_ids;
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h
index 035dc897d3da..046f036d59d9 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.h
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h
@@ -19,7 +19,8 @@
 /*!
  * \file auto_scheduler/search_policy/sketch_policy_rules.h
- * \brief Rules defined to generate the sketches and initial sampled states in SketchPolicy.
+ * \brief Rules for generating the sketches, sampling the initial population, and mutating the
+ * population in SketchPolicy.
*/ #ifndef TVM_AUTO_SCHEDULER_SEARCH_POLICY_SKETCH_POLICY_RULES_H_ diff --git a/tests/python/unittest/test_auto_scheduler_task_scheduler.py b/tests/python/unittest/test_auto_scheduler_task_scheduler.py new file mode 100644 index 000000000000..72b998a5a38a --- /dev/null +++ b/tests/python/unittest/test_auto_scheduler_task_scheduler.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" Test task scheduler """ + +import tempfile + +import numpy as np + +from tvm import auto_scheduler + +from test_auto_scheduler_common import matmul_auto_scheduler_test + + +def test_task_scheduler_round_robin(): + tasks = [] + for n in [2, 4, 8]: + tasks.append(auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n), "llvm")) + + def objective_func(costs): + return sum(costs) + + with tempfile.NamedTemporaryFile() as fp: + log_file = fp.name + num_trials_per_task = 2 + + # Tune all tasks + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=num_trials_per_task * len(tasks), + num_measures_per_round=1, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + task_scheduler = auto_scheduler.TaskScheduler(tasks, objective_func, strategy="round-robin") + task_scheduler.tune(tune_option, search_policy="sketch.random") + + # Check the result of round robin + counters = {} + for task in tasks: + counters[task.workload_key] = 0 + + for inp, res in auto_scheduler.load_records(log_file): + counters[inp.task.workload_key] += 1 + + for task in tasks: + assert counters[task.workload_key] == num_trials_per_task + + # test continuous tuning (restoring the status) + task_scheduler = auto_scheduler.TaskScheduler( + tasks, objective_func, strategy="round-robin", load_log_file=log_file + ) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=len(tasks), + num_measures_per_round=1, + ) + task_scheduler.tune(tune_option, search_policy="sketch.random") + + +def test_task_scheduler_gradient(): + tasks = [] + for n in [2, 4]: + tasks.append(auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n), "llvm")) + + def objective_func(costs): + return costs[0] + + with tempfile.NamedTemporaryFile() as fp: + log_file = fp.name + + n_trials = 5 + + # Tune all tasks + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=n_trials, + num_measures_per_round=1, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + task_scheduler = auto_scheduler.TaskScheduler(tasks, objective_func) + + # Forcely rewrite the initial values. 
+ # This can make this test more stable on the slow CI machines + task_scheduler.best_costs = np.array([1e2, 1e-8]) + + task_scheduler.tune(tune_option, search_policy="sketch.random") + + # Check the allocation results + counters = {} + for task in tasks: + counters[task.workload_key] = 0 + + for inp, res in auto_scheduler.load_records(log_file): + counters[inp.task.workload_key] += 1 + + assert counters[tasks[0].workload_key] == n_trials - 1 + assert counters[tasks[1].workload_key] == 1 + + +if __name__ == "__main__": + test_task_scheduler_round_robin() + test_task_scheduler_gradient() diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index 5004a5f73286..b800eb469ec5 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -25,10 +25,9 @@ Different from the existing :ref:`autotvm ` which relies on manual templates to define the search space, the auto-scheduler does not require any templates. -The auto-scheduler is template-free, so users only need to write the computation declaration without -any schedule commands or templates. -The auto-scheduler can automatically generate a large -search space and find a good schedule in the space. +Users only need to write the computation declaration without any schedule commands or templates. +The auto-scheduler can automatically generate a large search space and +find a good schedule in the space. We use a convolution layer as an example in this tutorial. """ diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index e1e011555445..35c47444e081 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -22,10 +22,9 @@ Different from the existing :ref:`autotvm ` which relies on manual templates to define the search space, the auto-scheduler does not require any templates. -The auto-scheduler is template-free, so users only need to write the computation declaration without -any schedule commands or templates. -The auto-scheduler can automatically generate a large -search space and find a good schedule in the space. +Users only need to write the computation declaration without any schedule commands or templates. +The auto-scheduler can automatically generate a large search space and +find a good schedule in the space. We use matrix multiplication as an example in this tutorial. 
""" From 1e66bfbd2d4961b2774876f45f31707772251c2f Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sun, 18 Oct 2020 23:56:44 -0700 Subject: [PATCH 023/258] [AutoSchedule] Support multiple cache read and fix bugs (#6686) * Add shape to DAG print * avoid useless cross-thread reduction * Fix stage order * support multiple cache_read * lint * fix * fix * address comment * fix ci * Trigger CI & Update doc strings Co-authored-by: Lianmin Zheng --- include/tvm/auto_scheduler/compute_dag.h | 7 +- python/tvm/auto_scheduler/compute_dag.py | 23 ++++-- src/auto_scheduler/compute_dag.cc | 77 ++++++++++++++----- .../search_policy/sketch_policy_rules.cc | 31 +++++--- src/te/schedule/schedule_dataflow_rewrite.cc | 9 +++ .../unittest/test_auto_scheduler_common.py | 14 ++++ .../test_auto_scheduler_compute_dag.py | 53 ++++++++++++- 7 files changed, 174 insertions(+), 40 deletions(-) diff --git a/include/tvm/auto_scheduler/compute_dag.h b/include/tvm/auto_scheduler/compute_dag.h index 553008a7fcbf..6e67fef0f283 100755 --- a/include/tvm/auto_scheduler/compute_dag.h +++ b/include/tvm/auto_scheduler/compute_dag.h @@ -200,11 +200,16 @@ class ComputeDAGNode : public Object { */ class ComputeDAG : public ObjectRef { public: - /*! \brief The constructor. + /*! \brief Construct a DAG from a list of output tensors. * \param tensors `te::Tensor`s for a compute declaration. */ TVM_DLL explicit ComputeDAG(Array tensors); + /*! \brief Construct a DAG based on a schedule. + * \param sch `te::Schedule`s for a compute declaration. + */ + TVM_DLL explicit ComputeDAG(const te::Schedule& sch); + /*! * \brief Rewrite the layout of placeholder specified by attr `layout_free_placeholders` * according to the loop nest derived with `transform_steps`. diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py index 0115dbcf8ebe..4b1b264c30d8 100755 --- a/python/tvm/auto_scheduler/compute_dag.py +++ b/python/tvm/auto_scheduler/compute_dag.py @@ -47,22 +47,29 @@ class ComputeDAG(Object): Parameters ---------- - compute : Union[List[Tensor], str] + compute : Union[List[Tensor], str, Schedule] Input/output tensors or workload key for a compute declaration. """ - def __init__(self, compute): - if isinstance(compute, str): - compute = workload_key_to_tensors(compute) - elif isinstance(compute, list): - for item in compute: + def __init__(self, compute_or_sche): + if isinstance(compute_or_sche, str): + compute = workload_key_to_tensors(compute_or_sche) + sche = None + elif isinstance(compute_or_sche, list): + for item in compute_or_sche: if not isinstance(item, tvm.te.Tensor): raise ValueError("The input of ComputeDAG should be a list of Tensor") + compute = compute_or_sche + sche = None + elif isinstance(compute_or_sche, tvm.te.Schedule): + compute = None + sche = compute_or_sche else: raise ValueError( - "Invalid compute: " + compute + " . ComputeDAG expects a string or list of Tensor" + "Invalid compute type: %s. ComputeDAG expects string, list of Tensor, or Schedule" + % type(compute) ) - self.__init_handle_by_constructor__(_ffi_api.ComputeDAG, compute) + self.__init_handle_by_constructor__(_ffi_api.ComputeDAG, compute, sche) def get_init_state(self): """Get the init state of this ComputeDAG. 
diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
index 23b38173c8c2..3b0de974617c 100755
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -662,7 +662,42 @@ ComputeDAG::ComputeDAG(Array<te::Tensor> tensors) {
   auto node = make_object<ComputeDAGNode>();
   node->tensors = std::move(tensors);
   node->access_analyzer = AccessAnalyzer(node->tensors);
-  node->ops = node->access_analyzer->ops_topo_order;
+
+  Array<te::Operation> out_ops;
+  for (const auto& op : node->access_analyzer->ops_topo_order) {
+    if (node->access_analyzer.IsOutput(op)) {
+      out_ops.push_back(op);
+    }
+  }
+  te::Schedule sch = te::create_schedule(out_ops);
+  for (auto stage : sch->stages) {
+    node->ops.push_back(stage->op);
+  }
+
+  node->flop_ct = FlopEstimator().EstimateFlop(node->ops);
+  node->init_state = State(node->ops);
+  data_ = std::move(node);
+}
+
+ComputeDAG::ComputeDAG(const te::Schedule& sch) {
+  auto node = make_object<ComputeDAGNode>();
+
+  // Initialize ops. Here we enforce the order of ops and stages are consistent
+  for (auto stage : sch->stages) {
+    node->ops.push_back(stage->op);
+  }
+
+  // Collect input and output tensors
+  Array<te::Tensor> tensors;
+  for (auto stage : sch->stages) {
+    if (stage->op->IsInstance<te::PlaceholderOpNode>() || stage->is_output) {
+      for (auto i = 0; i < stage->op->num_outputs(); ++i) {
+        tensors.push_back(stage->op.output(i));
+      }
+    }
+  }
+  node->tensors = std::move(tensors);
+  node->access_analyzer = AccessAnalyzer(node->tensors);
   node->flop_ct = FlopEstimator().EstimateFlop(node->ops);
   node->init_state = State(node->ops);
   data_ = std::move(node);
@@ -949,8 +984,6 @@ void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {
     }
   }
 
-  p_dag->init_state = State(p_dag->ops);
-
   Array<te::Tensor> old_tensors = p_dag->tensors;
   ArrayNode* p_tensors = p_dag->tensors.CopyOnWrite();
 
@@ -970,8 +1003,21 @@ void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {
     }  // end for placeholder
   }    // end for stage
   p_dag->access_analyzer = AccessAnalyzer(p_dag->tensors);
-  p_dag->ops = p_dag->access_analyzer->ops_topo_order;
+
+  Array<te::Operation> out_ops;
+  for (const auto& op : p_dag->access_analyzer->ops_topo_order) {
+    if (p_dag->access_analyzer.IsOutput(op)) {
+      out_ops.push_back(op);
+    }
+  }
+
+  p_dag->ops.clear();
+  te::Schedule sch = te::create_schedule(out_ops);
+  for (auto stage : sch->stages) {
+    p_dag->ops.push_back(stage->op);
+  }
   p_dag->flop_ct = FlopEstimator().EstimateFlop(p_dag->ops);
+  p_dag->init_state = State(p_dag->ops);
 }
 
 std::pair<te::Schedule, Array<te::Tensor>> ComputeDAG::ApplySteps(
@@ -1144,17 +1190,7 @@ ComputeDAG ComputeDAG::ReplayAndGetDAG(const Array<Step>& transform_steps) const
   te::Schedule sch;
   Array<te::Tensor> old_tensors;
   std::tie(sch, old_tensors) = ApplySteps(transform_steps);
-
-  Array<te::Tensor> new_tensors;
-  for (auto stage : sch->stages) {
-    if (stage->op->IsInstance<te::PlaceholderOpNode>() || stage->is_output) {
-      for (auto i = 0; i < stage->op->num_outputs(); ++i) {
-        new_tensors.push_back(stage->op.output(i));
-      }
-    }
-  }
-
-  return ComputeDAG(new_tensors);
+  return ComputeDAG(sch);
 }
 
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
@@ -1259,9 +1295,14 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->stream << ss.str();
     });
 
-TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAG").set_body_typed([](Array<te::Tensor> tensors) {
-  return ComputeDAG(tensors);
-});
+TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAG")
+    .set_body_typed([](Optional<Array<te::Tensor>> tensors, Optional<te::Schedule> sch) {
+      if (tensors) {
+        return ComputeDAG(tensors.value());
+      }
+      CHECK(sch) << "Both tensors and schedule are null";
+      return ComputeDAG(sch.value());
+    });
 
 TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGApplyStepsFromState")
     .set_body_typed([](const ComputeDAG& dag, const State& state, const bool layout_rewrite) {
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index b6ad4d32b290..1b965c9886a1 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -151,9 +151,6 @@ SketchGenerationRule::ConditionKind RuleAddCacheRead::MeetCondition(const Sketch
 
   // Don't cache_read a stage if it has multiple consumers
   const std::set<int>& consumers = GetConsumers(task, state, stage_id);
-  if (consumers.size() != 1) {
-    return ConditionKind::kSkip;
-  }
 
   // Don't cache_read a stage if its consumer does not need multi-level tiling
   int target_stage_id = *consumers.begin();
@@ -179,16 +176,22 @@ std::vector<std::pair<State, int>> RuleAddCacheRead::Apply(const SketchPolicyNod
                                                            const State& state, int stage_id) const {
   const SearchTask& task = policy.search_task;
   const std::set<int>& consumers = GetConsumers(task, state, stage_id);
-  CHECK_EQ(consumers.size(), 1);
-  int target_stage_id = *consumers.begin();
   State tmp_s = state;
 
-  // Cache read add shared memory
-  int added_stage_id = tmp_s.cache_read(stage_id, "shared", {target_stage_id}, task->compute_dag);
-  target_stage_id++;
-  const auto& share_read_pos =
-      GetLastReduceIteratorInOutermostReduceTile(tmp_s->stages[target_stage_id]);
-  tmp_s.compute_at(added_stage_id, target_stage_id, share_read_pos);
+  int target_stage_id_offset = 0;
+  for (int orig_target_stage_id : consumers) {
+    int target_stage_id = orig_target_stage_id + target_stage_id_offset;
+
+    // Cache read add shared memory
+    int added_stage_id = tmp_s.cache_read(stage_id, "shared", {target_stage_id}, task->compute_dag);
+    target_stage_id_offset++;
+    target_stage_id++;
+
+    const auto& share_read_pos =
+        GetLastReduceIteratorInOutermostReduceTile(tmp_s->stages[target_stage_id]);
+    tmp_s.compute_at(added_stage_id, target_stage_id, share_read_pos);
+  }
+
   return {std::make_pair(tmp_s, stage_id)};
 }
 
@@ -332,7 +335,11 @@ SketchGenerationRule::ConditionKind RuleCrossThreadReduction::MeetCondition(
       GetCumulativeSpaceAndReductionLength(state->stages[stage_id]);
 
   if (NeedsMultilevelTiling(policy.search_task, state, stage_id)) {
-    // Do rfactor if we do not have enough parallelism on space iters
+    // Avoid rfactor if we have enough parallelism on space iters
+    if (cum_space_len > policy.search_task->hardware_params->max_threads_per_block) {
+      return ConditionKind::kSkip;
+    }
+
     return cum_space_len < cum_reduce_len ? ConditionKind::kApply : ConditionKind::kSkip;
   } else if (cum_reduce_len > 1) {
     // Try rfactor for other reduction operators
diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc
index f335f953b7b2..941817a5d954 100644
--- a/src/te/schedule/schedule_dataflow_rewrite.cc
+++ b/src/te/schedule/schedule_dataflow_rewrite.cc
@@ -136,6 +136,15 @@ Tensor Schedule::cache_read(const Tensor& tensor, const std::string& scope,
   if (tensor->op->num_outputs() != 1) {
     os << ".v" << tensor->value_index;
   }
+
+  // when a schedule has multiple cache_read on the same tensor,
+  // we make sure their op names are unique. e.g., w.shared, w_d.shared, w_d_d.shared
+  for (auto pair : (*this)->stage_map) {
+    auto stage = pair.second;
+    if (stage->op->name == os.str() + "." + scope) {
+      os << ".d";
+    }
+  }
   os << "." 
<< scope; std::unordered_map vsub; diff --git a/tests/python/unittest/test_auto_scheduler_common.py b/tests/python/unittest/test_auto_scheduler_common.py index 880b11211e4e..764099e7bd07 100644 --- a/tests/python/unittest/test_auto_scheduler_common.py +++ b/tests/python/unittest/test_auto_scheduler_common.py @@ -53,6 +53,20 @@ def double_matmul_auto_scheduler_test(N): return [A, B, C, E] +@auto_scheduler.register_workload +def parallel_matmul_auto_scheduler_test(N): + """Two parallel matmuls with shared A.""" + A = te.placeholder((N, N), name="A", dtype="float32") + B = te.placeholder((N, N), name="B", dtype="float32") + C = te.placeholder((N, N), name="C", dtype="float32") + k = te.reduce_axis((0, N), name="k") + D = te.compute((N, N), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="D") + k = te.reduce_axis((0, N), name="k") + E = te.compute((N, N), lambda i, j: te.sum(A[i][k] * C[k][j], axis=[k]), name="E") + + return [A, B, C, D, E] + + # Test for register_workload with different name @auto_scheduler.register_workload("matmul_auto_scheduler_test_rename_1") def matmul_auto_scheduler_test_rename_0(N, M, K): diff --git a/tests/python/unittest/test_auto_scheduler_compute_dag.py b/tests/python/unittest/test_auto_scheduler_compute_dag.py index a58f2ca374a2..2ccedef9e2de 100644 --- a/tests/python/unittest/test_auto_scheduler_compute_dag.py +++ b/tests/python/unittest/test_auto_scheduler_compute_dag.py @@ -21,7 +21,11 @@ from tvm import topi from tvm import auto_scheduler, te -from test_auto_scheduler_common import get_tiled_matmul, matmul_auto_scheduler_test +from test_auto_scheduler_common import ( + get_tiled_matmul, + matmul_auto_scheduler_test, + parallel_matmul_auto_scheduler_test, +) def test_apply_steps(): @@ -56,7 +60,54 @@ def test_estimate_flop(): assert abs(dag.flop_ct - (2 * N ** 3 + 1234)) < 0.5 +def test_stage_order(): + N = 512 + A, B, C, D, E = parallel_matmul_auto_scheduler_test(N) + sch = te.create_schedule([D.op, E.op]) + (D_local,) = sch.cache_write([D], "local") + (E_local,) = sch.cache_write([E], "local") + sch.cache_read(A, "shared", [D_local]) + sch.cache_read(B, "shared", [D_local]) + sch.cache_read(A, "shared", [E_local]) + sch.cache_read(C, "shared", [E_local]) + + dag = auto_scheduler.ComputeDAG(sch) + stage_ops_1 = dag.get_init_state().stage_ops + + # 3 placeholder, 4 x.shared, 2 {D,E}.local, 2 {D,E} compute + assert len(stage_ops_1) == 11 + + # Cache read stage should follow the source stage + for idx, op in enumerate(stage_ops_1): + if op.name == "A": + assert ( + stage_ops_1[idx + 1].name == "A.d.shared" + and stage_ops_1[idx + 2].name == "A.shared" + ) + elif op.name in ["B", "C"]: + assert stage_ops_1[idx + 1].name == "%s.shared" % op.name + + # Apply the same schedule to Ansor state and it should have the same stage order + dag = auto_scheduler.ComputeDAG([A, B, C, D, E]) + state = dag.get_init_state() + + D_local = state.cache_write(D, "local") + E_local = state.cache_write(E, "local") + state.cache_read(A, "shared", [D_local]) + state.cache_read(B, "shared", [D_local]) + state.cache_read(A, "shared", [E_local]) + state.cache_read(C, "shared", [E_local]) + + stage_ops_2 = state.stage_ops + assert len(stage_ops_1) == len(stage_ops_2) + + # Cache read stage should follow the source stage + for op1, op2 in zip(stage_ops_1, stage_ops_2): + assert op1.name == op2.name + + if __name__ == "__main__": test_apply_steps() test_infer_bound() test_estimate_flop() + test_stage_order() From cc1b8d062fe2e9a4e81dc14a212ef1e4f61ebab6 Mon Sep 17 00:00:00 2001 From: 
Tianqi Chen Date: Mon, 19 Oct 2020 10:41:57 -0700 Subject: [PATCH 024/258] [CI] Update docker to latest (#6708) --- Jenkinsfile | 6 +++--- cmake/utils/FindCUDA.cmake | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0f59de70680a..9d475045bb93 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,10 +45,10 @@ // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> ci_lint = "tlcpack/ci-lint:v0.62" -ci_gpu = "tlcpack/ci-gpu:v0.65" -ci_cpu = "tlcpack/ci-cpu:v0.67" +ci_gpu = "tlcpack/ci-gpu:v0.70" +ci_cpu = "tlcpack/ci-cpu:v0.70" ci_wasm = "tlcpack/ci-wasm:v0.60" -ci_i386 = "tlcpack/ci-i386:v0.52" +ci_i386 = "tlcpack/ci-i386:v0.70" ci_qemu = "tlcpack/ci-qemu:v0.01" // <--- End of regex-scanned config. diff --git a/cmake/utils/FindCUDA.cmake b/cmake/utils/FindCUDA.cmake index f7d9b5ed6d08..c95f8ce722f4 100644 --- a/cmake/utils/FindCUDA.cmake +++ b/cmake/utils/FindCUDA.cmake @@ -87,15 +87,20 @@ macro(find_cuda use_cuda) NO_DEFAULT_PATH) find_library(CUDA_CUDNN_LIBRARY cudnn ${CUDA_TOOLKIT_ROOT_DIR}/lib64 - ${CUDA_TOOLKIT_ROOT_DIR}/lib) + ${CUDA_TOOLKIT_ROOT_DIR}/lib + NO_DEFAULT_PATH) + # search default path if cannot find cudnn in non-default + find_library(CUDA_CUDNN_LIBRARY cudnn) find_library(CUDA_CUBLAS_LIBRARY cublas ${CUDA_TOOLKIT_ROOT_DIR}/lib64 - ${CUDA_TOOLKIT_ROOT_DIR}/lib) + ${CUDA_TOOLKIT_ROOT_DIR}/lib + NO_DEFAULT_PATH) find_library(CUDA_CUBLASLT_LIBRARY NAMES cublaslt cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 - ${CUDA_TOOLKIT_ROOT_DIR}/lib) + ${CUDA_TOOLKIT_ROOT_DIR}/lib + NO_DEFAULT_PATH) endif(MSVC) message(STATUS "Found CUDA_TOOLKIT_ROOT_DIR=" ${CUDA_TOOLKIT_ROOT_DIR}) message(STATUS "Found CUDA_CUDA_LIBRARY=" ${CUDA_CUDA_LIBRARY}) From a5cc256219619aa0293a0b9df0cf8a13ea7fed4b Mon Sep 17 00:00:00 2001 From: presburger Date: Tue, 20 Oct 2020 01:48:57 +0800 Subject: [PATCH 025/258] Fix the Type bug in ConvertSSA. (#6709) Co-authored-by: YushengMa --- src/tir/transforms/ir_utils.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc index 744e9a4f8ed3..d6c7300f2edb 100644 --- a/src/tir/transforms/ir_utils.cc +++ b/src/tir/transforms/ir_utils.cc @@ -158,7 +158,7 @@ class IRConvertSSA final : public StmtExprMutator { Stmt VisitStmt_(const AllocateNode* op) final { const Var& v = op->buffer_var; if (defined_.count(v.get())) { - Var new_var(v->name_hint, v.dtype()); + Var new_var(v->name_hint, v->type_annotation); scope_[v.get()].push_back(new_var); Stmt stmt = StmtExprMutator::VisitStmt_(op); scope_[v.get()].pop_back(); From bb767ac0532487b81876d55a7cd5b54056fcc365 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Mon, 19 Oct 2020 16:03:06 -0700 Subject: [PATCH 026/258] [BYOC][TensorRT] TensorRT BYOC integration (#6395) * TensorRT integration using JSONRuntime Support input nodes with multiple data entries Fix failing tests Support layout transform, add engine caching Add comment Add PruneSubgraph pass Use prune_subgraph pass, make params member of trt runtime class Hide deprecation warnings coming from TRT headers Remove general prune subgraph Save/load use_implicit_batch and workspace size Clean up Fix cpp lint Addressing review comments Refactor tests Use relay.bind instead of VarReplacer. 
Improve some annotation functions Add TRT docs Use DLOG, formatting Use logging.info instead of print also refactor integ tests also refactor integ tests Formatting Formatting Format python fix python format Fix pylint Fix sphinx precheck Add tensorrt.rst to toctree Allow codegen to be tested when TRT runtime is not available. Enable TRT codegen in CI linty Address more comments Formatting Formatting * Documentation changes * Address comments * Rename USE_TENSORRT->USE_TENSORRT_CODEGEN and USE_TENSORRT_GRAPH_RUNTIME->USE_TENSORRT_RUNTIME * Fix comment typo * Test CI without TRT codegen enabled * formatting * Enable USE_TENSORRT_CODEGEN in CI * Change file_util.h -> file_utils.h --- CMakeLists.txt | 3 + cmake/config.cmake | 10 + cmake/modules/contrib/TensorRT.cmake | 54 + docs/deploy/index.rst | 1 + docs/deploy/tensorrt.rst | 297 +++++ python/tvm/relay/op/contrib/__init__.py | 1 + python/tvm/relay/op/contrib/tensorrt.py | 769 ++++++++++++ src/relay/backend/contrib/tensorrt/codegen.cc | 240 ++++ .../contrib/tensorrt/tensorrt_builder.cc | 222 ++++ .../contrib/tensorrt/tensorrt_builder.h | 159 +++ .../contrib/tensorrt/tensorrt_logger.h | 78 ++ src/runtime/contrib/tensorrt/tensorrt_ops.cc | 1070 +++++++++++++++++ src/runtime/contrib/tensorrt/tensorrt_ops.h | 207 ++++ .../contrib/tensorrt/tensorrt_runtime.cc | 312 +++++ src/runtime/contrib/tensorrt/tensorrt_utils.h | 74 ++ tests/python/contrib/test_tensorrt.py | 905 ++++++++++++++ tests/scripts/task_config_build_gpu.sh | 3 +- 17 files changed, 4403 insertions(+), 2 deletions(-) create mode 100644 cmake/modules/contrib/TensorRT.cmake create mode 100644 docs/deploy/tensorrt.rst create mode 100644 python/tvm/relay/op/contrib/tensorrt.py create mode 100644 src/relay/backend/contrib/tensorrt/codegen.cc create mode 100644 src/runtime/contrib/tensorrt/tensorrt_builder.cc create mode 100644 src/runtime/contrib/tensorrt/tensorrt_builder.h create mode 100644 src/runtime/contrib/tensorrt/tensorrt_logger.h create mode 100644 src/runtime/contrib/tensorrt/tensorrt_ops.cc create mode 100644 src/runtime/contrib/tensorrt/tensorrt_ops.h create mode 100644 src/runtime/contrib/tensorrt/tensorrt_runtime.cc create mode 100644 src/runtime/contrib/tensorrt/tensorrt_utils.h create mode 100644 tests/python/contrib/test_tensorrt.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 33c720c4cce4..d07f55f06ad0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,8 @@ tvm_option(USE_COREML "Build with coreml support" OFF) tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF) tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF) tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF) +tvm_option(USE_TENSORRT_CODEGEN "Build with TensorRT Codegen support" OFF) +tvm_option(USE_TENSORRT_RUNTIME "Build with TensorRT runtime" OFF) # include directories include_directories(${CMAKE_INCLUDE_PATH}) @@ -363,6 +365,7 @@ include(cmake/modules/contrib/TF_TVMDSOOP.cmake) include(cmake/modules/contrib/CoreML.cmake) include(cmake/modules/contrib/ONNX.cmake) include(cmake/modules/contrib/ArmComputeLib.cmake) +include(cmake/modules/contrib/TensorRT.cmake) include(cmake/modules/Git.cmake) include(cmake/modules/LibInfo.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 1d465b2fe389..b220f3b0b9f0 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -222,6 +222,16 @@ set(USE_ETHOSN OFF) # otherwise use ETHOSN_HW (OFF) to use the software test infrastructure set(USE_ETHOSN_HW OFF) +# Whether to build 
with TensorRT codegen or runtime
+# Examples are available here: docs/deploy/tensorrt.rst.
+#
+# USE_TENSORRT_CODEGEN - Support for compiling a relay graph where supported operators are
+#                        offloaded to TensorRT. OFF/ON
+# USE_TENSORRT_RUNTIME - Support for running TensorRT compiled modules, requires presence of
+#                        TensorRT library. OFF/ON/"path/to/TensorRT"
+set(USE_TENSORRT_CODEGEN OFF)
+set(USE_TENSORRT_RUNTIME OFF)
+
 # Build ANTLR parser for Relay text format
 # Possible values:
 # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar)
diff --git a/cmake/modules/contrib/TensorRT.cmake b/cmake/modules/contrib/TensorRT.cmake
new file mode 100644
index 000000000000..1536d23205a7
--- /dev/null
+++ b/cmake/modules/contrib/TensorRT.cmake
@@ -0,0 +1,54 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# TensorRT Codegen only. This can be enabled independently of USE_TENSORRT_RUNTIME to enable
+# compilation of TensorRT modules without requiring TensorRT to be installed. The compiled modules
+# will only be able to be executed using a TVM built with USE_TENSORRT_RUNTIME=ON.
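+# A typical cross-compiling split (an illustrative sketch, not a requirement of this module):
+# the host build sets USE_TENSORRT_CODEGEN=ON with USE_TENSORRT_RUNTIME=OFF, while the
+# edge-device build sets USE_TENSORRT_RUNTIME=ON (or a TensorRT path) instead.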
+if(USE_TENSORRT_CODEGEN) + message(STATUS "Build with TensorRT codegen") + file(GLOB COMPILER_TENSORRT_SRCS src/relay/backend/contrib/tensorrt/*.cc) + set_source_files_properties(${COMPILER_TENSORRT_SRCS} PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations") + file(GLOB RUNTIME_TENSORRT_SRCS src/runtime/contrib/tensorrt/tensorrt_runtime.cc) + set_source_files_properties(${RUNTIME_TENSORRT_SRCS} PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations") + list(APPEND COMPILER_SRCS ${COMPILER_TENSORRT_SRCS}) + list(APPEND COMPILER_SRCS ${RUNTIME_TENSORRT_SRCS}) +endif() + +# TensorRT Runtime +if(USE_TENSORRT_RUNTIME) + if(IS_DIRECTORY ${USE_TENSORRT_RUNTIME}) + set(TENSORRT_ROOT_DIR ${USE_TENSORRT_RUNTIME}) + message(STATUS "Custom TensorRT path: " ${TENSORRT_ROOT_DIR}) + endif() + find_path(TENSORRT_INCLUDE_DIR NvInfer.h HINTS ${TENSORRT_ROOT_DIR} PATH_SUFFIXES include) + find_library(TENSORRT_LIB_DIR nvinfer HINTS ${TENSORRT_ROOT_DIR} PATH_SUFFIXES lib) + find_package_handle_standard_args(TENSORRT DEFAULT_MSG TENSORRT_INCLUDE_DIR TENSORRT_LIB_DIR) + if(NOT TENSORRT_FOUND) + message(ERROR "Could not find TensorRT.") + endif() + message(STATUS "TENSORRT_LIB_DIR: " ${TENSORRT_LIB_DIR}) + include_directories(${TENSORRT_INCLUDE_DIR}) + list(APPEND TVM_RUNTIME_LINKER_LIBS ${TENSORRT_LIB_DIR}) + + # TRT runtime sources + file(GLOB RUNTIME_TENSORRT_SRCS src/runtime/contrib/tensorrt/*.cc) + set_source_files_properties(${RUNTIME_TENSORRT_SRCS} PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations") + list(APPEND RUNTIME_SRCS ${RUNTIME_TENSORRT_SRCS}) + + # Set defines + add_definitions(-DTVM_GRAPH_RUNTIME_TENSORRT) +endif() diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst index b38a7f561ab3..68843ba18248 100644 --- a/docs/deploy/index.rst +++ b/docs/deploy/index.rst @@ -69,3 +69,4 @@ target device without relying on RPC. see the following resources on how to do s integrate hls arm_compute_lib + tensorrt diff --git a/docs/deploy/tensorrt.rst b/docs/deploy/tensorrt.rst new file mode 100644 index 000000000000..27f11e9b5377 --- /dev/null +++ b/docs/deploy/tensorrt.rst @@ -0,0 +1,297 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +Relay TensorRT Integration +========================== +**Author**: `Trevor Morris `_ + +Introduction +------------ + +NVIDIA TensorRT is a library for optimized deep learning inference. This integration will offload as +many operators as possible from Relay to TensorRT, providing a performance boost on NVIDIA GPUs +without the need to tune schedules. + +This guide will demonstrate how to install TensorRT and build TVM with TensorRT BYOC and runtime +enabled. It will also provide example code to compile and run a ResNet-18 model using TensorRT and +how to configure the compilation and runtime settings. 
Finally, we document the supported operators +and how to extend the integration to support other operators. + +Installing TensorRT +------------------- + +In order to download TensorRT, you will need to create an NVIDIA Developer program account. Please +see NVIDIA's documentation for more info: +https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html. If you have a Jetson device +such as a TX1, TX2, Xavier, or Nano, TensorRT will already be installed on the device via the +JetPack SDK. + +There are two methods to install TensorRT: + +* System install via deb or rpm package. +* Tar file installation. + +With the tar file installation method, you must provide the path of the extracted tar archive to +USE_TENSORRT_RUNTIME=/path/to/TensorRT. With the system install method, +USE_TENSORRT_RUNTIME=ON will automatically locate your installation. + +Building TVM with TensorRT support +---------------------------------- + +There are two separate build flags for TensorRT integration in TVM. These flags also enable +cross-compilation: USE_TENSORRT_CODEGEN=ON will allow you to build a module with TensorRT support on +a host machine, while USE_TENSORRT_RUNTIME=ON will enable the TVM runtime on an edge device to +execute the TensorRT module. You should enable both if you want to compile and also execute models +with the same TVM build. + +* USE_TENSORRT_CODEGEN=ON/OFF - This flag will enable compiling a TensorRT module, which does not require any + TensorRT library. +* USE_TENSORRT_RUNTIME=ON/OFF/path-to-TensorRT - This flag will enable the TensorRT runtime module. + This will build TVM against the installed TensorRT library. + +Example settings in the config.cmake file: + +.. code:: cmake + + set(USE_TENSORRT_CODEGEN ON) + set(USE_TENSORRT_RUNTIME /home/ubuntu/TensorRT-7.0.0.11) + + +Build and Deploy ResNet-18 with TensorRT +---------------------------------------- + +Create a Relay graph from an MXNet ResNet-18 model. + +.. code:: python + + import tvm + from tvm import relay + import mxnet + from mxnet.gluon.model_zoo.vision import get_model + + dtype = "float32" + input_shape = (1, 3, 224, 224) + block = get_model('resnet18_v1', pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype) + + +Annotate and partition the graph for TensorRT. All ops which are supported by the TensorRT +integration will be marked and offloaded to TensorRT. The rest of the ops will go through the +regular TVM CUDA compilation and code generation. + +.. code:: python + + from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt + mod, config = partition_for_tensorrt(mod, params) + + +Build the Relay graph, using the new module and config returned by partition_for_tensorrt. The +target must always be a CUDA target. ``partition_for_tensorrt`` will automatically fill out the +required values in the config, so there is no need to modify it - just pass it along to the +PassContext so the values can be read during compilation. + +.. code:: python + + target = "cuda" + with tvm.transform.PassContext(opt_level=3, config={'relay.ext.tensorrt.options': config}): + lib = relay.build(mod, target=target, params=params) + + +Export the module. + +.. code:: python + + lib.export_library('compiled.so') + + +Load the module and run inference on the target machine, which must be built with +``USE_TENSORRT_RUNTIME`` enabled. The first run will take longer because the TensorRT engine will +have to be built. + +..
code:: python + + import numpy as np + + ctx = tvm.gpu(0) + loaded_lib = tvm.runtime.load_module('compiled.so') + gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx)) + input_data = np.random.uniform(0, 1, input_shape).astype(dtype) + gen_module.run(data=input_data) + + +Partitioning and Compilation Settings +------------------------------------- + +There are some options which can be configured in ``partition_for_tensorrt``. + +* ``version`` - TensorRT version to target as tuple of (major, minor, patch). If TVM is compiled + with USE_TENSORRT_RUNTIME=ON, the linked TensorRT version will be used instead. The version + will affect which ops can be partitioned to TensorRT. +* ``use_implicit_batch`` - Use TensorRT implicit batch mode (default true). Setting to false will + enable explicit batch mode which will widen supported operators to include those which modify the + batch dimension, but may reduce performance for some models. +* ``remove_no_mac_subgraphs`` - A heuristic to improve performance. Removes subgraphs which have + been partitioned for TensorRT if they do not have any multiply-accumulate operations. The removed + subgraphs will go through TVM's standard compilation instead. +* ``max_workspace_size`` - How many bytes of workspace size to allow each subgraph to use for + TensorRT engine creation. See TensorRT documentation for more info. Can be overridden at runtime. + + +Runtime Settings +---------------- + +There are some additional options which can be configured at runtime using environment variables; a +short sketch follows this list. + +* Automatic FP16 Conversion - Environment variable ``TVM_TENSORRT_USE_FP16=1`` can be set to + automatically convert the TensorRT components of your model to 16-bit floating point precision. + This can greatly increase performance, but may cause a slight loss in model accuracy. +* Caching TensorRT Engines - During the first inference, the runtime will invoke the TensorRT API + to build an engine. This can be time consuming, so you can set ``TVM_TENSORRT_CACHE_DIR`` to + point to a directory to save these built engines to disk. The next time you load the model + and give it the same directory, the runtime will load the already built engines to avoid the long + warmup time. A unique directory is required for each model. +* Maximum Workspace Size - TensorRT has a parameter to configure the maximum amount of scratch space that each layer in the + model can use. It is generally best to use the highest value which does not cause you to run out + of memory. You can use ``TVM_TENSORRT_MAX_WORKSPACE_SIZE`` to override this by specifying the + workspace size in bytes you would like to use.
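+As a minimal sketch (assuming the ``compiled.so`` module from the ResNet-18 example above, a build
+with ``USE_TENSORRT_RUNTIME`` enabled, and a hypothetical cache directory), these variables can
+also be set from Python before the module is loaded:
+
+.. code:: python
+
+    import os
+
+    # Illustrative values only; any writable, per-model directory works for the cache.
+    os.environ["TVM_TENSORRT_USE_FP16"] = "1"  # convert TensorRT subgraphs to FP16
+    os.environ["TVM_TENSORRT_CACHE_DIR"] = "/tmp/trt_cache_resnet18"  # one directory per model
+    os.environ["TVM_TENSORRT_MAX_WORKSPACE_SIZE"] = str(1 << 30)  # 1 GiB of workspace
+
+    loaded_lib = tvm.runtime.load_module('compiled.so')
+
+Because the engine cache is keyed to the directory, pointing two different models at the same
+``TVM_TENSORRT_CACHE_DIR`` would cause the wrong engines to be loaded.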
+ + +Operator support +---------------- ++------------------------+------------------------------------+ +| Relay Node | Remarks | ++========================+====================================+ +| nn.relu | | ++------------------------+------------------------------------+ +| sigmoid | | ++------------------------+------------------------------------+ +| tanh | | ++------------------------+------------------------------------+ +| nn.batch_norm | | ++------------------------+------------------------------------+ +| nn.softmax | | ++------------------------+------------------------------------+ +| nn.conv2d | | ++------------------------+------------------------------------+ +| nn.dense | | ++------------------------+------------------------------------+ +| nn.bias_add | | ++------------------------+------------------------------------+ +| add | | ++------------------------+------------------------------------+ +| subtract | | ++------------------------+------------------------------------+ +| multiply | | ++------------------------+------------------------------------+ +| divide | | ++------------------------+------------------------------------+ +| power | | ++------------------------+------------------------------------+ +| maximum | | ++------------------------+------------------------------------+ +| minimum | | ++------------------------+------------------------------------+ +| nn.max_pool2d | | ++------------------------+------------------------------------+ +| nn.avg_pool2d | | ++------------------------+------------------------------------+ +| nn.global_max_pool2d | | ++------------------------+------------------------------------+ +| nn.global_avg_pool2d | | ++------------------------+------------------------------------+ +| exp | | ++------------------------+------------------------------------+ +| log | | ++------------------------+------------------------------------+ +| sqrt | | ++------------------------+------------------------------------+ +| abs | | ++------------------------+------------------------------------+ +| negative | | ++------------------------+------------------------------------+ +| nn.batch_flatten | | ++------------------------+------------------------------------+ +| expand_dims | | ++------------------------+------------------------------------+ +| squeeze | | ++------------------------+------------------------------------+ +| concatenate | | ++------------------------+------------------------------------+ +| nn.conv2d_transpose | | ++------------------------+------------------------------------+ +| transpose | | ++------------------------+------------------------------------+ +| layout_transform | | ++------------------------+------------------------------------+ +| reshape | | ++------------------------+------------------------------------+ +| nn.pad | | ++------------------------+------------------------------------+ +| sum | | ++------------------------+------------------------------------+ +| prod | | ++------------------------+------------------------------------+ +| max | | ++------------------------+------------------------------------+ +| min | | ++------------------------+------------------------------------+ +| mean | | ++------------------------+------------------------------------+ +| nn.adaptive_max_pool2d | | ++------------------------+------------------------------------+ +| nn.adaptive_avg_pool2d | | ++------------------------+------------------------------------+ +| clip | Requires TensorRT 5.1.5 or greater | 
++------------------------+------------------------------------+ +| nn.leaky_relu | Requires TensorRT 5.1.5 or greater | ++------------------------+------------------------------------+ +| sin | Requires TensorRT 5.1.5 or greater | ++------------------------+------------------------------------+ +| cos | Requires TensorRT 5.1.5 or greater | ++------------------------+------------------------------------+ +| atan | Requires TensorRT 5.1.5 or greater | ++------------------------+------------------------------------+ +| ceil | Requires TensorRT 5.1.5 or greater | ++------------------------+------------------------------------+ +| floor | Requires TensorRT 5.1.5 or greater | ++------------------------+------------------------------------+ +| strided_slice | Requires TensorRT 5.1.5 or greater | ++------------------------+------------------------------------+ +| nn.conv3d | Requires TensorRT 6.0.1 or greater | ++------------------------+------------------------------------+ +| nn.max_pool3d | Requires TensorRT 6.0.1 or greater | ++------------------------+------------------------------------+ +| nn.avg_pool3d | Requires TensorRT 6.0.1 or greater | ++------------------------+------------------------------------+ +| nn.conv3d_transpose | Requires TensorRT 6.0.1 or greater | ++------------------------+------------------------------------+ + + +Adding a new operator +--------------------- +To add support for a new operator, you will need to make changes to a series of files: + +* `src/runtime/contrib/tensorrt/tensorrt_ops.cc` Create a new op converter class which + implements the ``TensorRTOpConverter`` interface. You must implement the constructor to specify how + many inputs there are and whether they are tensors or weights. You must also implement the + ``Convert`` method to perform the conversion. This is done by using the inputs, attributes, and + network from params to add the new TensorRT layers and push the layer outputs. You can use the + existing converters as an example. Finally, register your new op converter in the + ``GetOpConverters()`` map. +* `python/tvm/relay/op/contrib/tensorrt.py` This file contains the annotation rules for TensorRT. These + determine which operators, and which of their attributes, are supported. You must register an annotation + function for the Relay operator and specify which attributes are supported by your converter, by + checking the attributes and returning true or false. +* `tests/python/contrib/test_tensorrt.py` Add unit tests for the given operator. diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py index dbcd8055d30b..49abf36134b4 100644 --- a/python/tvm/relay/op/contrib/__init__.py +++ b/python/tvm/relay/op/contrib/__init__.py @@ -22,3 +22,4 @@ from .dnnl import * from .coreml import * from .ethosn import * +from .tensorrt import * diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py new file mode 100644 index 000000000000..a0e23a043a72 --- /dev/null +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -0,0 +1,769 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""TensorRT supported operators.""" +import logging +import numpy as np +import tvm +from tvm import relay +from tvm.relay import transform +from tvm.relay.build_module import bind_params_by_name +from tvm.relay.expr import Call, Constant, Tuple, GlobalVar +from tvm.relay.expr_functor import ExprMutator + +logger = logging.getLogger("TensorRT") + + +def is_tensorrt_runtime_enabled(): + """Check if the TensorRT graph runtime is present. + + Returns + ------- + ret: bool + True if present, False if not. + """ + check_enabled = tvm.get_global_func("relay.op.is_tensorrt_runtime_enabled", True) + if check_enabled: + return check_enabled() + return False + + +def get_tensorrt_version(): + """Gets the version of TensorRT that TVM is built against or is targeting. + + Returns + ------- + ret: Tuple[int, int, int] + TensorRT version as a tuple of major, minor, and patch number. The version from the + "relay.ext.tensorrt.options" PassContext config takes precedence if present; otherwise + the version TVM was linked against is returned. + """ + pass_ctx = tvm.transform.PassContext.current() + if "relay.ext.tensorrt.options" in pass_ctx.config: + return tuple(pass_ctx.config["relay.ext.tensorrt.options"].tensorrt_version) + return tuple(tvm.get_global_func("relay.op.get_tensorrt_version")()) + + +def get_tensorrt_use_implicit_batch_mode(): + """Get the use_implicit_batch setting from the PassContext config, defaulting to True.""" + pass_ctx = tvm.transform.PassContext.current() + if "relay.ext.tensorrt.options" in pass_ctx.config: + return pass_ctx.config["relay.ext.tensorrt.options"].use_implicit_batch + logger.warning( + "PassContext has no relay.ext.tensorrt.options config, using default value " + "use_implicit_batch=True." + ) + return True + + +def get_tensorrt_remove_no_mac_subgraphs(): + """Get the remove_no_mac_subgraphs setting from the PassContext config, defaulting to False.""" + pass_ctx = tvm.transform.PassContext.current() + if "relay.ext.tensorrt.options" in pass_ctx.config: + return pass_ctx.config["relay.ext.tensorrt.options"].remove_no_mac_subgraphs + logger.warning( + "PassContext has no relay.ext.tensorrt.options config, using default value " + "remove_no_mac_subgraphs=False." + ) + return False + + +def partition_for_tensorrt( + mod, + params=None, + version=None, + use_implicit_batch=True, + remove_no_mac_subgraphs=False, + max_workspace_size=1 << 30, +): + """Partition the graph greedily offloading supported operators to TensorRT. + + Parameters + ---------- + mod : Module + The module to run passes on. + params : Optional[Dict[str, NDArray]] + Constant input parameters. + version : Optional[Tuple[int, int, int]] + TensorRT version to target as tuple of (major, minor, patch). If TVM is compiled with + USE_TENSORRT_RUNTIME=ON, the linked TensorRT version will be used instead. + use_implicit_batch : Optional[bool] + Use TensorRT implicit batch mode (default true). Setting to false will enable explicit batch + mode which will widen supported operators to include those which modify the batch dimension, + but may reduce performance for some models. + remove_no_mac_subgraphs : Optional[bool] + Removes subgraphs which have been partitioned for TensorRT if they do not have any + multiply-accumulate operations.
The removed subgraphs will go through TVM's standard + compilation instead. Can improve performance. + max_workspace_size : Optional[int] + How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation. + See TensorRT documentation for more info. + Returns + ------- + mod_and_config : Tuple[Module, Dict[str, Any]] + A tuple of 1) annotated and partitioned module and 2) "relay.ext.tensorrt.options" + configuration which should be given to PassContext when building. + """ + config = { + "use_implicit_batch": use_implicit_batch, + "max_workspace_size": max_workspace_size, + "remove_no_mac_subgraphs": remove_no_mac_subgraphs, + } + if version: + assert isinstance(version, tuple) and len(version) == 3 + config["tensorrt_version"] = version + else: + linked_version = tuple(tvm.get_global_func("relay.op.get_tensorrt_version")()) + if not linked_version: + logger.warning( + "TVM was not built against TensorRT and no version was provided to " + "partition_for_tensorrt. Defaulting to 6.0.1" + ) + linked_version = (6, 0, 1) + config["tensorrt_version"] = linked_version + + if params: + mod["main"] = bind_params_by_name(mod["main"], params) + seq = tvm.transform.Sequential( + [ + transform.InferType(), + RemoveDropoutPass(), + transform.RemoveUnusedFunctions(), + transform.ConvertLayout( + {"nn.conv2d": ["NCHW", "default"], "nn.conv3d": ["NCDHW", "default"]} + ), + transform.FoldConstant(), + transform.AnnotateTarget("tensorrt"), + transform.MergeCompilerRegions(), + transform.PartitionGraph(), + transform.InferType(), + ] + ) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + mod = seq(mod) + mod = prune_tensorrt_subgraphs(mod) + return mod, config + + +def _register_external_op_helper_with_checker(op_name, checker): + @tvm.ir.register_op_attr(op_name, "target.tensorrt") + def _func_wrapper(attrs, args): + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + return checker(attrs, args, op_name) + + return _func_wrapper + + +def _register_external_op_helper(op_name, supported=True): + return _register_external_op_helper_with_checker( + op_name, lambda attrs, args, op_name: supported + ) + + +# Ops which are always supported +_register_external_op_helper("nn.relu") +_register_external_op_helper("sigmoid") +_register_external_op_helper("tanh") +_register_external_op_helper("subtract") +_register_external_op_helper("multiply") +_register_external_op_helper("divide") +_register_external_op_helper("power") +_register_external_op_helper("maximum") +_register_external_op_helper("minimum") +_register_external_op_helper("exp") +_register_external_op_helper("log") +_register_external_op_helper("sqrt") +_register_external_op_helper("abs") +_register_external_op_helper("negative") +_register_external_op_helper("nn.batch_flatten") +_register_external_op_helper("clip") + + +@tvm.ir.register_op_attr("add", "target.tensorrt") +def add_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if add is supported by TensorRT.""" + + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if ( + not get_tensorrt_use_implicit_batch_mode() + and (isinstance(args[0], Constant) or isinstance(args[1], Constant)) + and args[0].checked_type.shape[0] == args[1].checked_type.shape[0] + and args[0].checked_type.shape[0] != 1 + and (len(args[0].checked_type.shape) > 3 or 
len(args[1].checked_type.shape) > 3) + ): + logger.info("add: bug in TRT with adding batched constants.") + return False + return True + + +@tvm.ir.register_op_attr("nn.batch_norm", "target.tensorrt") +def batch_norm_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.batch_norm is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if int(attrs.axis) not in (1, 3): + logger.info("nn.batch_norm: axis is %d but must be 1 or 3.", int(attrs.axis)) + return False + return True + + +@tvm.ir.register_op_attr("nn.softmax", "target.tensorrt") +def softmax_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.softmax is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0: + logger.info("nn.softmax: can't modify batch dimension.") + return False + return True + + +@tvm.ir.register_op_attr("nn.conv2d", "target.tensorrt") +def conv2d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.conv2d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if attrs.data_layout != "NCHW": + logger.info("nn.conv2d: data_layout is %s but must be NCHW.", attrs.data_layout) + return False + if attrs.kernel_layout != "OIHW": + logger.info("nn.conv2d: kernel_layout is %s but must be OIHW.", attrs.kernel_layout) + return False + if attrs.out_layout and attrs.out_layout != "NCHW": + logger.info("nn.conv2d: out_layout is %s but must be NCHW.", attrs.out_layout) + return False + return True + + +@tvm.ir.register_op_attr("nn.dense", "target.tensorrt") +def dense_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if dense is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + input_rank = len(args[0].checked_type.shape) + weight_rank = len(args[1].checked_type.shape) + if input_rank not in (2, 3, 4): + logger.info("nn.dense: input has rank %d but must be 2, 3 or 4.", input_rank) + return False + if weight_rank != 2: + logger.info("nn.dense: weight has rank %d but must be 2.", weight_rank) + return False + return True + + +@tvm.ir.register_op_attr("nn.bias_add", "target.tensorrt") +def bias_add_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.bias_add is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + input_rank = len(args[0].checked_type.shape) + if input_rank not in (2, 3, 4): + logger.info("nn.bias_add: input rank is %d but must be 2, 3 or 4.", input_rank) + return False + return True + + +@tvm.ir.register_op_attr("nn.max_pool2d", "target.tensorrt") +def max_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.max_pool2d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if attrs.layout != "NCHW": + logger.info("nn.max_pool2d: layout is %s but must be NCHW.", attrs.layout) + return False + if attrs.ceil_mode 
and get_tensorrt_version() < (5, 1, 5): + logger.info("nn.max_pool2d: ceil_mode=True requires TensorRT 5.1.5 or greater.") + return False + return True + + +@tvm.ir.register_op_attr("nn.avg_pool2d", "target.tensorrt") +def avg_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.avg_pool2d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if attrs.layout != "NCHW": + logger.info("nn.avg_pool2d: layout is %s but must be NCHW.", attrs.layout) + return False + if ( + attrs.count_include_pad + and len(attrs.padding) == 4 + and ( + int(attrs.padding[0]) != int(attrs.padding[2]) + or int(attrs.padding[1]) != int(attrs.padding[3]) + ) + ): + logger.info( + "nn.avg_pool2d: inclusive-counted blended or average " + "pooling is not supported in combination with asymmetric padding" + ) + return False + if attrs.ceil_mode and get_tensorrt_version() < (5, 1, 5): + logger.info("nn.avg_pool2d: ceil_mode=True requires TensorRT 5.1.5 or greater.") + return False + return True + + +@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.tensorrt") +def global_max_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.global_max_pool2d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if attrs.layout != "NCHW": + logger.info("nn.global_max_pool2d: layout is %s but must be NCHW.", attrs.layout) + return False + return True + + +@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.tensorrt") +def global_avg_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.global_avg_pool2d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if attrs.layout != "NCHW": + logger.info("nn.global_avg_pool2d: layout is %s but must be NCHW.", attrs.layout) + return False + return True + + +@tvm.ir.register_op_attr("expand_dims", "target.tensorrt") +def expand_dims_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if expand_dims is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0: + logger.info("expand_dims: can't modify batch dimension.") + return False + return True + + +@tvm.ir.register_op_attr("squeeze", "target.tensorrt") +def squeeze_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if squeeze is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if not attrs.axis: + logger.info("squeeze: must explicitly set axis.") + return False + if get_tensorrt_use_implicit_batch_mode() and any([axis == 0 for axis in map(int, attrs.axis)]): + logger.info("squeeze: can't modify batch dimension.") + return False + return True + + +@tvm.ir.register_op_attr("concatenate", "target.tensorrt") +def concatenate_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if concatenate is supported by TensorRT.""" + if any([x.dtype != "float32" for x in args[0].checked_type.fields]): + logger.info("Only float32 inputs are supported for
TensorRT.") + return False + if not get_tensorrt_use_implicit_batch_mode(): + return True + if int(attrs.axis) == 0: + logger.info("concatenate: can't modify batch dimension.") + return False + if isinstance(args[0], Tuple): + for tuple_input in args[0].fields: + if isinstance(tuple_input, Constant): + logger.info("concatenate: can't concatenate tensors with constants.") + return False + return True + + +@tvm.ir.register_op_attr("nn.conv2d_transpose", "target.tensorrt") +def conv2d_transpose_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.conv2d_transpose is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if attrs.data_layout != "NCHW": + logger.info("nn.conv2d_transpose: data_layout is %s but must be NCHW.", attrs.data_layout) + return False + if attrs.kernel_layout != "OIHW": + logger.info( + "nn.conv2d_transpose: kernel_layout is %s but must be OIHW.", attrs.kernel_layout + ) + return False + if attrs.out_layout and attrs.out_layout != "NCHW": + logger.info("nn.conv2d_transpose: out_layout is %s but must be NCHW.", attrs.out_layout) + return False + if attrs.dilation and any([rate != 1 for rate in map(int, attrs.dilation)]): + logger.info("nn.conv2d_transpose: dilation rate must be 1.") + return False + return True + + +@tvm.ir.register_op_attr("transpose", "target.tensorrt") +def transpose_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if transpose is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if get_tensorrt_use_implicit_batch_mode() and int(attrs.axes[0]) != 0: + logger.info("transpose: can't modify batch dimension.") + return False + return True + + +@tvm.ir.register_op_attr("layout_transform", "target.tensorrt") +def layout_transform_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if layout_transform is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if (attrs.src_layout, attrs.dst_layout) not in [ + ("NCHW", "NHWC"), + ("NHWC", "NCHW"), + ("NDHWC", "NCDHW"), + ("NCDHW", "NDHWC"), + ]: + logger.info( + "layout_transform: %s to %s is not supported.", attrs.src_layout, attrs.dst_layout + ) + return False + return True + + +@tvm.ir.register_op_attr("reshape", "target.tensorrt") +def reshape_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if reshape is supported by TensorRT.""" + if args[0].checked_type.dtype != "float32": + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if any([x < -1 for x in map(int, attrs.newshape)]): + logger.info("reshape: new shape dims must be explicit.") + return False + if get_tensorrt_use_implicit_batch_mode(): + shape = list(map(int, args[0].checked_type.shape)) + new_shape = list(map(int, attrs.newshape)) + if len(new_shape) == 0 or len(shape) == 0: + logger.info("reshape: Can't reshape to or from scalar.") + return False + # TRT cannot modify batch dimension. + original_volume = np.prod(shape) + # First, resolve 0. + for i, value in enumerate(new_shape): + if value == 0: + new_shape[i] = shape[i] + # Resolve -1. 
+ for i, value in enumerate(new_shape): + if value == -1: + new_shape[i] = original_volume // np.prod([x for x in new_shape if x != -1]) + if shape[0] != new_shape[0]: + logger.info("reshape: can't modify batch dimension.") + return False + return True + + +@tvm.ir.register_op_attr("nn.pad", "target.tensorrt") +def pad_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.pad is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if attrs.pad_mode != "constant": + logger.info("nn.pad: pad mode is %s but must be constant.", attrs.pad_mode) + return False + if float(attrs.pad_value) != 0.0: + logger.info("nn.pad: pad value is %f but must be 0.0.", float(attrs.pad_value)) + return False + if any([x != 0 for x in attrs.pad_width[0]]) or any([x != 0 for x in attrs.pad_width[1]]): + logger.info("nn.pad: can't pad batch or channel dimensions.") + return False + if len(attrs.pad_width) == 5 and any([x != 0 for x in attrs.pad_width[2]]): + logger.info("nn.pad: can only pad last two dimensions for 5D inputs.") + return False + return True + + +def reduce_annotate_fn(attrs, args, op_name): + """Helper for reduce operations.""" + if not attrs.axis or len(attrs.axis) == 0: + logger.info("%s: cannot reduce to scalar.", op_name) + return False + if attrs.exclude: + logger.info("%s: exclude not supported.", op_name) + return False + if get_tensorrt_use_implicit_batch_mode() and any([x == 0 for x in map(int, attrs.axis)]): + logger.info("%s: can't modify batch dimension.", op_name) + return False + return True + + +_register_external_op_helper_with_checker("sum", reduce_annotate_fn) +_register_external_op_helper_with_checker("prod", reduce_annotate_fn) +_register_external_op_helper_with_checker("max", reduce_annotate_fn) +_register_external_op_helper_with_checker("min", reduce_annotate_fn) +_register_external_op_helper_with_checker("mean", reduce_annotate_fn) + + +def trt_version_annotate_fn(version): + """Helper for ops which require a minimum TRT version""" + + def _func_wrapper(attrs, args, op_name): + if get_tensorrt_version() < version: + logger.info( + "%s: requires TensorRT version %s or higher.", op_name, ".".join(map(str, version)) + ) + return False + return True + + return _func_wrapper + + +_register_external_op_helper_with_checker("nn.leaky_relu", trt_version_annotate_fn((5, 1, 5))) +_register_external_op_helper_with_checker("sin", trt_version_annotate_fn((5, 1, 5))) +_register_external_op_helper_with_checker("cos", trt_version_annotate_fn((5, 1, 5))) +_register_external_op_helper_with_checker("atan", trt_version_annotate_fn((5, 1, 5))) +_register_external_op_helper_with_checker("ceil", trt_version_annotate_fn((5, 1, 5))) + + +@tvm.ir.register_op_attr("strided_slice", "target.tensorrt") +def strided_slice_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if strided_slice is supported by TensorRT.""" + if args[0].checked_type.dtype != "float32": + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if not trt_version_annotate_fn((5, 1, 5))(attrs, args, "strided_slice"): + return False + if get_tensorrt_use_implicit_batch_mode(): + batch_dim_begin_modified = attrs.begin[0] is not None and int(attrs.begin[0]) != 0 + batch_dim_end_modified = ( + attrs.end[0] is not None + and int(attrs.end[0]) != -1 + and int(attrs.end[0]) != int(args[0].checked_type.shape[0]) + ) + if batch_dim_begin_modified or batch_dim_end_modified: +
logger.info("strided_slice: can't modify batch dimension.") + return False + if any([x is not None and x <= 0 for x in attrs.strides]): + logger.info("strided_slice: stride must be positive") + return False + return True + + +@tvm.ir.register_op_attr("nn.adaptive_max_pool2d", "target.tensorrt") +def adapative_max_pool2d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.adaptive_max_pool2d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]): + logger.info("nn.adaptive_max_pool2d: output size must be (1, 1).") + return False + return True + + +@tvm.ir.register_op_attr("nn.adaptive_avg_pool2d", "target.tensorrt") +def adapative_avg_pool2d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.adaptive_avg_pool2d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]): + logger.info("nn.adaptive_avg_pool2d: output size must be (1, 1).") + return False + return True + + +@tvm.ir.register_op_attr("nn.conv3d", "target.tensorrt") +def conv3d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.conv3d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.conv3d"): + return False + if attrs.data_layout != "NCDHW": + logger.info("nn.conv3d: data_layout is %s but must be NCDHW.", attrs.data_layout) + return False + if attrs.kernel_layout != "OIDHW": + logger.info("nn.conv3d: kernel_layout is %s but must be OIDHW.", attrs.kernel_layout) + return False + if attrs.out_layout and attrs.out_layout != "NCDHW": + logger.info("nn.conv3d: out_layout is %s but must be NCDHW.", attrs.out_layout) + return False + return True + + +@tvm.ir.register_op_attr("nn.max_pool3d", "target.tensorrt") +def max_pool_3d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.max_pool3d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.max_pool3d"): + return False + if attrs.layout != "NCDHW": + logger.info("nn.max_pool3d: layout is %s but must be NCDHW.", attrs.layout) + return False + return True + + +@tvm.ir.register_op_attr("nn.avg_pool3d", "target.tensorrt") +def avg_pool_3d_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.avg_pool3d is supported by TensorRT.""" + if any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.avg_pool3d"): + return False + if attrs.layout != "NCDHW": + logger.info("nn.avg_pool3d: layout is %s but must be NCDHW.", attrs.layout) + return False + return True + + +@tvm.ir.register_op_attr("nn.conv3d_transpose", "target.tensorrt") +def conv3d_transpose_annotate_fn(attrs, args): # pylint: disable=unused-variable + """Check if nn.conv3d_transpose is supported by TensorRT.""" + if 
any([x.checked_type.dtype != "float32" for x in args]): + logger.info("Only float32 inputs are supported for TensorRT.") + return False + if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.conv3d_transpose"): + return False + if attrs.data_layout != "NCDHW": + logger.info("nn.conv3d_transpose: data_layout is %s but must be NCDHW.", attrs.data_layout) + return False + if attrs.kernel_layout != "OIDHW": + logger.info( + "nn.conv3d_transpose: kernel_layout is %s but must be OIDHW.", attrs.kernel_layout + ) + return False + if attrs.out_layout and attrs.out_layout != "NCDHW": + logger.info("nn.conv3d_transpose: out_layout is %s but must be NCDHW.", attrs.out_layout) + return False + if attrs.dilation and any([rate != 1 for rate in map(int, attrs.dilation)]): + logger.info("nn.conv3d_transpose: dilation rate must be 1.") + return False + if attrs.output_padding and any([x != 0 for x in map(int, attrs.output_padding)]): + logger.info("nn.conv3d_transpose: output padding is not supported.") + return False + return True + + +def is_valid_subgraph(params, body): + """Final check on whether the subgraph is valid and should be offloaded to TensorRT.""" + # Remove invalid subgraphs for implicit batch mode. + if get_tensorrt_use_implicit_batch_mode(): + input_batch_sizes = [] + for var in params: + # In implicit batch mode, all inputs must have same batch size + if isinstance(var.checked_type, relay.TupleType): + for tuple_type in var.checked_type.fields: + # Scalar inputs not allowed + if len(tuple_type.shape) == 0: + logger.info("tensorrt: scalar inputs not supported") + return False + input_batch_sizes.append(int(tuple_type.shape[0])) + else: + # Scalar inputs not allowed + if len(var.checked_type.shape) == 0: + logger.info("tensorrt: scalar inputs not supported") + return False + input_batch_sizes.append(int(var.checked_type.shape[0])) + if len(input_batch_sizes) > 1 and len(set(input_batch_sizes)) != 1: + logger.info("tensorrt: inputs have different batch sizes") + return False + # Remove subgraphs with no multiply-accumulates + if get_tensorrt_remove_no_mac_subgraphs() and relay.analysis.get_total_mac_number(body) == 0: + return False + return True + + +def prune_tensorrt_subgraphs(mod): + """ + Removes invalid subgraphs and those with no multiply-accumulates (if remove_no_mac_subgraphs + is set). + """ + + class SubgraphRemover(ExprMutator): + """ + Reverts subgraphs in subgraphs_to_remove back to TVM instead of using an external codegen. + """ + + def __init__(self, subgraphs_to_remove, mod, new_mod): + ExprMutator.__init__(self) + self.subgraphs_to_remove = subgraphs_to_remove + self.mod = mod + self.new_mod = new_mod + + def visit_call(self, call): + if isinstance(call.op, GlobalVar): + name = call.op.name_hint + if name in self.subgraphs_to_remove: + # "Inline" the subgraph back into new main function. + func = self.mod[name] + var_map = {} + for arg, param in zip(call.args, func.params): + var_map[param] = super().visit(arg) + new_body = relay.bind(func.body, var_map) + return new_body + if name != "main": + # Copy the GlobalVar (subgraph function) to the new module and call.
+ args = [] + for arg in call.args: + args.append(super().visit(arg)) + subgraph_gv = relay.GlobalVar(name) + self.new_mod[subgraph_gv] = self.mod[name] + return subgraph_gv(*args) + return super().visit_call(call) + + subgraphs_to_remove = [] + # Remove invalid subgraphs + for subgraph in mod.get_global_vars(): + name = subgraph.name_hint + if not mod[name].attrs or mod[name].attrs["Compiler"] != "tensorrt": + continue + if not is_valid_subgraph(mod[name].params, mod[name].body): + subgraphs_to_remove.append(name) + # Create new pruned module + new_mod = tvm.IRModule() + new_mod["main"] = SubgraphRemover(subgraphs_to_remove, mod, new_mod).visit(mod["main"]) + return new_mod + + +class RemoveDropout(ExprMutator): + """ + Removes all nn.dropout from an expr. + """ + + def visit_tuple_getitem(self, op): + visit = super().visit_tuple_getitem(op) + if ( + isinstance(visit.tuple_value, Call) + and visit.tuple_value.op.name == "nn.dropout" + and visit.index == 0 + ): + return visit.tuple_value.args[0] + return visit + + +@transform.function_pass(opt_level=0) +class RemoveDropoutPass: + """Relay function pass which removes all nn.dropout ops.""" + + def transform_function(self, func, mod, _): + return RemoveDropout().visit(func) diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc new file mode 100644 index 000000000000..f692da3f31ac --- /dev/null +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/tensorrt/codegen.cc + * \brief Implementation of the TensorRT JSON serializer. + */ +#include <tvm/ir/module.h> +#include <tvm/relay/attrs/nn.h> +#include <tvm/relay/attrs/transform.h> + +#include <memory> +#include <string> +#include <vector> + +#include "../../utils.h" +#include "../codegen_json/codegen_json.h" + +#if TVM_GRAPH_RUNTIME_TENSORRT +#include "NvInfer.h" +#endif + +namespace tvm { +namespace relay { +namespace contrib { + +/*! \brief Attributes to store the compiler options for TensorRT.
*/ +struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfigNode> { + Array<Integer> tensorrt_version; + bool use_implicit_batch; + size_t max_workspace_size; + bool remove_no_mac_subgraphs; + + TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") { + TVM_ATTR_FIELD(tensorrt_version) + .describe("TensorRT version as (major, minor, patch).") + .set_default(Array<Integer>({6, 0, 1})); + TVM_ATTR_FIELD(use_implicit_batch).set_default(true); + TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30); + TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false); + } +}; + +class TensorRTCompilerConfig : public Attrs { + public: + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(TensorRTCompilerConfig, Attrs, + TensorRTCompilerConfigNode); +}; + +TVM_REGISTER_NODE_TYPE(TensorRTCompilerConfigNode); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.tensorrt.options", TensorRTCompilerConfig); + +/*! + * \brief Generates a TensorRTModule from a Relay expression by serializing the expression to a + * json representation. TensorRT is not required here because use of TensorRT APIs is deferred until + * runtime. + */ +class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { + using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; + + public: + TensorRTJSONSerializer(const std::string& symbol, const Expr& expr) + : JSONSerializer(symbol, expr) {} + + std::vector<JSONGraphNodeEntry> VisitExpr_(const CallNode* cn) { + std::string name; + if (const auto* op_node = cn->op.as<OpNode>()) { + name = op_node->name; + } else { + return JSONSerializer::VisitExpr_(cn); + } + + std::vector<JSONGraphNodeEntry> inputs; + for (const auto& arg : cn->args) { + auto res = VisitExpr(arg); + inputs.insert(inputs.end(), res.begin(), res.end()); + } + auto node = std::make_shared<JSONGraphNode>(name, /* name_ */ + "kernel", /* op_type_ */ + inputs, 1 /* num_outputs_ */); + if (name == "nn.pad") { + SetPadNodeAttribute(node, cn); + } else if (name == "strided_slice") { + SetStridedSliceNodeAttribute(node, cn); + } else { + SetCallNodeAttribute(node, cn); + } + // These attributes are global to the whole module. + SaveGlobalAttributes(node); + return AddNode(node, GetRef<Expr>(cn)); + } + + void SetPadNodeAttribute(std::shared_ptr<JSONGraphNode> node, const CallNode* cn) { + const auto* pad_attr = cn->attrs.as<PadAttrs>(); + CHECK(pad_attr); + auto p = pad_attr->pad_width; + const int dim_h = (p.size() == 5) ? 3 : 2; + const int dim_w = (p.size() == 5) ?
4 : 3; + std::vector<std::string> padding = {std::to_string(p[dim_h][0].as<IntImmNode>()->value), + std::to_string(p[dim_w][0].as<IntImmNode>()->value), + std::to_string(p[dim_h][1].as<IntImmNode>()->value), + std::to_string(p[dim_w][1].as<IntImmNode>()->value)}; + std::vector<dmlc::any> padding_attr; + padding_attr.emplace_back(padding); + node->SetAttr("padding", padding_attr); + } + + void SetStridedSliceNodeAttribute(std::shared_ptr<JSONGraphNode> node, const CallNode* cn) { + const auto* attrs = cn->attrs.as<StridedSliceAttrs>(); + CHECK(attrs && attrs->begin && attrs->end && attrs->strides) + << "StridedSlice must have static begin, end, and strides."; + const bool default_strides = + !attrs->strides.value().defined() || attrs->strides.value().size() == 0; + auto ishape = backend::GetShape(cn->args[0]->checked_type()); + + auto process_slice_index = [](Integer x, int default_value, int dim_value) { + if (!x.defined()) return default_value; + int value = x.as<IntImmNode>()->value; + if (value < 0) value += dim_value; + return value; + }; + + std::vector<std::string> start, size, strides; + for (size_t i = 0; i < attrs->begin.value().size(); ++i) { + const int begin_value = process_slice_index(attrs->begin.value()[i], 0, ishape[i]); + const int end_value = process_slice_index(attrs->end.value()[i], ishape[i], ishape[i]); + const int stride_value = (default_strides || i >= attrs->strides.value().size() || + !attrs->strides.value()[i].defined()) + ? 1 + : attrs->strides.value()[i].as<IntImmNode>()->value; + CHECK_GT(stride_value, 0); + const int size_value = (end_value - begin_value + stride_value - 1) / stride_value; + CHECK_GE(begin_value, 0); + CHECK_GT(size_value, 0); + start.push_back(std::to_string(begin_value)); + size.push_back(std::to_string(size_value)); + strides.push_back(std::to_string(stride_value)); + } + std::vector<dmlc::any> start_attr, size_attr, strides_attr; + start_attr.emplace_back(start); + size_attr.emplace_back(size); + strides_attr.emplace_back(strides); + node->SetAttr("start", start_attr); + node->SetAttr("size", size_attr); + node->SetAttr("strides", strides_attr); + } + + void SaveGlobalAttributes(std::shared_ptr<JSONGraphNode> node) { + auto ctx = transform::PassContext::Current(); + auto cfg = ctx->GetConfig<TensorRTCompilerConfig>("relay.ext.tensorrt.options"); + if (!cfg.defined()) { + cfg = AttrsWithDefaultValues<TensorRTCompilerConfig>(); + } + CHECK_EQ(cfg.value()->tensorrt_version.size(), 3); + std::vector<std::string> tensorrt_version = {std::to_string(cfg.value()->tensorrt_version[0]), + std::to_string(cfg.value()->tensorrt_version[1]), + std::to_string(cfg.value()->tensorrt_version[2])}; + std::vector<std::string> use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)}; + std::vector<std::string> max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)}; + std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr; + tensorrt_version_attr.emplace_back(tensorrt_version); + use_implicit_batch_attr.emplace_back(use_implicit_batch); + max_workspace_size_attr.emplace_back(max_workspace_size); + node->SetAttr("tensorrt_version", tensorrt_version_attr); + node->SetAttr("use_implicit_batch", use_implicit_batch_attr); + node->SetAttr("max_workspace_size", max_workspace_size_attr); + } +}; + +/*! + * \brief Create a runtime module for TensorRT. + * \param ref The ext_func Relay expression/module to be executed using extern ops. + * \return A runtime module.
+ */ +runtime::Module TensorRTCompiler(const ObjectRef& ref) { + CHECK(ref->IsInstance<FunctionNode>()) << "The input ref is expected to be a Relay function."; + Function func = Downcast<Function>(ref); + std::string func_name = backend::GetExtSymbol(func); + + TensorRTJSONSerializer serializer(func_name, func); + serializer.serialize(); + std::string graph_json = serializer.GetJSON(); + auto param_names = serializer.GetParams(); + const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); + CHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; + runtime::Module lib = (*pf)(func_name, graph_json, param_names); + return lib; +} + +TVM_REGISTER_GLOBAL("relay.ext.tensorrt").set_body_typed(TensorRTCompiler); + +/*! + * \brief Check whether TensorRT graph runtime is enabled. + * \return True if enabled, False if not. + */ +inline constexpr bool IsTensorRTRuntimeEnabled() { +#if TVM_GRAPH_RUNTIME_TENSORRT + return true; +#else + return false; +#endif // TVM_GRAPH_RUNTIME_TENSORRT +} + +/*! + * \brief Get TensorRT version that TVM is built against. + * \return Array of three integers for major, minor, and patch, or empty array if TensorRT graph + * runtime is not enabled. + */ +Array<Integer> GetTensorRTVersion() { +#if TVM_GRAPH_RUNTIME_TENSORRT + return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), Integer(NV_TENSORRT_PATCH)}; +#else + return {}; +#endif // TVM_GRAPH_RUNTIME_TENSORRT +} + +TVM_REGISTER_GLOBAL("relay.op.is_tensorrt_runtime_enabled") + .set_body_typed(IsTensorRTRuntimeEnabled); +TVM_REGISTER_GLOBAL("relay.op.get_tensorrt_version").set_body_typed(GetTensorRTVersion); + +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc new file mode 100644 index 000000000000..bf0dbfe724ed --- /dev/null +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -0,0 +1,222 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file runtime/contrib/tensorrt/tensorrt_builder.cc + * \brief The TensorRTBuilder class can be used to convert a JSONRuntime graph into a TRT engine + * which can be used for inference. + */ + +#include "tensorrt_builder.h" + +#include <tvm/runtime/registry.h> + +#include <memory> +#include <string> + +#include "tensorrt_logger.h" +#include "tensorrt_ops.h" +#include "tensorrt_utils.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, size_t max_workspace_size, + bool use_implicit_batch, bool use_fp16, int batch_size) + : max_workspace_size_(max_workspace_size), + use_implicit_batch_(use_implicit_batch), + use_fp16_(use_fp16), + batch_size_(batch_size) { + // Create TRT builder and network.
+ builder_ = nvinfer1::createInferBuilder(*logger); +#if TRT_VERSION_GE(6, 0, 1) + // Use INetworkV2. + auto flags = + 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + if (use_implicit_batch_) { + flags = 0U; + builder_->setMaxBatchSize(batch_size_); + } + network_ = builder_->createNetworkV2(flags); +#else + // Use INetwork with implicit batch. + builder_->setMaxBatchSize(batch_size_); + builder_->setMaxWorkspaceSize(max_workspace_size_); + builder_->setFp16Mode(use_fp16_); + network_ = builder_->createNetwork(); +#endif +} + +void TensorRTBuilder::AddInput(int nid, const JSONGraphNode& node) { + auto node_name = node.GetOpName(); + auto shapes = node.GetOpShape(); + auto dtypes = node.GetOpDataType(); + CHECK_EQ(shapes.size(), dtypes.size()); + node_output_map_[nid] = {}; + for (size_t i = 0; i < shapes.size(); ++i) { + const std::string name = node_name + "_" + std::to_string(i); + auto shape = shapes[i]; + // Remove batch dim when not in explicit batch mode. + if (use_implicit_batch_ && shape.size() > 1) { + shape.erase(shape.begin()); + } + nvinfer1::Dims dims = VectorToTrtDims(shape); + CHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported."; + auto input_tensor = network_->addInput(name.c_str(), nvinfer1::DataType::kFLOAT, dims); + node_output_map_[nid].push_back(TensorRTOpInput(input_tensor)); + network_input_names_.push_back(input_tensor->getName()); + } +} + +void TensorRTBuilder::AddConstant(int nid, const DLTensor* data) { + nvinfer1::Weights weight = GetDLTensorAsWeights(data, kDLCPU); + std::vector<int64_t> shape(data->shape, data->shape + data->ndim); + // Remove batch dim when not in explicit batch mode. + if (use_implicit_batch_ && shape.size() > 1 && shape[0] == 1) { + shape.erase(shape.begin()); + } + node_output_map_[nid] = {TensorRTOpInput(weight, shape)}; +} + +void TensorRTBuilder::AddOutput(const JSONGraphNodeEntry& node) { + auto it = node_output_map_.find(node.id_); + CHECK(it != node_output_map_.end()) << "Output was not found."; + auto out_tensor = it->second[node.index_].tensor; + std::string name = "tensorrt_output_" + std::to_string(network_output_names_.size()); + out_tensor->setName(name.c_str()); + network_->markOutput(*out_tensor); + network_output_names_.push_back(out_tensor->getName()); +} + +void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) { + TensorRTOpConverterParams params(network_, node, &trt_weights_); + // Look up converter. + auto it = GetOpConverters()->find(params.op_name); + CHECK(it != GetOpConverters()->end()) + << "Unsupported operator conversion to TRT, op name: " << params.op_name; + const auto converter = it->second; + // Get inputs. + for (size_t i = 0; i < node.GetInputs().size(); ++i) { + auto in_node = node.GetInputs()[i]; + auto it = node_output_map_.find(in_node.id_); + CHECK(it != node_output_map_.end()) << "Input was not found."; + auto input = it->second[in_node.index_]; + if (!converter->variable_input_count) { + if (converter->input_types[i] == kTensor && input.type == kWeight) { + input = TensorRTOpInput(GetInputAsTensor(input)); + } else if (converter->input_types[i] == kWeight && input.type == kTensor) { + LOG(FATAL) << "Input " << i << " for " << params.op_name + << " requires weights but got a tensor."; + } + } + params.inputs.push_back(input); + } + CHECK(converter->variable_input_count || converter->input_types.size() == params.inputs.size()) + << "Op expected a different number of inputs."; + + // Convert op to TRT.
+ converter->Convert(&params); + + // Get outputs. + node_output_map_[nid] = {}; + for (auto out : params.outputs) { + node_output_map_[nid].push_back(TensorRTOpInput(out)); + } +} + +TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { + // Process graph to create INetworkDefinition. +// Build engine. +#if TRT_VERSION_GE(6, 0, 1) + config_ = builder_->createBuilderConfig(); + config_->setMaxWorkspaceSize(max_workspace_size_); + if (use_fp16_) { + config_->setFlag(nvinfer1::BuilderFlag::kFP16); + } + // Add profiles. + if (!use_implicit_batch_) { + auto profile = builder_->createOptimizationProfile(); + for (int i = 0; i < network_->getNbInputs(); ++i) { + auto name = network_->getInput(i)->getName(); + auto dims = network_->getInput(i)->getDimensions(); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, dims); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, dims); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, dims); + } + config_->addOptimizationProfile(profile); + } + nvinfer1::ICudaEngine* engine = builder_->buildEngineWithConfig(*network_, *config_); +#else + nvinfer1::ICudaEngine* engine = builder_->buildCudaEngine(*network_); +#endif + CHECK_EQ(engine->getNbBindings(), network_input_names_.size() + network_output_names_.size()); + nvinfer1::IExecutionContext* context = engine->createExecutionContext(); + CleanUp(); + return {engine, context, network_input_names_, network_output_names_}; +} + +nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, + DLDeviceType src_device) { + CHECK_EQ(dptr->ctx.device_type, src_device); + CHECK(static_cast<int>(dptr->dtype.code) == kDLFloat || + static_cast<int>(dptr->dtype.code) == kDLInt); + const auto trt_dtype = static_cast<int>(dptr->dtype.code) == kDLFloat + ? nvinfer1::DataType::kFLOAT + : nvinfer1::DataType::kINT32; + const size_t weight_bytes = GetDataSize(*dptr); + nvinfer1::Weights weight{trt_dtype, nullptr, 0}; + size_t count = 1; + for (tvm_index_t i = 0; i < dptr->ndim; ++i) { + count *= dptr->shape[i]; + } + CHECK_EQ(count * 4, weight_bytes); + weight.count = count; + weight.values = new float[count]; + CHECK_EQ(TVMArrayCopyToBytes(const_cast<DLTensor*>(dptr), const_cast<void*>(weight.values), + weight_bytes), + 0) + << TVMGetLastError(); + trt_weights_.push_back(weight); + return weight; +} + +nvinfer1::ITensor* TensorRTBuilder::GetInputAsTensor(const TensorRTOpInput& input) { + if (input.type == kTensor) return input.tensor; + auto dims = VectorToTrtDims(input.weight_shape); + return network_->addConstant(dims, input.weight)->getOutput(0); +} + +void TensorRTBuilder::CleanUp() { + network_->destroy(); +#if TRT_VERSION_GE(6, 0, 1) + config_->destroy(); +#endif + builder_->destroy(); + for (auto weight : trt_weights_) { + if (weight.type == nvinfer1::DataType::kFLOAT) { + delete[] static_cast<const float*>(weight.values); + } else { + delete[] static_cast<const int32_t*>(weight.values); + } + } +} + +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.h b/src/runtime/contrib/tensorrt/tensorrt_builder.h new file mode 100644 index 000000000000..efb4d8175650 --- /dev/null +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.h @@ -0,0 +1,159 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file runtime/contrib/tensorrt/tensorrt_builder.h
+ * \brief The TensorRTBuilder class can be used to convert a JSONRuntime graph into a TRT engine
+ * which can be used for inference.
+ */
+
+#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_
+#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "../json/json_node.h"
+#include "NvInfer.h"
+#include "tensorrt_logger.h"
+#include "tensorrt_ops.h"
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
+using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry;
+
+/*!
+ * \brief The product of TensorRTBuilder which provides everything needed to
+ * perform inference.
+ */
+struct TensorRTEngineAndContext {
+  nvinfer1::ICudaEngine* engine;
+  nvinfer1::IExecutionContext* context;
+  std::vector<std::string> inputs;
+  std::vector<std::string> outputs;
+};
+
+/*!
+ * \brief Converts a JSONRuntime graph into a TensorRT engine and execution context. Inputs,
+ * constants, layers, and outputs can be added to construct the TensorRT network definition.
+ * BuildEngine() will then use the network definition to build the TensorRT engine and context
+ * which can be used to run inference; this phase can take a long time because TensorRT will
+ * query the performance of all available kernels and fusions to optimize the engine.
+ */
+class TensorRTBuilder {
+ public:
+  /*!
+   * \brief Create TensorRT builder.
+   * \param logger TensorRT logger to use for errors and warnings.
+   * \param max_workspace_size Workspace size parameter for TensorRT engine build phase.
+   * \param use_implicit_batch Whether to use implicit batch mode (default).
+   * \param use_fp16 Whether to automatically convert the model to 16-bit floating point precision.
+   * \param batch_size Batch size to build the engine for. Used only when use_implicit_batch is
+   * true.
+   */
+  TensorRTBuilder(TensorRTLogger* logger, size_t max_workspace_size, bool use_implicit_batch,
+                  bool use_fp16, int batch_size);
+
+  /*!
+   * \brief Add TensorRT input(s) for input node in network definition.
+   * \param nid The input node id.
+   * \param node The input node.
+   */
+  void AddInput(int nid, const JSONGraphNode& node);
+
+  /*!
+   * \brief Add TensorRT weight for input constant in network definition.
+   * \param nid The input node id.
+   * \param data The data tensor on CPU.
+   */
+  void AddConstant(int nid, const DLTensor* data);
+
+  /*!
+   * \brief Add TensorRT layer for op node in network definition.
+   * \param nid The input node id.
+   * \param node The op node.
+   */
+  void AddLayer(int nid, const JSONGraphNode& node);
+
+  /*!
+   * \brief Mark TensorRT output in network definition.
+   * \param entry The output node entry.
+   */
+  void AddOutput(const JSONGraphNodeEntry& entry);
+
+  /*!
+   * \brief Takes network definition and "compiles" a TensorRT engine which can be used for
+   * inference. This step can be time consuming.
+   * \return TRT engine, context, and input/output information.
+ */ + TensorRTEngineAndContext BuildEngine(); + + private: + /*! \brief Convert a DLTensor to a TensorRT weight. */ + nvinfer1::Weights GetDLTensorAsWeights(const DLTensor* dptr, DLDeviceType src_device); + + /*! \brief Convert an input to a Tensor if it is a Weight */ + nvinfer1::ITensor* GetInputAsTensor(const TensorRTOpInput& input); + + /*! \brief Clean up resources used to create engine. */ + void CleanUp(); + + /*! \brief Maps a node to its outputs. */ + std::unordered_map> node_output_map_; + + /*! \brief TensorRT builder. */ + nvinfer1::IBuilder* builder_; + +#if TRT_VERSION_GE(6, 0, 1) + /*! \brief TensorRT builder config. */ + nvinfer1::IBuilderConfig* config_; +#endif + + /*! \brief TensorRT network definition. */ + nvinfer1::INetworkDefinition* network_; + + /*! \brief List of all weights held in memory. */ + std::vector trt_weights_; + + /*! \brief Max workspace size in bytes for TRT. */ + size_t max_workspace_size_; + + /*! \brief Whether to use implicit batch mode. */ + bool use_implicit_batch_; + + /*! \brief Whether to automatically convert model to 16-bit floating point precision. */ + bool use_fp16_; + + /*! \brief Batch size to optimize for. */ + int batch_size_; + + /*! \brief Input names. */ + std::vector network_input_names_; + + /*! \brief Output names. */ + std::vector network_output_names_; +}; + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ diff --git a/src/runtime/contrib/tensorrt/tensorrt_logger.h b/src/runtime/contrib/tensorrt/tensorrt_logger.h new file mode 100644 index 000000000000..53b6dfeea763 --- /dev/null +++ b/src/runtime/contrib/tensorrt/tensorrt_logger.h @@ -0,0 +1,78 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file runtime/contrib/tensorrt/tensorrt_logger.h + * \brief Contains TensorRTLogger class which is required by TRT and used to + * print info, warnings, and errors. + */ + +#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ +#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ + +#include + +#include "NvInfer.h" +#include "tensorrt_utils.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +/*! \brief Logger for TensorRT info/warning/errors. 
*/ +class TensorRTLogger : public nvinfer1::ILogger { + public: + TensorRTLogger() : TensorRTLogger(Severity::kWARNING) {} + explicit TensorRTLogger(Severity severity) : reportable_severity(severity) {} + void log(Severity severity, const char* msg) override { + // suppress messages with severity enum value greater than the reportable + if (severity > reportable_severity) return; + + switch (severity) { + case Severity::kINTERNAL_ERROR: + LOG(ERROR) << "INTERNAL_ERROR: " << msg; + break; + case Severity::kERROR: + LOG(ERROR) << "ERROR: " << msg; + break; + case Severity::kWARNING: + LOG(WARNING) << "WARNING: " << msg; + break; + case Severity::kINFO: + LOG(INFO) << "INFO: " << msg; + break; +#if TRT_VERSION_GE(5, 1, 5) + case Severity::kVERBOSE: + DLOG(INFO) << "VERBOSE: " << msg; + break; +#endif + default: + LOG(INFO) << "UNKNOWN: " << msg; + break; + } + } + + private: + Severity reportable_severity{Severity::kWARNING}; +}; + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc new file mode 100644 index 000000000000..a1da6c39f68e --- /dev/null +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc @@ -0,0 +1,1070 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file runtime/contrib/tensorrt/tensorrt_ops.cc + * \brief Converters from Relay ops into TensorRT layers. Converters should + * inherit from TensorRTOpConverter and implement the Convert() method. + */ + +#include "tensorrt_ops.h" + +#include +#include +#include +#include +#include +#include + +#include "../json/json_node.h" +#include "NvInfer.h" +#include "tensorrt_utils.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +TensorRTOpConverter::TensorRTOpConverter(const std::vector& input_types, + bool variable_input_count) + : input_types(input_types), variable_input_count(variable_input_count) {} + +nvinfer1::ITensor* TensorRTOpConverter::Reshape(TensorRTOpConverterParams* params, + nvinfer1::ITensor* input, + const std::vector& new_shape) const { + auto layer = params->network->addShuffle(*input); + CHECK(layer != nullptr); + layer->setReshapeDimensions(VectorToTrtDims(new_shape)); + return layer->getOutput(0); +} + +nvinfer1::ITensor* TensorRTOpConverter::Transpose(TensorRTOpConverterParams* params, + nvinfer1::ITensor* input, + const std::vector& order) const { + auto layer = params->network->addShuffle(*input); + CHECK(layer != nullptr); + nvinfer1::Permutation perm; + if (TRT_HAS_IMPLICIT_BATCH(params)) { + // Batch dimension cannot be modified. 
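+    // Illustrative example (assuming implicit batch mode): a Relay transpose
+    // order of {0, 3, 1, 2} on an NHWC input maps to the TRT permutation
+    // {2, 0, 1} over the visible HWC axes, since TRT never sees the batch dim.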
+    CHECK_EQ(input->getDimensions().nbDims, order.size() - 1);
+    CHECK_EQ(order[0], 0);
+    for (size_t i = 0; i + 1 < order.size(); ++i) {
+      perm.order[i] = order[i + 1] - 1;
+    }
+  } else {
+    CHECK_EQ(input->getDimensions().nbDims, order.size());
+    for (size_t i = 0; i < order.size(); ++i) {
+      perm.order[i] = order[i];
+    }
+  }
+  layer->setFirstTranspose(perm);
+  return layer->getOutput(0);
+}
+
+int TensorRTOpConverter::ConvertAxis(TensorRTOpConverterParams* params, int axis,
+                                     int input_rank) const {
+  // Add 1 for missing batch dim.
+  if (TRT_HAS_IMPLICIT_BATCH(params)) {
+    input_rank += 1;
+  }
+  CHECK(axis >= -input_rank && axis < input_rank);
+  if (axis < 0) axis += input_rank;
+  if (TRT_HAS_IMPLICIT_BATCH(params)) {
+    // Can't modify batch dimension.
+    CHECK_NE(axis, 0);
+    // Subtract 1 for implicit batch dim.
+    axis -= 1;
+  }
+  return axis;
+}
+
+nvinfer1::ITensor* TensorRTOpConverter::CreateScalar(
+    TensorRTOpConverterParams* params, float value, const nvinfer1::Dims& broadcast_to_dims) const {
+  nvinfer1::Dims dims;
+  dims.nbDims = broadcast_to_dims.nbDims;
+  std::fill_n(dims.d, dims.nbDims, 1);
+  float* values = new float[1];
+  values[0] = value;
+  nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, static_cast<void*>(values), 1};
+  params->trt_weights->push_back(weights);
+  return params->network->addConstant(dims, weights)->getOutput(0);
+}
+
+void TensorRTOpConverter::GetPadding(const std::vector<std::string>& padding,
+                                     bool* use_asymmetric_padding, nvinfer1::DimsHW* prepadding,
+                                     nvinfer1::DimsHW* postpadding) const {
+  CHECK(padding.size() == 1 || padding.size() == 2 || padding.size() == 4);
+  if (padding.size() == 4) {
+    // four int : padding width in the order of (top, left, bottom, right).
+    *prepadding = nvinfer1::DimsHW(std::stoi(padding[0]), std::stoi(padding[1]));
+    *postpadding = nvinfer1::DimsHW(std::stoi(padding[2]), std::stoi(padding[3]));
+    *use_asymmetric_padding = true;
+  } else if (padding.size() == 2) {
+    // two int : bottom, right will use same padding as top, left
+    *prepadding = nvinfer1::DimsHW(std::stoi(padding[0]), std::stoi(padding[1]));
+    *postpadding = *prepadding;
+    *use_asymmetric_padding = false;
+  } else {
+    // one int : same padding used on all sides
+    *prepadding = nvinfer1::DimsHW(std::stoi(padding[0]), std::stoi(padding[0]));
+    *postpadding = *prepadding;
+    *use_asymmetric_padding = false;
+  }
+}
+
+void TensorRTOpConverter::GetPadding3D(const std::vector<std::string>& padding,
+                                       bool* use_asymmetric_padding, nvinfer1::Dims* prepadding,
+                                       nvinfer1::Dims* postpadding) const {
+  CHECK(padding.size() == 1 || padding.size() == 3 || padding.size() == 6);
+  if (padding.size() == 6) {
+    // six int : padding width in the order of (front, top, left, back, bottom, right)
+    *prepadding =
+        nvinfer1::Dims3(std::stoi(padding[0]), std::stoi(padding[1]), std::stoi(padding[2]));
+    *postpadding =
+        nvinfer1::Dims3(std::stoi(padding[3]), std::stoi(padding[4]), std::stoi(padding[5]));
+    *use_asymmetric_padding = true;
+  } else if (padding.size() == 3) {
+    // three int : back, bottom, right will use same padding as front, top, left
+    *prepadding =
+        nvinfer1::Dims3(std::stoi(padding[0]), std::stoi(padding[1]), std::stoi(padding[2]));
+    *postpadding = *prepadding;
+    *use_asymmetric_padding = false;
+  } else {
+    // one int : same padding used on all sides
+    *prepadding =
+        nvinfer1::Dims3(std::stoi(padding[0]), std::stoi(padding[0]), std::stoi(padding[0]));
+    *postpadding = *prepadding;
+    *use_asymmetric_padding = false;
+  }
+}
+
+class ActivationOpConverter : public TensorRTOpConverter {
+ public:
ActivationOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + static const std::unordered_map op_map = { + {"nn.relu", nvinfer1::ActivationType::kRELU}, + {"sigmoid", nvinfer1::ActivationType::kSIGMOID}, + {"tanh", nvinfer1::ActivationType::kTANH}, +#if TRT_VERSION_GE(5, 1, 5) + {"clip", nvinfer1::ActivationType::kCLIP}, + {"nn.leaky_relu", nvinfer1::ActivationType::kLEAKY_RELU}, +#endif + }; + auto it = op_map.find(params->op_name); + CHECK(it != op_map.end()) << "Unsupported activation type " << params->op_name; + nvinfer1::IActivationLayer* act_layer = + params->network->addActivation(*params->inputs.at(0).tensor, it->second); +#if TRT_VERSION_GE(5, 1, 5) + if (params->op_name == "clip") { + float a_min = std::stof(params->node.GetAttr>("a_min")[0]); + float a_max = std::stof(params->node.GetAttr>("a_max")[0]); + act_layer->setAlpha(a_min); + act_layer->setBeta(a_max); + } else if (params->op_name == "nn.leaky_relu") { + float alpha = std::stof(params->node.GetAttr>("alpha")[0]); + act_layer->setAlpha(alpha); + } +#endif + CHECK(act_layer != nullptr); + params->outputs.push_back(act_layer->getOutput(0)); + } +}; + +class ElementWiseBinaryOpConverter : public TensorRTOpConverter { + public: + ElementWiseBinaryOpConverter() : TensorRTOpConverter({kTensor, kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + static const std::unordered_map op_map = { + {"add", nvinfer1::ElementWiseOperation::kSUM}, + {"subtract", nvinfer1::ElementWiseOperation::kSUB}, + {"multiply", nvinfer1::ElementWiseOperation::kPROD}, + {"divide", nvinfer1::ElementWiseOperation::kDIV}, + {"power", nvinfer1::ElementWiseOperation::kPOW}, + {"maximum", nvinfer1::ElementWiseOperation::kMAX}, + {"minimum", nvinfer1::ElementWiseOperation::kMIN}}; + auto it = op_map.find(params->op_name); + CHECK(it != op_map.end()) << "Unsupported elementwise type " << params->op_name; + // Broadcast + auto input0 = params->inputs.at(0).tensor; + auto input0_dims = TrtDimsToVector(input0->getDimensions()); + auto input1 = params->inputs.at(1).tensor; + auto input1_dims = TrtDimsToVector(input1->getDimensions()); + const bool need_broadcast = input0_dims.size() != input1_dims.size(); + if (need_broadcast) { + if (input0_dims.size() < input1_dims.size()) { + std::vector new_shape(input0_dims); + while (new_shape.size() < input1_dims.size()) new_shape.insert(new_shape.begin(), 1); + input0 = Reshape(params, input0, new_shape); + } else if (input1_dims.size() < input0_dims.size()) { + std::vector new_shape(input1_dims); + while (new_shape.size() < input0_dims.size()) new_shape.insert(new_shape.begin(), 1); + input1 = Reshape(params, input1, new_shape); + } + } + + nvinfer1::IElementWiseLayer* elemwise_layer = + params->network->addElementWise(*input0, *input1, it->second); + CHECK(elemwise_layer != nullptr); + params->outputs.push_back(elemwise_layer->getOutput(0)); + } +}; + +class Conv2DOpConverter : public TensorRTOpConverter { + public: + Conv2DOpConverter() : TensorRTOpConverter({kTensor, kWeight}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); + auto weight_shape = params->inputs.at(1).weight_shape; + CHECK_EQ(params->node.GetAttr>("data_layout")[0], "NCHW"); + CHECK(params->node.GetAttr>("out_layout")[0] == "" || + params->node.GetAttr>("out_layout")[0] == "NCHW"); + CHECK_EQ(params->node.GetAttr>("kernel_layout")[0], 
"OIHW"); + auto str_strides = params->node.GetAttr>("strides"); + auto str_dilation = params->node.GetAttr>("dilation"); + auto str_padding = params->node.GetAttr>("padding"); + int groups = std::stoi(params->node.GetAttr>("groups")[0]); + int channels = std::stoi(params->node.GetAttr>("channels")[0]); + // TRT conv2d op doesn't support asymmetric padding before 5.1, so we + // workaround by adding a padding layer before the pooling op. + nvinfer1::DimsHW prepadding, postpadding; + bool use_asymmetric_padding; + GetPadding(str_padding, &use_asymmetric_padding, &prepadding, &postpadding); +#if !TRT_VERSION_GE(5, 1, 5) + if (use_asymmetric_padding) { + auto pad_layer = params->network->addPadding(*input_tensor, prepadding, postpadding); + CHECK(pad_layer != nullptr); + input_tensor = pad_layer->getOutput(0); + // No need for conv op to do any padding. + use_asymmetric_padding = false; + prepadding = nvinfer1::DimsHW(0, 0); + } +#endif + + const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]); + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size, + params->inputs.at(1).weight, bias); + CHECK(conv_layer != nullptr); + if (use_asymmetric_padding) { +#if TRT_VERSION_GE(5, 1, 5) + conv_layer->setPrePadding(prepadding); + conv_layer->setPostPadding(postpadding); +#endif + } else { + conv_layer->setPadding(prepadding); + } + CHECK_EQ(str_strides.size(), 2); + const auto strides = nvinfer1::DimsHW(std::stoi(str_strides[0]), std::stoi(str_strides[1])); + conv_layer->setStride(strides); + CHECK_EQ(str_dilation.size(), 2); + const auto dilation = nvinfer1::DimsHW(std::stoi(str_dilation[0]), std::stoi(str_dilation[1])); + conv_layer->setDilation(dilation); + conv_layer->setNbGroups(groups); + params->outputs.push_back(conv_layer->getOutput(0)); + } +}; + +#if TRT_VERSION_GE(6, 0, 1) +class Conv3DOpConverter : public TensorRTOpConverter { + public: + Conv3DOpConverter() : TensorRTOpConverter({kTensor, kWeight}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); + auto weight_shape = params->inputs.at(1).weight_shape; + CHECK_EQ(params->node.GetAttr>("data_layout")[0], "NCDHW"); + CHECK(params->node.GetAttr>("out_layout")[0] == "" || + params->node.GetAttr>("out_layout")[0] == "NCDHW"); + CHECK_EQ(params->node.GetAttr>("kernel_layout")[0], "OIDHW"); + auto str_strides = params->node.GetAttr>("strides"); + auto str_dilation = params->node.GetAttr>("dilation"); + auto str_padding = params->node.GetAttr>("padding"); + int groups = std::stoi(params->node.GetAttr>("groups")[0]); + + nvinfer1::Dims prepadding, postpadding; + bool use_asymmetric_padding; + GetPadding3D(str_padding, &use_asymmetric_padding, &prepadding, &postpadding); + + // Could use attrs->channels.as()->value + const int num_outputs = weight_shape[0]; + const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size, + params->inputs.at(1).weight, bias); + CHECK(conv_layer != nullptr); + if (use_asymmetric_padding) { + conv_layer->setPrePadding(prepadding); + conv_layer->setPostPadding(postpadding); + } else { + conv_layer->setPaddingNd(prepadding); + } + CHECK_EQ(str_strides.size(), 3); + const auto strides 
= nvinfer1::Dims3(std::stoi(str_strides[0]), std::stoi(str_strides[1]), + std::stoi(str_strides[2])); + conv_layer->setStrideNd(strides); + CHECK_EQ(str_dilation.size(), 3); + const auto dilation = nvinfer1::Dims3(std::stoi(str_dilation[0]), std::stoi(str_dilation[1]), + std::stoi(str_dilation[2])); + conv_layer->setDilationNd(dilation); + conv_layer->setNbGroups(groups); + params->outputs.push_back(conv_layer->getOutput(0)); + } +}; +#endif // TRT_VERSION_GE(6, 0, 1) + +class DenseOpConverter : public TensorRTOpConverter { + public: + DenseOpConverter() : TensorRTOpConverter({kTensor, kWeight}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); + CHECK(input_dims.size() > 0 && input_dims.size() <= 3); + const size_t required_rank = TRT_HAS_IMPLICIT_BATCH(params) ? 3 : 4; + const bool need_reshape_on_input = input_dims.size() != required_rank; + if (need_reshape_on_input) { + // Add dims of size 1 until rank is required_rank. + std::vector new_shape(input_dims); + while (new_shape.size() < required_rank) new_shape.insert(new_shape.end(), 1); + input_tensor = Reshape(params, input_tensor, new_shape); + } + // Weights are in KC format. + CHECK_EQ(params->inputs.at(1).weight_shape.size(), 2); + const int num_units = params->inputs.at(1).weight_shape[0]; + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected( + *input_tensor, num_units, params->inputs.at(1).weight, bias); + CHECK(fc_layer != nullptr); + auto output_tensor = fc_layer->getOutput(0); + if (need_reshape_on_input) { + // Remove added dims. + input_dims[input_dims.size() - 1] = num_units; + output_tensor = Reshape(params, output_tensor, input_dims); + } + params->outputs.push_back(output_tensor); + } +}; + +class BatchNormOpConverter : public TensorRTOpConverter { + public: + BatchNormOpConverter() : TensorRTOpConverter({kTensor, kWeight, kWeight, kWeight, kWeight}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input = params->inputs.at(0).tensor; + auto gamma = params->inputs.at(1).weight; + auto beta = params->inputs.at(2).weight; + auto mean = params->inputs.at(3).weight; + auto var = params->inputs.at(4).weight; + CHECK_EQ(gamma.count, beta.count); + CHECK_EQ(gamma.count, mean.count); + CHECK_EQ(gamma.count, var.count); + const float epsilon = std::stof(params->node.GetAttr>("epsilon")[0]); + const int axis = std::stoi(params->node.GetAttr>("axis")[0]); + const bool scale = std::stoi(params->node.GetAttr>("scale")[0]); + const bool center = std::stoi(params->node.GetAttr>("center")[0]); + CHECK(axis == 1 || axis == 3); + const bool need_transpose = axis == 3; + + void* weight_scale_ptr = new float[gamma.count]; + nvinfer1::Weights weight_scale{nvinfer1::DataType::kFLOAT, weight_scale_ptr, gamma.count}; + params->trt_weights->push_back(weight_scale); + void* weight_shift_ptr = new float[gamma.count]; + nvinfer1::Weights weight_shift{nvinfer1::DataType::kFLOAT, weight_shift_ptr, gamma.count}; + params->trt_weights->push_back(weight_shift); + nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + + // fill in the content of weights for the Scale layer + const float* gamma_ptr = reinterpret_cast(gamma.values); + const float* beta_ptr = reinterpret_cast(beta.values); + const float* mean_ptr = reinterpret_cast(mean.values); + const float* var_ptr = reinterpret_cast(var.values); + 
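+    // The normalization is folded into a single IScaleLayer. Per channel:
+    //   y = gamma * (x - mean) / sqrt(var + eps) + beta = scale * x + shift
+    // with scale = gamma / sqrt(var + eps) and shift = beta - mean * scale.
+    // Worked example: gamma=2, beta=1, mean=0.5, var=0.24, eps=0.01 gives
+    // scale = 2 / sqrt(0.25) = 4 and shift = 1 - 0.5 * 4 = -1.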
float* scale_ptr = reinterpret_cast(weight_scale_ptr); + float* shift_ptr = reinterpret_cast(weight_shift_ptr); + for (int i = 0; i < gamma.count; ++i) { + scale_ptr[i] = 1.0 / std::sqrt(var_ptr[i] + epsilon); + if (scale) { + scale_ptr[i] *= gamma_ptr[i]; + } + shift_ptr[i] = -mean_ptr[i] * scale_ptr[i]; + if (center) { + shift_ptr[i] += beta_ptr[i]; + } + } + if (need_transpose) { + input = Transpose(params, input, {0, 3, 1, 2}); + } + nvinfer1::IScaleLayer* scale_layer = params->network->addScale( + *input, nvinfer1::ScaleMode::kCHANNEL, weight_shift, weight_scale, power); + CHECK(scale_layer != nullptr); + auto output = scale_layer->getOutput(0); + if (need_transpose) { + output = Transpose(params, output, {0, 2, 3, 1}); + } + params->outputs.push_back(output); + } +}; + +class BatchFlattenOpConverter : public TensorRTOpConverter { + public: + BatchFlattenOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + std::vector new_shape{-1}; + if (!TRT_HAS_IMPLICIT_BATCH(params)) { + new_shape.insert(new_shape.begin(), params->inputs.at(0).tensor->getDimensions().d[0]); + } + params->outputs.push_back(Reshape(params, params->inputs.at(0).tensor, new_shape)); + } +}; + +class SoftmaxOpConverter : public TensorRTOpConverter { + public: + SoftmaxOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input = params->inputs.at(0).tensor; + const int input_rank = input->getDimensions().nbDims; + const int original_axis = std::stoi(params->node.GetAttr>("axis")[0]); + const int axis = ConvertAxis(params, original_axis, input_rank); + nvinfer1::ISoftMaxLayer* softmax_layer = params->network->addSoftMax(*input); + softmax_layer->setAxes(1 << axis); + CHECK(softmax_layer != nullptr); + params->outputs.push_back(softmax_layer->getOutput(0)); + } +}; + +class PoolingOpConverter : public TensorRTOpConverter { + public: + PoolingOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input = params->inputs.at(0).tensor; + static const std::unordered_map op_map = { + {"nn.max_pool2d", nvinfer1::PoolingType::kMAX}, + {"nn.avg_pool2d", nvinfer1::PoolingType::kAVERAGE}}; + auto it = op_map.find(params->op_name); + CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; + CHECK_EQ(params->node.GetAttr>("layout")[0], "NCHW"); + auto str_pool_size = params->node.GetAttr>("pool_size"); + auto str_padding = params->node.GetAttr>("padding"); + auto str_strides = params->node.GetAttr>("strides"); + nvinfer1::DimsHW prepadding, postpadding; + bool use_asymmetric_padding; + GetPadding(str_padding, &use_asymmetric_padding, &prepadding, &postpadding); + bool ceil_mode = std::stoi(params->node.GetAttr>("ceil_mode")[0]); + +// TRT pooling op doesn't support asymmetric padding before 5.1, so we +// workaround by adding a padding layer before the pooling op. +#if !TRT_VERSION_GE(5, 1, 5) + if (use_asymmetric_padding) { + auto pad_layer = params->network->addPadding(*input, prepadding, postpadding); + CHECK(pad_layer != nullptr); + input = pad_layer->getOutput(0); + // No need for pooling op to do any padding. 
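+    // Illustrative example: Relay padding (top, left, bottom, right) =
+    // (0, 0, 1, 1) is asymmetric, so on TRT older than 5.1.5 it becomes an
+    // explicit IPaddingLayer with prepadding (0, 0) and postpadding (1, 1),
+    // and the pooling layer itself then runs with zero padding.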
+ use_asymmetric_padding = false; + prepadding = nvinfer1::DimsHW(0, 0); + } +#endif + + nvinfer1::DimsHW window_size = + nvinfer1::DimsHW(std::stoi(str_pool_size[0]), std::stoi(str_pool_size[1])); + auto pool_layer = params->network->addPooling(*input, it->second, window_size); + CHECK(pool_layer != nullptr); + nvinfer1::DimsHW strides = + nvinfer1::DimsHW(std::stoi(str_strides[0]), std::stoi(str_strides[1])); + pool_layer->setStride(strides); + if (use_asymmetric_padding) { +#if TRT_VERSION_GE(5, 1, 5) + pool_layer->setPrePadding(prepadding); + pool_layer->setPostPadding(postpadding); +#endif + } else { + pool_layer->setPadding(prepadding); + } + if (params->op_name == "nn.avg_pool2d") { + bool count_include_pad = + std::stoi(params->node.GetAttr>("count_include_pad")[0]); + // count_include_pad=True is useless if there is no padding. TRT doesn't + // like count_include_pad in combination with strides even when there is + // no padding or assymetric padding even, so turn off inclusive to avoid + // error message. Note: Padding will always be symmetric with + // count_include_pad since partitioner will prevent unsupported case. + if (prepadding.h() == 0 && prepadding.w() == 0) { + count_include_pad = false; + } + pool_layer->setAverageCountExcludesPadding(!count_include_pad); + } +#if TRT_VERSION_GE(5, 1, 5) + if (ceil_mode) { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP); + } +#else + CHECK(!ceil_mode); +#endif + params->outputs.push_back(pool_layer->getOutput(0)); + } +}; + +#if TRT_VERSION_GE(6, 0, 1) +class Pooling3DOpConverter : public TensorRTOpConverter { + public: + Pooling3DOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input = params->inputs.at(0).tensor; + static const std::unordered_map op_map = { + {"nn.max_pool3d", nvinfer1::PoolingType::kMAX}, + {"nn.avg_pool3d", nvinfer1::PoolingType::kAVERAGE}}; + auto it = op_map.find(params->op_name); + CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; + CHECK_EQ(params->node.GetAttr>("layout")[0], "NCDHW"); + auto str_pool_size = params->node.GetAttr>("pool_size"); + auto str_padding = params->node.GetAttr>("padding"); + auto str_strides = params->node.GetAttr>("strides"); + nvinfer1::DimsHW prepadding, postpadding; + bool use_asymmetric_padding; + GetPadding3D(str_padding, &use_asymmetric_padding, &prepadding, &postpadding); + bool ceil_mode = std::stoi(params->node.GetAttr>("ceil_mode")[0]); + nvinfer1::Dims window_size = nvinfer1::Dims3( + std::stoi(str_pool_size[0]), std::stoi(str_pool_size[1]), std::stoi(str_pool_size[2])); + auto pool_layer = params->network->addPoolingNd(*input, it->second, window_size); + CHECK(pool_layer != nullptr); + nvinfer1::Dims strides = nvinfer1::Dims3(std::stoi(str_strides[0]), std::stoi(str_strides[1]), + std::stoi(str_strides[2])); + pool_layer->setStrideNd(strides); + if (use_asymmetric_padding) { + pool_layer->setPrePadding(prepadding); + pool_layer->setPostPadding(postpadding); + } else { + pool_layer->setPaddingNd(prepadding); + } + if (params->op_name == "nn.avg_pool3d") { + bool count_include_pad = + std::stoi(params->node.GetAttr>("count_include_pad")[0]); + pool_layer->setAverageCountExcludesPadding(!count_include_pad); + } + if (ceil_mode) { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP); + } + params->outputs.push_back(pool_layer->getOutput(0)); + } +}; +#endif // TRT_VERSION_GE(6, 0, 1) + +class GlobalPoolingOpConverter : 
public TensorRTOpConverter { + public: + GlobalPoolingOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); + static const std::unordered_map op_map = { + {"nn.global_max_pool2d", nvinfer1::PoolingType::kMAX}, + {"nn.global_avg_pool2d", nvinfer1::PoolingType::kAVERAGE}}; + auto it = op_map.find(params->op_name); + CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; + CHECK_EQ(params->node.GetAttr>("layout")[0], "NCHW"); + const int h = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[1] : input_dims[2]; + const int w = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[2] : input_dims[3]; + auto pool_layer = + params->network->addPooling(*input_tensor, it->second, nvinfer1::DimsHW(h, w)); + CHECK(pool_layer != nullptr); + params->outputs.push_back(pool_layer->getOutput(0)); + } +}; + +class ExpandDimsOpConverter : public TensorRTOpConverter { + public: + ExpandDimsOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); + const int original_axis = std::stoi(params->node.GetAttr>("axis")[0]); + const int num_newaxis = + std::stoi(params->node.GetAttr>("num_newaxis")[0]); + const int axis = ConvertAxis(params, original_axis, input_dims.size() + 1); + for (int i = 0; i < num_newaxis; ++i) { + input_dims.insert(input_dims.begin() + axis, 1); + } + params->outputs.push_back(Reshape(params, params->inputs.at(0).tensor, input_dims)); + } +}; + +class SqueezeOpConverter : public TensorRTOpConverter { + public: + SqueezeOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); + auto str_axis = params->node.GetAttr>("axis"); + for (size_t i = 0; i < str_axis.size(); ++i) { + const int axis = ConvertAxis(params, std::stoi(str_axis[i]), input_dims.size()); + input_dims[axis] = 0; + } + input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0), input_dims.end()); + params->outputs.push_back(Reshape(params, params->inputs.at(0).tensor, input_dims)); + } +}; + +class UnaryOpConverter : public TensorRTOpConverter { + public: + UnaryOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + // The following ops are supported by TRT but don't exist in relay yet: + // recip, tan, sinh, cosh, asin, acos, asinh, acosh, atanh + static const std::unordered_map op_map = { + {"exp", nvinfer1::UnaryOperation::kEXP}, + {"log", nvinfer1::UnaryOperation::kLOG}, + {"sqrt", nvinfer1::UnaryOperation::kSQRT}, + {"abs", nvinfer1::UnaryOperation::kABS}, + {"negative", nvinfer1::UnaryOperation::kNEG}, +#if TRT_VERSION_GE(5, 1, 5) + {"sin", nvinfer1::UnaryOperation::kSIN}, + {"cos", nvinfer1::UnaryOperation::kCOS}, + {"atan", nvinfer1::UnaryOperation::kATAN}, + {"ceil", nvinfer1::UnaryOperation::kCEIL}, + {"floor", nvinfer1::UnaryOperation::kFLOOR}, +#endif + }; + auto it = op_map.find(params->op_name); + CHECK(it != op_map.end()) << "Unsupported unary type " << params->op_name; + nvinfer1::IUnaryLayer* unary_layer = + params->network->addUnary(*params->inputs.at(0).tensor, it->second); + CHECK(unary_layer != nullptr); + 
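+    // addUnary applies the operation elementwise, so e.g. "negative" on a
+    // (3, 224, 224) input produces the same shape with each element negated.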
params->outputs.push_back(unary_layer->getOutput(0)); + } +}; + +class ConcatOpConverter : public TensorRTOpConverter { + public: + ConcatOpConverter() : TensorRTOpConverter({}, /*variable_input_count=*/true) {} + + void Convert(TensorRTOpConverterParams* params) const { + const int num_inputs = params->inputs.size(); + CHECK_GT(num_inputs, 0); + const int input_rank = params->inputs[0].tensor->getDimensions().nbDims; + std::vector input_tensors; + for (auto input : params->inputs) { + CHECK(input.type == kTensor); + CHECK_EQ(input_rank, input.tensor->getDimensions().nbDims); + input_tensors.push_back(input.tensor); + } + + const int original_axis = std::stoi(params->node.GetAttr>("axis")[0]); + const int axis = ConvertAxis(params, original_axis, input_rank); + + nvinfer1::IConcatenationLayer* concat_layer = + params->network->addConcatenation(input_tensors.data(), input_tensors.size()); + CHECK(concat_layer != nullptr); + concat_layer->setAxis(axis); + params->outputs.push_back(concat_layer->getOutput(0)); + } +}; + +class BiasAddOpConverter : public TensorRTOpConverter { + public: + BiasAddOpConverter() : TensorRTOpConverter({kTensor, kWeight}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); + const size_t required_rank = TRT_HAS_IMPLICIT_BATCH(params) ? 3 : 4; + CHECK(input_dims.size() > 0 && input_dims.size() <= required_rank); + const bool need_reshape_on_input = input_dims.size() != required_rank; + if (need_reshape_on_input) { + // Add dims of size 1 until rank is required_rank. + std::vector new_shape(input_dims); + while (new_shape.size() < required_rank) new_shape.insert(new_shape.end(), 1); + input_tensor = Reshape(params, input_tensor, new_shape); + } + + nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IScaleLayer* scale_layer = params->network->addScale( + *input_tensor, nvinfer1::ScaleMode::kCHANNEL, params->inputs.at(1).weight, shift, power); + CHECK(scale_layer != nullptr); + auto output_tensor = scale_layer->getOutput(0); + if (need_reshape_on_input) { + // Remove added dims. + output_tensor = Reshape(params, output_tensor, input_dims); + } + params->outputs.push_back(output_tensor); + } +}; + +class Conv2DTransposeOpConverter : public TensorRTOpConverter { + public: + Conv2DTransposeOpConverter() : TensorRTOpConverter({kTensor, kWeight}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto weight_shape = params->inputs.at(1).weight_shape; + CHECK_EQ(params->node.GetAttr>("data_layout")[0], "NCHW"); + CHECK(params->node.GetAttr>("out_layout")[0] == "" || + params->node.GetAttr>("out_layout")[0] == "NCHW"); + CHECK_EQ(params->node.GetAttr>("kernel_layout")[0], "OIHW"); + auto str_dilation = params->node.GetAttr>("dilation"); + CHECK(std::stoi(str_dilation[0]) == 1 && std::stoi(str_dilation[1]) == 1); + auto str_strides = params->node.GetAttr>("strides"); + auto str_padding = params->node.GetAttr>("padding"); + auto str_output_padding = params->node.GetAttr>("output_padding"); + int groups = std::stoi(params->node.GetAttr>("groups")[0]); + + // TRT deconv op doesn't support asymmetric padding before 5.1, so we + // workaround by adding a padding layer before the pooling op. 
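+    // Here the inserted padding layer precedes the deconvolution layer itself.
+    // Illustrative sizing example: a 3x3 kernel with strides (2, 2) and
+    // symmetric padding (1, 1) on a 4x4 input gives
+    // out = (in - 1) * stride - pad_pre - pad_post + kernel
+    //     = 3 * 2 - 1 - 1 + 3 = 7, i.e. a 7x7 output.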
+ nvinfer1::DimsHW prepadding, postpadding; + bool use_asymmetric_padding; + GetPadding(str_padding, &use_asymmetric_padding, &prepadding, &postpadding); +#if !TRT_VERSION_GE(5, 1, 5) + if (use_asymmetric_padding) { + auto pad_layer = params->network->addPadding(*input_tensor, prepadding, postpadding); + CHECK(pad_layer != nullptr); + input_tensor = pad_layer->getOutput(0); + // No need for conv op to do any padding. + use_asymmetric_padding = false; + prepadding = nvinfer1::DimsHW(0, 0); + } +#endif + + // Could use conv2d_attr->channels.as()->value + const int num_outputs = weight_shape[1]; + const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]); + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size, + params->inputs.at(1).weight, bias); + CHECK(deconv_layer != nullptr); + if (use_asymmetric_padding) { +#if TRT_VERSION_GE(5, 1, 5) + deconv_layer->setPrePadding(prepadding); + deconv_layer->setPostPadding(postpadding); +#endif + } else { + deconv_layer->setPadding(prepadding); + } + const auto strides = nvinfer1::DimsHW(std::stoi(str_strides[0]), std::stoi(str_strides[1])); + deconv_layer->setStride(strides); + deconv_layer->setNbGroups(groups); + nvinfer1::ITensor* output = deconv_layer->getOutput(0); + // Output padding. + if (str_output_padding.size()) { + GetPadding(str_output_padding, &use_asymmetric_padding, &prepadding, &postpadding); + if (prepadding.h() != 0 || prepadding.w() != 0 || postpadding.h() != 0 || + postpadding.w() != 0) { + // Output padding for Conv2D transpose is always asymmetric and applied to post only. + prepadding = nvinfer1::DimsHW(0, 0); + auto pad_layer = params->network->addPadding(*output, prepadding, postpadding); + output = pad_layer->getOutput(0); + } + } + params->outputs.push_back(output); + } +}; + +#if TRT_VERSION_GE(6, 0, 1) +class Conv3DTransposeOpConverter : public TensorRTOpConverter { + public: + Conv3DTransposeOpConverter() : TensorRTOpConverter({kTensor, kWeight}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto weight_shape = params->inputs.at(1).weight_shape; + CHECK_EQ(params->node.GetAttr>("data_layout")[0], "NCDHW"); + CHECK(params->node.GetAttr>("out_layout")[0] == "" || + params->node.GetAttr>("out_layout")[0] == "NCDHW"); + CHECK_EQ(params->node.GetAttr>("kernel_layout")[0], "OIDHW"); + auto str_dilation = params->node.GetAttr>("dilation"); + CHECK_EQ(str_dilation.size(), 3); + CHECK(std::stoi(str_dilation[0]) == 1 && std::stoi(str_dilation[1]) == 1 && + std::stoi(str_dilation[2]) == 1); + auto str_strides = params->node.GetAttr>("strides"); + auto str_padding = params->node.GetAttr>("padding"); + auto str_output_padding = params->node.GetAttr>("output_padding"); + int groups = std::stoi(params->node.GetAttr>("groups")[0]); + nvinfer1::Dims prepadding, postpadding; + bool use_asymmetric_padding; + GetPadding3D(str_padding, &use_asymmetric_padding, &prepadding, &postpadding); + + // Could use attrs->channels.as()->value + const int num_outputs = weight_shape[1]; + const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size, + params->inputs.at(1).weight, bias); + CHECK(deconv_layer != nullptr); + if (use_asymmetric_padding) { + 
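+      // TRT 6+ supports asymmetric pre/post padding natively on ND layers, so
+      // no explicit padding-layer workaround is needed here (unlike the 2D
+      // pre-5.1.5 path above).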
deconv_layer->setPrePadding(prepadding); + deconv_layer->setPostPadding(postpadding); + } else { + deconv_layer->setPaddingNd(prepadding); + } + CHECK_EQ(str_strides.size(), 3); + const auto strides = nvinfer1::Dims3(std::stoi(str_strides[0]), std::stoi(str_strides[1]), + std::stoi(str_strides[2])); + deconv_layer->setStrideNd(strides); + deconv_layer->setNbGroups(groups); + nvinfer1::ITensor* output = deconv_layer->getOutput(0); + // Output padding. + if (str_output_padding.size()) { + GetPadding3D(str_output_padding, &use_asymmetric_padding, &prepadding, &postpadding); + // Are any post-padding values non-zero? + CHECK(!std::any_of(postpadding.d, postpadding.d + postpadding.nbDims, [](int x) { + return x != 0; + })) << "TRT does not support padding on 3 dimensions."; + } + params->outputs.push_back(output); + } +}; +#endif // TRT_VERSION_GE(6, 0, 1) + +class TransposeOpConverter : public TensorRTOpConverter { + public: + TransposeOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input = params->inputs.at(0).tensor; + auto str_axes = params->node.GetAttr>("axes"); + std::vector order; + for (size_t i = 0; i < str_axes.size(); ++i) { + order.push_back(std::stoi(str_axes[i])); + } + params->outputs.push_back(Transpose(params, input, order)); + } +}; + +class LayoutTransformOpConverter : public TensorRTOpConverter { + public: + LayoutTransformOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input = params->inputs.at(0).tensor; + auto src = params->node.GetAttr>("src_layout")[0]; + auto dst = params->node.GetAttr>("dst_layout")[0]; + std::vector order; + if (src == "NCHW" && dst == "NHWC") { + order = {0, 2, 3, 1}; + } else if (src == "NHWC" && dst == "NCHW") { + order = {0, 3, 1, 2}; + } else if (src == "NDHWC" && dst == "NCDHW") { + order = {0, 4, 1, 2, 3}; + } else if (src == "NCDHW" && dst == "NDHWC") { + order = {0, 2, 3, 4, 1}; + } + params->outputs.push_back(Transpose(params, input, order)); + } +}; + +class ReshapeOpConverter : public TensorRTOpConverter { + public: + ReshapeOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input = params->inputs.at(0).tensor; + CHECK_EQ(std::stoi(params->node.GetAttr>("reverse")[0]), false); + auto str_newshape = params->node.GetAttr>("newshape"); + std::vector new_shape; + const int start_index = TRT_HAS_IMPLICIT_BATCH(params) ? 
1 : 0; + for (size_t i = start_index; i < str_newshape.size(); ++i) { + const int value = std::stoi(str_newshape[i]); + CHECK_GE(value, -1); + new_shape.push_back(value); + } + params->outputs.push_back(Reshape(params, input, new_shape)); + } +}; + +class PadOpConverter : public TensorRTOpConverter { + public: + PadOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input = params->inputs.at(0).tensor; + auto str_paddding = params->node.GetAttr>("padding"); + nvinfer1::DimsHW prepadding = + nvinfer1::DimsHW(std::stoi(str_paddding[0]), std::stoi(str_paddding[1])); + nvinfer1::DimsHW postpadding = + nvinfer1::DimsHW(std::stoi(str_paddding[2]), std::stoi(str_paddding[3])); + auto pad_layer = params->network->addPadding(*input, prepadding, postpadding); + params->outputs.push_back(pad_layer->getOutput(0)); + } +}; + +class ReduceOpConverter : public TensorRTOpConverter { + public: + ReduceOpConverter() : TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + static const std::unordered_map op_map = { + {"sum", nvinfer1::ReduceOperation::kSUM}, + {"prod", nvinfer1::ReduceOperation::kPROD}, + {"max", nvinfer1::ReduceOperation::kMAX}, + {"min", nvinfer1::ReduceOperation::kMIN}, + {"mean", nvinfer1::ReduceOperation::kAVG}}; + auto it = op_map.find(params->op_name); + CHECK(it != op_map.end()) << "Unsupported reduce type " << params->op_name; + + auto input = params->inputs.at(0).tensor; + CHECK_EQ(std::stoi(params->node.GetAttr>("exclude")[0]), false); + bool keepdims = std::stoi(params->node.GetAttr>("keepdims")[0]); + auto str_axis = params->node.GetAttr>("axis"); + // TODO(trevmorr): Support reduce to scalar. + CHECK_GT(str_axis.size(), 0); + uint32_t reduce_axes = 0; + for (size_t i = 0; i < str_axis.size(); ++i) { + const int axis = ConvertAxis(params, std::stoi(str_axis[i]), input->getDimensions().nbDims); + reduce_axes |= 1 << axis; + } + auto reduce_layer = params->network->addReduce(*input, it->second, reduce_axes, keepdims); + params->outputs.push_back(reduce_layer->getOutput(0)); + } +}; + +#if TRT_VERSION_GE(5, 1, 5) +class StridedSliceOpConverter : public TensorRTOpConverter { + public: + StridedSliceOpConverter() : TensorRTOpConverter({kTensor, kWeight, kWeight, kWeight}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input = params->inputs.at(0).tensor; + auto input_dims = TrtDimsToVector(input->getDimensions()); + auto str_start = params->node.GetAttr>("start"); + auto str_size = params->node.GetAttr>("size"); + auto str_strides = params->node.GetAttr>("strides"); + std::vector start, size, strides; + std::transform(str_start.begin(), str_start.end(), std::back_inserter(start), + [](const std::string& s) { return std::stoi(s); }); + std::transform(str_size.begin(), str_size.end(), std::back_inserter(size), + [](const std::string& s) { return std::stoi(s); }); + std::transform(str_strides.begin(), str_strides.end(), std::back_inserter(strides), + [](const std::string& s) { return std::stoi(s); }); + if (TRT_HAS_IMPLICIT_BATCH(params)) { + start.erase(start.begin()); + size.erase(size.begin()); + strides.erase(strides.begin()); + } + auto slice_layer = params->network->addSlice(*input, VectorToTrtDims(start), + VectorToTrtDims(size), VectorToTrtDims(strides)); + params->outputs.push_back(slice_layer->getOutput(0)); + } +}; +#endif + +class AdaptivePoolingOpConverter : public TensorRTOpConverter { + public: + AdaptivePoolingOpConverter() : 
TensorRTOpConverter({kTensor}) {} + + void Convert(TensorRTOpConverterParams* params) const { + auto input_tensor = params->inputs.at(0).tensor; + auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); + static const std::unordered_map op_map = { + {"nn.adaptive_max_pool2d", nvinfer1::PoolingType::kMAX}, + {"nn.adaptive_avg_pool2d", nvinfer1::PoolingType::kAVERAGE}}; + auto it = op_map.find(params->op_name); + CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; + CHECK_EQ(params->node.GetAttr>("layout")[0], "NCHW"); + + // This is an approximation of adaptive pooling. Results will not be + // mathematically exact except when output_size is (1, 1). + // Annotation rules will only allow output size of (1, 1). + auto output_size = nvinfer1::DimsHW(1, 1); + const int h = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[1] : input_dims[2]; + const int w = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[2] : input_dims[3]; + const auto stride = nvinfer1::DimsHW(h / output_size.h(), w / output_size.w()); + const auto window_size = nvinfer1::DimsHW(h - (output_size.h() - 1) * stride.h(), + w - (output_size.w() - 1) * stride.w()); + auto pool_layer = params->network->addPooling(*input_tensor, it->second, window_size); + CHECK(pool_layer != nullptr); + pool_layer->setStride(stride); + params->outputs.push_back(pool_layer->getOutput(0)); + } +}; + +const std::shared_ptr>> +GetOpConverters() { + static auto map = + std::make_shared>>(); + if (!map->empty()) return map; + map->emplace("nn.relu", std::make_shared()); + map->emplace("sigmoid", std::make_shared()); + map->emplace("tanh", std::make_shared()); + map->emplace("nn.batch_norm", std::make_shared()); + map->emplace("nn.softmax", std::make_shared()); + map->emplace("nn.conv2d", std::make_shared()); + map->emplace("nn.dense", std::make_shared()); + map->emplace("nn.bias_add", std::make_shared()); + map->emplace("add", std::make_shared()); + map->emplace("subtract", std::make_shared()); + map->emplace("multiply", std::make_shared()); + map->emplace("divide", std::make_shared()); + map->emplace("power", std::make_shared()); + map->emplace("maximum", std::make_shared()); + map->emplace("minimum", std::make_shared()); + map->emplace("nn.max_pool2d", std::make_shared()); + map->emplace("nn.avg_pool2d", std::make_shared()); + map->emplace("nn.global_max_pool2d", std::make_shared()); + map->emplace("nn.global_avg_pool2d", std::make_shared()); + map->emplace("exp", std::make_shared()); + map->emplace("log", std::make_shared()); + map->emplace("sqrt", std::make_shared()); + map->emplace("abs", std::make_shared()); + map->emplace("negative", std::make_shared()); + map->emplace("nn.batch_flatten", std::make_shared()); + map->emplace("expand_dims", std::make_shared()); + map->emplace("squeeze", std::make_shared()); + map->emplace("concatenate", std::make_shared()); + map->emplace("nn.conv2d_transpose", std::make_shared()); + map->emplace("transpose", std::make_shared()); + map->emplace("layout_transform", std::make_shared()); + map->emplace("reshape", std::make_shared()); + map->emplace("nn.pad", std::make_shared()); + map->emplace("sum", std::make_shared()); + map->emplace("prod", std::make_shared()); + map->emplace("max", std::make_shared()); + map->emplace("min", std::make_shared()); + map->emplace("mean", std::make_shared()); + map->emplace("nn.adaptive_max_pool2d", std::make_shared()); + map->emplace("nn.adaptive_avg_pool2d", std::make_shared()); +#if TRT_VERSION_GE(5, 1, 5) + map->emplace("clip", 
std::make_shared()); + map->emplace("nn.leaky_relu", std::make_shared()); + map->emplace("sin", std::make_shared()); + map->emplace("cos", std::make_shared()); + map->emplace("atan", std::make_shared()); + map->emplace("ceil", std::make_shared()); + map->emplace("floor", std::make_shared()); + map->emplace("strided_slice", std::make_shared()); +#endif // TRT_VERSION_GE(5, 1, 5) +#if TRT_VERSION_GE(6, 0, 1) + map->emplace("nn.conv3d", std::make_shared()); + map->emplace("nn.max_pool3d", std::make_shared()); + map->emplace("nn.avg_pool3d", std::make_shared()); + map->emplace("nn.conv3d_transpose", std::make_shared()); +#endif // TRT_VERSION_GE(6, 0, 1) + return map; +} + +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.h b/src/runtime/contrib/tensorrt/tensorrt_ops.h new file mode 100644 index 000000000000..e9871d42146c --- /dev/null +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.h @@ -0,0 +1,207 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file runtime/contrib/tensorrt/tensorrt_ops.h + * \brief Converters from Relay ops into TensorRT layers. Converters should + * inherit from TensorRTOpConverter and implement the Convert() method. + */ + +#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_OPS_H_ +#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_OPS_H_ + +#include +#include +#include +#include +#include +#include + +#include "../json/json_node.h" +#include "NvInfer.h" +#include "tensorrt_utils.h" + +#if TRT_VERSION_GE(6, 0, 1) +#define TRT_HAS_IMPLICIT_BATCH(params) (params->network->hasImplicitBatchDimension()) +#else +#define TRT_HAS_IMPLICIT_BATCH(params) (true) +#endif + +namespace tvm { +namespace runtime { +namespace contrib { + +using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + +/*! + * \brief An input to a op may be either kTensor in the case of nvinfer::ITensor* + * or kWeight for nvinfer1::Weights. + */ +enum TensorRTInputType { + kTensor, + kWeight, +}; + +/*! + * \brief An input to a TensorRTOpConverter. The type of the input is either kTensor + * or kWeight. For kTensor, "tensor" contains the input tensor. For kWeight, + * "weight" contains the input weight and "weight_shape" contains the shape. + */ +struct TensorRTOpInput { + /*! \brief If type is kTensor, will store input tensor. */ + nvinfer1::ITensor* tensor; + + /*! \brief If type is kWeight, will store input weight. */ + nvinfer1::Weights weight; + + /*! \brief Whether the input is in tensor or weight. */ + TensorRTInputType type; + + /*! \brief If type is kWeight, will store weight shape. 
*/ + std::vector weight_shape; + + explicit TensorRTOpInput(nvinfer1::ITensor* tensor) + : tensor(tensor), weight({nvinfer1::DataType::kFLOAT, nullptr, 0}), type(kTensor) {} + TensorRTOpInput(nvinfer1::Weights weight, const std::vector& shape) + : tensor(nullptr), weight(weight), type(kWeight), weight_shape(shape) {} +}; + +/*! \brief Parameters to convert an Op from Relay to TensorRT. */ +struct TensorRTOpConverterParams { + /*! \brief The TRT network that the new layer should be added to. */ + nvinfer1::INetworkDefinition* network; + /*! \brief The corresponding serialized node. */ + const JSONGraphNode& node; + /*! \brief The type of op. */ + std::string op_name; + /*! \brief Inputs to the op. */ + std::vector inputs; + /*! \brief Outputs of the op should be populated here during Convert(). */ + std::vector outputs; + /*! \brief Any newly allocated weights should be stored here also. */ + std::vector* trt_weights; + + TensorRTOpConverterParams(nvinfer1::INetworkDefinition* network, const JSONGraphNode& node, + std::vector* trt_weights) + : network(network), node(node), trt_weights(trt_weights) { + op_name = node.GetOpName(); + } +}; + +/*! \brief Base class for an op converter from Relay to TRT. */ +class TensorRTOpConverter { + public: + /*! \brief Used to specify whether each input is tensor or weight. */ + const std::vector input_types; + /*! \brief If set to true, any number of tensor inputs can be used for the op. + */ + const bool variable_input_count; + + /*! + * \brief Converter subclasses should call this constructor to set + * input_types or variable_input_count. + * \param input_types For each input to the op, there should be a + * corresponding entry in input_types to determine whether that input should + * be a tensor or a weight. TensorRTBuilder will prepare inputs in + * TensorRTOpConverter according to this. + * \param variable_input_count If the op can have multiple inputs, set this to + * true. input_types vector will be ignored and any number of input tensors + * can be used for this op. All inputs will be tensors and not weights. + */ + explicit TensorRTOpConverter(const std::vector& input_types, + bool variable_input_count = false); + + /*! + * \brief Convert to TRT. Implementation should use inputs and attributes + * from the CallNode to add the corresponding TRT layers to network. Outputs + * should be pushed to outputs vector. + * \param params Parameters for this op. + */ + virtual void Convert(TensorRTOpConverterParams* params) const = 0; + + /*! + * \brief Helper function to reshape a tensor. + * \param params Parameters for this op. + * \param input Tensor to reshape. + * \param new_shape New shape, does not include batch dim. + * \return Reshaped tensor + */ + nvinfer1::ITensor* Reshape(TensorRTOpConverterParams* params, nvinfer1::ITensor* input, + const std::vector& new_shape) const; + + /*! + * \brief Helper function to transpose a tensor. + * \param params Parameters for this op. + * \param input Tensor to transpose. + * \param order New order of axes, does include batch dim. + * \return Transposed tensor + */ + nvinfer1::ITensor* Transpose(TensorRTOpConverterParams* params, nvinfer1::ITensor* input, + const std::vector& order) const; + + /*! + * \brief Helper function to convert an axis to TRT format. + * \param axis Axis from TVM. + * \param input_rank Rank of input, does not include batch dim. + * \return Axis in TRT format. + */ + int ConvertAxis(TensorRTOpConverterParams* params, int axis, int input_rank) const; + + /*! 
+ * \brief Create constant that is broadcastable. + * \param params Parameters for this op. + * \param value Value of scalar. + * \param broadcast_to_dims Dims that scalar should be broadcastable against. + * \return Constant tensor. + */ + nvinfer1::ITensor* CreateScalar(TensorRTOpConverterParams* params, float value, + const nvinfer1::Dims& broadcast_to_dims) const; + + /*! + * \brief Get pre/post padding values from padding attributes array. + * \param padding Serialized padding from op attributes. + * \param padding_is_asymmetric True if both pre and post are needed for asymmetric padding. + * \param prepadding Prepadding value or symmetric padding values if !padding_is_asymmetric. + * \param postpadding Postpadding value if padding_is_asymmetric. + */ + void GetPadding(const std::vector& padding, bool* use_asymmetric_padding, + nvinfer1::DimsHW* prepadding, nvinfer1::DimsHW* postpadding) const; + + /*! + * \brief Get pre/post padding values from padding attributes array for volumetric ops. + * \param padding Serialized padding from op attributes. + * \param padding_is_asymmetric True if both pre and post are needed for asymmetric padding. + * \param prepadding Prepadding value or symmetric padding values if !padding_is_asymmetric. + * \param postpadding Postpadding value if padding_is_asymmetric. + */ + void GetPadding3D(const std::vector& padding, bool* use_asymmetric_padding, + nvinfer1::Dims* prepadding, nvinfer1::Dims* postpadding) const; +}; + +/*! + * \brief Get the map of available TensorRTOpConverters, where the key is the name of the relay op. + * \return Map of TensorRTOpConverters. + */ +const std::shared_ptr>> +GetOpConverters(); + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_OPS_H_ diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc new file mode 100644 index 000000000000..72c025695f7d --- /dev/null +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/tensorrt/tensorrt_runtime.cc + * \brief JSON runtime implementation for TensorRT. + */ + +#include +#include +#include + +#include + +#include "../../file_utils.h" +#include "../json/json_node.h" +#include "../json/json_runtime.h" + +#ifdef TVM_GRAPH_RUNTIME_TENSORRT +#include "NvInfer.h" +#include "tensorrt_builder.h" +#endif + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace tvm::runtime::json; + +class TensorRTRuntime : public JSONRuntimeBase { + public: + /*! + * \brief The TensorRT runtime module. Deserialize the provided functions + * on creation and store in the layer cache. 
+ * + * \param symbol_name The name of the function. + * \param graph_json serialized JSON representation of a sub-graph. + * \param const_names The names of each constant in the sub-graph. + */ + explicit TensorRTRuntime(const std::string& symbol_name, const std::string& graph_json, + const Array& const_names) + : JSONRuntimeBase(symbol_name, graph_json, const_names), + use_implicit_batch_(true), + max_workspace_size_(size_t(1) << 30) {} + + /*! + * \brief The type key of the module. + * + * \return module type key. + */ + const char* type_key() const override { return "tensorrt"; } + + /*! + * \brief Initialize runtime. Create TensorRT layer from JSON + * representation. + * + * \param consts The constant params from compiled model. + */ + void Init(const Array& consts) override { + CHECK_EQ(consts.size(), const_idx_.size()) + << "The number of input constants must match the number of required."; + LoadGlobalAttributes(); + if (GetCachedEnginesFromDisk()) return; + SetupConstants(consts); + BuildEngine(); + CacheEngineToDisk(); + } + + void LoadGlobalAttributes() { + // These settings are global to the entire subgraph. Codegen will add them as attributes to all + // op nodes. Read from first one. + for (size_t i = 0; i < nodes_.size(); ++i) { + if (nodes_[i].HasAttr("use_implicit_batch") && nodes_[i].HasAttr("max_workspace_size")) { + use_implicit_batch_ = + std::stoi(nodes_[i].GetAttr>("use_implicit_batch")[0]); + // Allow max_workspace_size to be overridden at runtime. + size_t runtime_max_workspace_size = + dmlc::GetEnv("TVM_TENSORRT_MAX_WORKSPACE_SIZE", size_t(0)); + if (runtime_max_workspace_size != 0) { + max_workspace_size_ = runtime_max_workspace_size; + } else { + max_workspace_size_ = + std::stoul(nodes_[i].GetAttr>("max_workspace_size")[0]); + } + return; + } + } + } + +#ifdef TVM_GRAPH_RUNTIME_TENSORRT + /*! \brief Run inference using built engine. */ + void Run() override { + auto& engine_and_context = trt_engine_cache_.at(symbol_name_); + auto engine = engine_and_context.engine; + auto context = engine_and_context.context; + std::vector bindings(engine->getNbBindings(), nullptr); + + for (size_t i = 0; i < input_nodes_.size(); ++i) { + auto nid = input_nodes_[i]; + if (nodes_[nid].GetOpType() == "input") { + for (size_t j = 0; j < nodes_[nid].GetOpShape().size(); ++j) { + uint32_t eid = EntryID(nid, j); + const std::string name = nodes_[nid].GetOpName() + "_" + std::to_string(j); + int binding_index = engine->getBindingIndex(name.c_str()); + CHECK_NE(binding_index, -1); + bindings[binding_index] = data_entry_[eid]->data; + } + } + } + + for (size_t i = 0; i < outputs_.size(); ++i) { + uint32_t eid = EntryID(outputs_[i]); + const std::string& name = engine_and_context.outputs[i]; + int binding_index = engine->getBindingIndex(name.c_str()); + CHECK_NE(binding_index, -1); + bindings[binding_index] = data_entry_[eid]->data; + } + +#if TRT_VERSION_GE(6, 0, 1) + if (use_implicit_batch_) { + CHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed."; + } else { + CHECK(context->executeV2(bindings.data())) << "Running TensorRT failed."; + } +#else + CHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed."; +#endif + } + + private: + /*! + * \brief Build TensorRT engine from JSON representation. 
+ */ + void BuildEngine() { + DLOG(INFO) << "Building new TensorRT engine for subgraph " << symbol_name_; + const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false); + batch_size_ = GetBatchSize(); + TensorRTBuilder builder(&logger_, max_workspace_size_, use_implicit_batch_, use_fp16, + batch_size_); + + // Add inputs and constants. + for (size_t i = 0; i < input_nodes_.size(); ++i) { + auto nid = input_nodes_[i]; + const auto& node = nodes_[nid]; + std::string name = node.GetOpName(); + if (node.GetOpType() == "input") { + builder.AddInput(nid, node); + } else { + CHECK_EQ(node.GetOpType(), "const"); + uint32_t eid = EntryID(nid, 0); + builder.AddConstant(nid, data_entry_[eid]); + } + } + + // Add layers. + for (size_t nid = 0; nid < nodes_.size(); ++nid) { + const auto& node = nodes_[nid]; + if (node.GetOpType() != "kernel") continue; + builder.AddLayer(nid, node); + } + + // Add outputs. + for (size_t i = 0; i < outputs_.size(); ++i) { + builder.AddOutput(outputs_[i]); + } + + // Build engine. + trt_engine_cache_[symbol_name_] = builder.BuildEngine(); + DLOG(INFO) << "Finished building TensorRT engine for subgraph " << symbol_name_; + } + + /*! \brief If TVM_TENSORRT_CACHE_DIR is set, will check that directory for + * already built TRT engines and load into trt_engine_cache_ so they don't + * have to be built at first inference. + */ + bool GetCachedEnginesFromDisk() { + std::string cache_dir = dmlc::GetEnv("TVM_TENSORRT_CACHE_DIR", std::string("")); + if (cache_dir.empty()) return false; + std::string key = GetSubgraphKey(); + std::string path = cache_dir + "/" + key + ".plan"; + // Check if engine is in the cache. + std::ifstream infile(path, std::ios::binary); + if (!infile.good()) return false; + DLOG(INFO) << "Loading cached TensorRT engine from " << path; + infile.close(); + std::string serialized_engine; + LoadBinaryFromFile(path, &serialized_engine); + // Deserialize engine + nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger_); + TensorRTEngineAndContext engine_and_context; + engine_and_context.engine = + runtime->deserializeCudaEngine(&serialized_engine[0], serialized_engine.size(), nullptr); + engine_and_context.context = engine_and_context.engine->createExecutionContext(); + // Load metadata + std::string meta_path = cache_dir + "/" + key + ".meta"; + std::string serialized_meta; + LoadBinaryFromFile(meta_path, &serialized_meta); + std::istringstream is(serialized_meta); + dmlc::JSONReader reader(&is); + dmlc::JSONObjectReadHelper helper; + helper.DeclareField("inputs", &engine_and_context.inputs); + helper.DeclareField("outputs", &engine_and_context.outputs); + helper.ReadAllFields(&reader); + trt_engine_cache_[symbol_name_] = engine_and_context; + return true; + } + + /*! \brief If TVM_TENSORRT_CACHE_DIR is set, will save the engine to that + * directory so it can be loaded later. 
+ */ + void CacheEngineToDisk() { + std::string cache_dir = dmlc::GetEnv("TVM_TENSORRT_CACHE_DIR", std::string("")); + if (cache_dir.empty()) return; + std::string key = GetSubgraphKey(); + std::string path = cache_dir + "/" + key + ".plan"; + DLOG(INFO) << "Caching TensorRT engine to " << path; + // Serialize engine to disk + nvinfer1::IHostMemory* serialized_engine = trt_engine_cache_[symbol_name_].engine->serialize(); + SaveBinaryToFile(path, std::string(static_cast(serialized_engine->data()), + serialized_engine->size())); + serialized_engine->destroy(); + // Serialize metadata + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.BeginObject(); + writer.WriteObjectKeyValue("inputs", trt_engine_cache_[symbol_name_].inputs); + writer.WriteObjectKeyValue("outputs", trt_engine_cache_[symbol_name_].outputs); + writer.EndObject(); + std::string meta_path = cache_dir + "/" + key + ".meta"; + SaveBinaryToFile(meta_path, os.str()); + } + + std::string GetSubgraphKey() { + // Using this key will only allow a single model per TVM_TENSORRT_CACHE_DIR directory. We could + // instead use a hash of graph_json and all weights to allow many models in the same directory, + // but the cost of computing the hash is high. + return symbol_name_ + (dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) ? "_fp16" : "_fp32"); + } + + /*! \brief Get the batch size when in implicit_batch mode. */ + int GetBatchSize() { + if (!use_implicit_batch_) return -1; + for (size_t i = 0; i < input_nodes_.size(); ++i) { + auto nid = input_nodes_[i]; + if (nodes_[nid].GetOpType() == "input") { + // Get batch size from first input. + return nodes_[nid].GetOpShape()[0][0]; + } + } + return -1; + } + + /*! \brief Map of function name to TRT engine if built already. */ + std::unordered_map trt_engine_cache_; + + /*! \brief TensorRT logger. */ + TensorRTLogger logger_; + + /*! \brief Batch size that the engine is optimized for. */ + int batch_size_; + +#else + void Run() override { + LOG(FATAL) << "TensorRT runtime is not enabled. " + << "Please build with USE_TENSORRT_RUNTIME."; + } + + void BuildEngine() { + LOG(WARNING) << "TensorRT runtime is not enabled. " + << "Please build with USE_TENSORRT_RUNTIME."; + } + + bool GetCachedEnginesFromDisk() { return false; } + + void CacheEngineToDisk() {} +#endif + + bool use_implicit_batch_; + + size_t max_workspace_size_; +}; + +runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json, + const Array& const_names) { + auto n = make_object(symbol_name, graph_json, const_names); + return runtime::Module(n); +} + +TVM_REGISTER_GLOBAL("runtime.tensorrt_runtime_create").set_body_typed(TensorRTRuntimeCreate); + +TVM_REGISTER_GLOBAL("runtime.module.loadbinary_tensorrt") + .set_body_typed(JSONRuntimeBase::LoadFromBinary); + +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/tensorrt/tensorrt_utils.h b/src/runtime/contrib/tensorrt/tensorrt_utils.h new file mode 100644 index 000000000000..ab9b169f26d6 --- /dev/null +++ b/src/runtime/contrib/tensorrt/tensorrt_utils.h @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file runtime/contrib/tensorrt/utils.h + * \brief Helper functions used by TensorRTBuilder or TensorRTOpConverters. + */ + +#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_UTILS_H_ +#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_UTILS_H_ + +#include +#include + +#include "NvInfer.h" + +// There is a conflict between cpplint and clang-format-10. +// clang-format off +#define TRT_VERSION_GE(major, minor, patch) \ + ((NV_TENSORRT_MAJOR > major) || (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && NV_TENSORRT_PATCH >= patch)) +// clang-format on + +namespace tvm { +namespace runtime { +namespace contrib { + +/*! + * \brief Helper function to convert an vector to TRT Dims. + * \param vec Vector. + * \return TRT Dims. + */ +template +inline nvinfer1::Dims VectorToTrtDims(const std::vector& vec) { + nvinfer1::Dims dims; + // Dims(nbDims=0, d[0]=1) is used to represent a scalar in TRT. + dims.d[0] = 1; + dims.nbDims = vec.size(); + for (size_t i = 0; i < vec.size(); ++i) { + dims.d[i] = vec[i]; + } + return dims; +} + +/*! + * \brief Helper function to convert TRT Dims to vector. + * \param vec TRT Dims. + * \return Vector. + */ +inline std::vector TrtDimsToVector(const nvinfer1::Dims& dims) { + return std::vector(dims.d, dims.d + dims.nbDims); +} + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_UTILS_H_ diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py new file mode 100644 index 000000000000..6f615397db58 --- /dev/null +++ b/tests/python/contrib/test_tensorrt.py @@ -0,0 +1,905 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
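# A quick reference for the environment variables honored by the TensorRT
# runtime implemented above; this is a hedged sketch and the values shown are
# illustrative assumptions, not defaults mandated by the code:
#
#   import os
#   os.environ["TVM_TENSORRT_CACHE_DIR"] = "/tmp/trt_cache"        # reuse serialized engines
#   os.environ["TVM_TENSORRT_USE_FP16"] = "1"                      # build FP16 engines
#   os.environ["TVM_TENSORRT_MAX_WORKSPACE_SIZE"] = str(1 << 29)   # override workspace size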
+import numpy as np +import time +import pytest + +import tvm +import tvm.relay.testing +from tvm import relay +from tvm.relay.op.contrib import tensorrt +from tvm.contrib import graph_runtime + + +def skip_codegen_test(): + """Skip test if TensorRT and CUDA codegen are not present""" + if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: + print("Skip because CUDA is not enabled.") + return True + if not tvm.get_global_func("relay.ext.tensorrt", True): + print("Skip because TensorRT codegen is not available.") + return True + return False + + +def skip_runtime_test(): + if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: + print("Skip because CUDA is not enabled.") + return True + if not tensorrt.is_tensorrt_runtime_enabled(): + print("Skip because TensorRT runtime is not available.") + return True + return False + + +def run_and_verify_func(config): + """Test a Relay func by compiling, running, and comparing TVM and TRT outputs. + + Parameters + ---------- + config : Tuple[relay.Function, Dict[str, NDArray], List[str]] + A tuple containing 1) The function to test, 2) A dictionary of var names to input shapes and + 3) A list of which vars should be considered params. + """ + if skip_codegen_test(): + return + f, input_shapes, is_param = config + params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32) for x in is_param} + input_dict = { + k: np.random.uniform(-1, 1, v).astype(np.float32) + for k, v in input_shapes.items() + if k not in is_param + } + + # Run TRT + mod = tvm.IRModule() + mod["main"] = f + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, graph_params = relay.build(mod, "cuda", params=params) + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod.set_input(**graph_params) + mod.run(**input_dict) + results = [mod.get_output(i) for i in range(mod.get_num_outputs())] + + # Run reference + mod = tvm.IRModule() + mod["main"] = f + with tvm.transform.PassContext(opt_level=3): + graph, lib, graph_params = relay.build(mod, "cuda", params=params) + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod.set_input(**graph_params) + mod.run(**input_dict) + ref_results = [mod.get_output(i) for i in range(mod.get_num_outputs())] + + assert len(results) == len(ref_results) + for i in range(len(results)): + res = results[i].asnumpy() + ref_res = ref_results[i].asnumpy() + assert res.shape == ref_res.shape + tvm.testing.assert_allclose(res, ref_res, rtol=1e-3, atol=1e-3) + + +def run_and_verify_model(model): + if skip_codegen_test(): + return + + def compile_and_run(i_data, input_shape, dtype, use_trt=True, num_iteration=1): + import mxnet as mx + from mxnet.gluon.model_zoo.vision import get_model + + def check_trt_used(graph): + import json + + graph = json.loads(graph) + num_trt_subgraphs = sum( + [ + 1 + for n in graph["nodes"] + if n.get("attrs", {}).get("func_name", "").startswith("tensorrt_") + ] + ) + assert num_trt_subgraphs >= 1 + + block = get_model(model, pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + + if use_trt: + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext( + opt_level=3, config={"relay.ext.tensorrt.options": config} + ): + graph, lib, params = relay.build(mod, "cuda", params=params) + check_trt_used(graph) + else: + with tvm.transform.PassContext(opt_level=3): + graph, 
lib, params = relay.build(mod, "cuda", params=params) + + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod.set_input(**params) + # Warmup + for i in range(10): + mod.run(data=i_data) + # Time + times = [] + for i in range(num_iteration): + start_time = time.time() + mod.run(data=i_data) + res = mod.get_output(0) + times.append(time.time() - start_time) + latency = 1000.0 * np.mean(times) + print(model, latency) + return res + + dtype = "float32" + input_shape = (1, 3, 224, 224) + i_data = np.random.uniform(-1, 1, input_shape).astype(dtype) + res = compile_and_run(i_data, input_shape, dtype, use_trt=True) + if skip_runtime_test(): + return + ref_res = compile_and_run(i_data, input_shape, dtype, use_trt=False) + tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-3) + + +def test_tensorrt_simple(): + if skip_codegen_test(): + return + dtype = "float32" + xshape = (1, 3, 2, 2) + yshape = (1, 3, 1, 1) + zshape = (1, 1, 1, 1) + x = relay.var("x", shape=(xshape), dtype=dtype) + y = relay.var("y", shape=(yshape), dtype=dtype) + z = relay.var("z", shape=(zshape), dtype=dtype) + w = z * (x + y) + out = relay.nn.relu(w) + f = relay.Function([x, y, z], out) + + mod = tvm.IRModule() + mod["main"] = f + mod, config = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, params = relay.build(mod, "cuda") + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + y_data = np.random.uniform(-1, 1, yshape).astype(dtype) + z_data = np.random.uniform(-1, 1, zshape).astype(dtype) + mod.run(x=x_data, y=y_data, z=z_data) + results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] + + +def test_tensorrt_not_compatible(): + if skip_codegen_test(): + return + dtype = "float32" + xshape = (1, 32, 14, 14) + x = relay.var("x", shape=(xshape), dtype=dtype) + y = relay.add(x, x) + z = relay.erf(y) + out = relay.nn.relu(z) + f = relay.Function([x], out) + mod = tvm.IRModule() + mod["main"] = f + mod, config = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, params = relay.build(mod, "cuda") + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + mod.run(x=x_data) + results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] + + +def test_tensorrt_serialize(): + if skip_codegen_test(): + return + import mxnet + from mxnet.gluon.model_zoo.vision import get_model + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet( + block, shape={"data": (1, 3, 224, 224)}, dtype="float32" + ) + # Compile + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + lib = relay.build(mod, "cuda", params=params) + # Serialize + lib.export_library("compiled.so") + # Deserialize + loaded_lib = tvm.runtime.load_module("compiled.so") + # Run + if skip_runtime_test(): + return + gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib["default"](tvm.gpu(0))) + i_data = np.random.uniform(0, 1, (1, 3, 224, 224)).astype("float32") + gen_module.run(data=i_data) + + +def test_conv2d(): + def get_graph( + x_shape=(1, 32, 8, 8), + 
k_shape=(16, 32, 3, 3), + groups=1, + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + out = relay.nn.conv2d( + x, + kernel, + channels=k_shape[0], + kernel_size=k_shape[2:4], + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + for k_shape, groups in [((16, 32, 3, 3), 1), ((32, 1, 3, 3), 32)]: + for padding in [(0, 0), (1, 1)]: + for strides in [(1, 1), (2, 2)]: + for dilation in [(1, 1), (2, 2)]: + run_and_verify_func( + get_graph( + k_shape=k_shape, + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + ) + ) + + +def test_conv2d_nhwc(): + def get_graph(x_shape=(1, 8, 8, 32), k_shape=(3, 3, 32, 16)): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + out = relay.nn.conv2d( + x, + kernel, + channels=16, + kernel_size=(3, 3), + data_layout="NHWC", + kernel_layout="HWIO", + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify_func(get_graph()) + + +def test_conv2d_weights_const(): + def get_graph( + x_shape=(1, 32, 8, 8), + k_shape=(16, 32, 3, 3), + groups=1, + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.const(np.ones(k_shape).astype("float32")) + out = relay.nn.conv2d( + x, + kernel, + channels=k_shape[0], + kernel_size=k_shape[2:4], + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + ) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph()) + + +def test_conv2d_weights_transposed(): + def get_graph(x_shape=(1, 32, 9, 9), k_shape=(3, 3, 32, 16), order=(3, 2, 0, 1)): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + kernel_t = relay.transpose(kernel, order) + # Conv2d requires constant weights in TensorRT, so the weights should be transposed by + # FoldConstant. + out = relay.nn.conv2d(x, kernel_t, channels=k_shape[order[0]], kernel_size=(3, 3)) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify_func(get_graph()) + + +def test_dense(): + def get_graph(x_shape=(1, 16), k_shape=(32, 16)): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + # Dense requires constant weights in TensorRT, so the weights are transposed by us. 
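        # ("kernel" is listed in is_param below, so run_and_verify_func binds it
        # through relay.build(..., params=params) and it reaches the codegen as a
        # constant, which is presumably how that requirement is satisfied here.)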
+ out = relay.nn.dense(x, kernel, units=k_shape[0]) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify_func(get_graph()) + + +def test_bias_add(): + def get_graph(x_shape=(1, 16), channels=16): + x = relay.var("x", shape=(x_shape), dtype="float32") + bias = relay.var("bias", shape=(channels,), dtype="float32") + out = relay.nn.bias_add(x, bias) + f = relay.Function([x, bias], out) + return f, {"x": x_shape, "bias": (channels,)}, ["bias"] + + run_and_verify_func(get_graph()) + run_and_verify_func(get_graph((1, 6, 3, 4), 6)) + + +def test_pool2d(): + def get_graph( + op, + x_shape=(1, 3, 32, 32), + pool_size=(2, 2), + strides=(2, 2), + padding=(0, 0), + ceil_mode=False, + count_include_pad=None, + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + if count_include_pad is not None: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + for pool_size in [(2, 2), (3, 3)]: + for strides in [(1, 1), (2, 2)]: + for padding in [(0, 0), (1, 1), (0, 0, 1, 1)]: + for ceil_mode in [False, True]: + # Skip "the padding size is larger than or equal to the filter size for exclusive-counting pooling" + if pool_size == (2, 2) and padding == (0, 0, 1, 1): + continue + for count_include_pad in [False, True]: + # Skip "inclusive-counted blended or average pooling is not supported in combination with asymmetric padding" + if count_include_pad and (padding == (0, 0, 1, 1) or strides == (2, 2)): + continue + run_and_verify_func( + get_graph( + relay.nn.avg_pool2d, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + ) + run_and_verify_func( + get_graph( + relay.nn.max_pool2d, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + ) + + +def test_global_pool2d(): + def get_graph(op, x_shape=(1, 3, 32, 32)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = op(x) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph(relay.nn.global_max_pool2d)) + run_and_verify_func(get_graph(relay.nn.global_avg_pool2d)) + + +def test_batch_flatten(): + def get_graph(x_shape=(1, 3, 4, 6)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.nn.batch_flatten(x) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph()) + + +def test_expand_dims(): + def get_graph(x_shape=(1, 3), axis=1, num_newaxis=1): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.expand_dims(x, axis, num_newaxis) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph()) + + +def test_squeeze(): + def get_graph(x_shape, axis): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.squeeze(x, axis=axis) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph((1, 5, 1, 1), (2, 3))) + run_and_verify_func(get_graph((1, 3, 1), (-1,))) + + +def test_concatenate(): + def get_graph(input_shapes, axis): + concat_inputs = [] + shapes_dict = {} + for i in range(len(input_shapes)): + name = "input_{}".format(i) + concat_inputs.append(relay.var(name, shape=(input_shapes[i]), dtype="float32")) + 
shapes_dict[name] = input_shapes[i] + out = relay.concatenate(concat_inputs, axis) + f = relay.Function(concat_inputs, out) + return f, shapes_dict, [] + + run_and_verify_func(get_graph([(1, 2, 6, 6), (1, 3, 6, 6)], axis=1)) + + +def test_conv2d_transpose(): + def get_graph( + x_shape=(1, 32, 8, 8), + k_shape=(32, 16, 3, 3), + groups=1, + padding=(0, 0), + strides=(1, 1), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + out = relay.nn.conv2d_transpose( + x, + kernel, + channels=k_shape[1], + kernel_size=k_shape[2:4], + groups=groups, + padding=padding, + strides=strides, + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + for padding in [(0, 0), (1, 1)]: + for strides in [(1, 1), (2, 2)]: + run_and_verify_func(get_graph(padding=padding, strides=strides)) + + +def test_reshape(): + def get_graph(x_shape, new_shape): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.reshape(x, new_shape) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph((1, 1, 1, 10), (-1, 10))) + run_and_verify_func(get_graph((1, 10, 2, 3), (1, -1))) + run_and_verify_func(get_graph((1, 1, 2, 3), (1, 6))) + + +def test_transpose(): + def get_graph(x_shape, order): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.transpose(x, order) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph((1, 16, 7, 7), [0, 2, 3, 1])) + run_and_verify_func(get_graph((1, 7, 7, 16), [0, 3, 1, 2])) + + +def test_float_const(): + def get_graph(x_shape=(1, 16)): + x = relay.var("x", shape=(x_shape), dtype="float32") + beta = relay.const(1, dtype="float32") + out = relay.multiply(x, beta) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph()) + + +def test_pad(): + def get_graph(x_shape, pad_width): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.nn.pad(x, pad_width=pad_width) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph((1, 8, 16, 16), [[0, 0], [0, 0], [0, 0], [0, 0]])) + run_and_verify_func(get_graph((1, 8, 16, 16), [[0, 0], [0, 0], [1, 1], [1, 1]])) + run_and_verify_func(get_graph((1, 8, 16, 16), [[0, 0], [0, 0], [0, 1], [2, 0]])) + run_and_verify_func(get_graph((1, 8, 3, 16, 16), [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]])) + + +def test_softmax(): + def get_graph(x_shape, axis): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.nn.softmax(x, axis=axis) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph((1, 1000), axis=1)) + run_and_verify_func(get_graph((1, 1000), axis=-1)) + run_and_verify_func(get_graph((1, 3, 4), axis=-2)) + run_and_verify_func(get_graph((1, 3, 4), axis=1)) + + +def test_batch_norm(): + def get_graph(x_shape, param_shape, axis=1, epsilon=1e-5): + x = relay.var("x", shape=(x_shape), dtype="float32") + beta = relay.var("beta", shape=(param_shape), dtype="float32") + gamma = relay.var("gamma", shape=(param_shape), dtype="float32") + moving_mean = relay.var("moving_mean", shape=(param_shape), dtype="float32") + moving_var = relay.var("moving_var", shape=(param_shape), dtype="float32") + out, _, _ = relay.nn.batch_norm( + x, + gamma=gamma, + beta=beta, + moving_mean=moving_mean, + moving_var=moving_var, + axis=axis, + center=True, + scale=True, + epsilon=epsilon, + ) + f = relay.Function([x, gamma, beta, 
moving_mean, moving_var], out) + return ( + f, + { + "x": x_shape, + "beta": param_shape, + "gamma": param_shape, + "moving_mean": param_shape, + "moving_var": param_shape, + }, + ["beta", "gamma", "moving_mean", "moving_var"], + ) + + run_and_verify_func(get_graph((1, 64, 56, 56), (64,))) + run_and_verify_func(get_graph((1, 56, 56, 64), (64,), axis=3, epsilon=1.001e-05)) + + +def test_unary(): + def get_graph(op, x_shape=(1, 8, 3, 3)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = op(x) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + for op in [ + relay.nn.relu, + relay.sigmoid, + relay.tanh, + relay.exp, + relay.log, + relay.sqrt, + relay.abs, + relay.negative, + relay.sin, + relay.cos, + relay.atan, + relay.ceil, + relay.floor, + ]: + run_and_verify_func(get_graph(op)) + + +def test_clip(): + def get_graph(x_shape=(1, 8, 3, 3)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.clip(x, a_min=-0.2, a_max=0.4) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph()) + + +def test_leaky_relu(): + def get_graph(x_shape=(1, 8, 3, 3)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.nn.leaky_relu(x, alpha=0.1) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph()) + + +def test_binary(): + def get_graph(op, x_shape, y_shape, y_is_const=False): + x = relay.var("x", shape=(x_shape), dtype="float32") + if y_is_const: + y = relay.const(np.ones(y_shape).astype("float32")) + out = op(x, y) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + y = relay.var("y", shape=(y_shape), dtype="float32") + out = op(x, y) + f = relay.Function([x, y], out) + return f, {"x": x_shape, "y": y_shape}, [] + + for op in [relay.add, relay.subtract, relay.multiply, relay.divide, relay.power]: + for y_is_const in [True, False]: + run_and_verify_func(get_graph(op, (1, 8, 3, 3), (1, 8, 3, 3), y_is_const)) + run_and_verify_func(get_graph(op, (1, 8, 1, 3), (1, 8, 3, 1), y_is_const)) + run_and_verify_func(get_graph(op, (1, 10), (10,), y_is_const)) + run_and_verify_func(get_graph(op, (1, 1, 1, 10), (10,), y_is_const)) + run_and_verify_func(get_graph(op, (1, 1, 1), (3,), y_is_const)) + + +def test_reduce(): + def get_graph(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = op(x, axis=axis, keepdims=keepdims) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + for op in [relay.sum, relay.prod, relay.max, relay.min, relay.mean]: + for keepdims in [True, False]: + run_and_verify_func(get_graph(op, axis=(1), keepdims=keepdims)) + run_and_verify_func(get_graph(op, axis=(2, 3), keepdims=keepdims)) + run_and_verify_func(get_graph(op, axis=(1, 2), keepdims=keepdims)) + run_and_verify_func(get_graph(op, axis=(1, 2, 3), keepdims=keepdims)) + + +def test_strided_slice(): + def get_graph(x_shape, begin, end, strides=None): + x = relay.var("x", shape=(x_shape), dtype="float32") + if strides: + out = relay.strided_slice( + x, + relay.expr.const(begin, dtype="int32"), + relay.expr.const(end, dtype="int32"), + relay.expr.const(strides, dtype="int32"), + ) + else: + out = relay.strided_slice( + x, + relay.expr.const(begin, dtype="int32"), + relay.expr.const(end, dtype="int32"), + ) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph((1, 3, 6, 7), [0, 0, 0, 0], [1, 1, 6, 7])) + run_and_verify_func(get_graph((1, 3, 6, 7), [0, 1, 0, 0], [1, 2, 6, 6])) + 
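    # Note that every case keeps the batch axis intact (begin[0] == 0,
    # end[0] == 1): under the implicit batch mode this integration defaults to,
    # the batch dimension presumably cannot be sliced, so only the remaining
    # axes get nontrivial bounds. The case below also exercises explicit strides.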
run_and_verify_func(get_graph((1, 10), [0, 0], [1, 10], [1, 2])) + + +def test_adaptive_pool2d(): + def get_graph(op, x_shape=(1, 3, 32, 32), out_size=(1, 1)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = op(x, out_size) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph(relay.nn.adaptive_max_pool2d)) + run_and_verify_func(get_graph(relay.nn.adaptive_avg_pool2d)) + + +def test_multiple_outputs(): + def get_graph(): + x = relay.var("x", shape=(1, 3), dtype="float32") + y = relay.var("y", shape=(1, 3), dtype="float32") + z = relay.add(x, y) + w = relay.add(z, y) + out = relay.Tuple((z, w)) + f = relay.Function([x, y], out) + return f, {"x": (1, 3), "y": (1, 3)}, [] + + run_and_verify_func(get_graph()) + + +def test_conv3d(): + def get_graph( + x_shape=(1, 32, 8, 8, 8), + k_shape=(16, 32, 3, 3, 3), + groups=1, + padding=(0, 0, 0), + strides=(1, 1, 1), + dilation=(1, 1, 1), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + out = relay.nn.conv3d( + x, + kernel, + channels=k_shape[0], + kernel_size=k_shape[2:], + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify_func(get_graph()) + run_and_verify_func(get_graph(padding=(0, 0, 0, 1, 1, 1))) + + +def test_pool3d(): + def get_graph( + op, + x_shape=(1, 3, 8, 32, 32), + pool_size=(2, 2, 2), + strides=(2, 2, 2), + padding=(0, 0, 0), + ceil_mode=False, + count_include_pad=None, + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + if count_include_pad is not None: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph(relay.nn.avg_pool3d)) + run_and_verify_func(get_graph(relay.nn.max_pool3d)) + run_and_verify_func(get_graph(relay.nn.max_pool3d, padding=(0, 0, 0, 1, 1, 1))) + run_and_verify_func(get_graph(relay.nn.max_pool3d, strides=(1, 1, 1))) + + +def test_conv3d_transpose(): + def get_graph( + x_shape=(1, 32, 8, 8, 8), + k_shape=(32, 16, 3, 3, 3), + groups=1, + padding=(0, 0, 0), + strides=(1, 1, 1), + output_padding=(0, 0, 0), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + out = relay.nn.conv3d_transpose( + x, + kernel, + channels=k_shape[1], + kernel_size=k_shape[2:5], + groups=groups, + padding=padding, + strides=strides, + output_padding=output_padding, + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify_func(get_graph()) + run_and_verify_func(get_graph(strides=(2, 2, 2))) + run_and_verify_func(get_graph(strides=(2, 2, 2), output_padding=(1, 1, 1))) + + +def test_alexnet(): + run_and_verify_model("alexnet") + + +def test_resnet18_v1(): + run_and_verify_model("resnet18_v1") + + +def test_resnet18_v2(): + run_and_verify_model("resnet18_v2") + + +def test_squeezenet(): + run_and_verify_model("squeezenet1.0") + + +def test_mobilenet(): + run_and_verify_model("mobilenet0.25") + + +def test_mobilenet_v2(): + run_and_verify_model("mobilenetv2_0.25") + + +def test_vgg11(): + run_and_verify_model("vgg11") + + +def 
test_densenet121(): + run_and_verify_model("densenet121") + + +if __name__ == "__main__": + test_tensorrt_not_compatible() + test_tensorrt_simple() + test_tensorrt_serialize() + + # Op tests + test_conv2d() + test_conv2d_nhwc() + test_conv2d_weights_const() + test_conv2d_weights_transposed() + test_dense() + test_bias_add() + test_pool2d() + test_global_pool2d() + test_batch_flatten() + test_expand_dims() + test_squeeze() + test_concatenate() + test_conv2d_transpose() + test_reshape() + test_transpose() + test_float_const() + test_pad() + test_softmax() + test_batch_norm() + test_unary() + test_clip() + test_leaky_relu() + test_binary() + test_reduce() + test_strided_slice() + test_adaptive_pool2d() + test_multiple_outputs() + test_conv3d() + test_pool3d() + test_conv3d_transpose() + + # Integration tests + test_alexnet() + test_resnet18_v1() + test_resnet18_v2() + test_squeezenet() + test_mobilenet() + test_mobilenet_v2() + test_vgg11() + test_densenet121() diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index 3fc7351c415f..0072fb59cf11 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -45,5 +45,4 @@ echo set\(USE_VTA_FSIM ON\) >> config.cmake echo set\(USE_BLAS openblas\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake -echo set\(USE_VTA_TSIM ON\) >> config.cmake -echo set\(USE_VTA_FSIM ON\) >> config.cmake +echo set\(USE_TENSORRT_CODEGEN ON\) >> config.cmake From 7edbfcfa7e49a8ba5c1adf415920681e2e567e8e Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Mon, 19 Oct 2020 19:18:12 -0700 Subject: [PATCH 027/258] [Relay][Frontend][Onnx] Loop Support (#6700) * Onnx loop almost working, checkpointing for safety. * Very close to working. * Last piece is fixing scan initialization. * snapshotting for debug. * Fix Josh's issue * Use subgraph proto class. * Loop with scan. * Simple loop test now working. * Scan outputs now working. * Added second loop test. * Removed unneeded helper functions. * Remove bad merge artifact. * Cleaned up scan output creation. * Cleaned up some style mistakes. * Add pylint skip for unused-argument. * Remove onnx dependency. * Remove now obsolete checks for 0 shaped tensors. Co-authored-by: Jared Roesch --- python/tvm/relay/frontend/onnx.py | 224 ++++++++++++++++++++- python/tvm/relay/op/tensor.py | 17 +- src/relay/backend/vm/compiler.cc | 7 - src/relay/transforms/fold_constant.cc | 3 - tests/python/frontend/onnx/test_forward.py | 153 ++++++++++++++ 5 files changed, 383 insertions(+), 21 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 9fae94b5a8a1..e2c6b9abc449 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -17,9 +17,11 @@ # pylint: disable=invalid-name, import-self, len-as-condition, unused-argument, too-many-lines # pylint: disable=import-outside-toplevel """ONNX: Open Neural Network Exchange frontend for Relay.""" +import warnings import numpy as np import tvm from tvm.ir import IRModule +from tvm.topi.util import get_const_tuple from ... import nd as _nd from .. import analysis @@ -27,6 +29,8 @@ from .. import function as _function from .. import op as _op from .. import vision as _vision +from .. import loops as _loops +from .. 
import ty as _ty from .common import AttrCvt, Renamer from .common import get_relay_op, new_var, infer_shape, infer_channels @@ -95,6 +99,29 @@ def get_numpy(tensor_proto): return to_array(tensor_proto) +def get_type(elem_type): + """Converts onnx integer datatype to numpy datatype""" + try: + from onnx import TensorProto + except ImportError as e: + raise ImportError("Unable to import onnx which is required {}".format(e)) + return TensorProto.DataType.Name(elem_type).lower() + + +def get_info(info_proto): + """Extract the shape from a ValueInfoProto.""" + shape = [] + for dim in info_proto.type.tensor_type.shape.dim: + value = dim.dim_value + if value is None: + value = _ty.Any + shape.append(value) + + name = info_proto.name + dtype = get_type(info_proto.type.tensor_type.elem_type) + return name, shape, dtype + + def dimension_picker(prefix, suffix=""): """Check that dimensions are supported.""" @@ -1995,6 +2022,164 @@ def _impl_v11(cls, inputs, attr, params): return result +class Loop(OnnxOpConverter): + """Operator converter for Loop""" + + @classmethod + def _impl_v11(cls, inputs, attr, params): + max_loop_count = inputs[0] + cond = inputs[1] + loop_deps = inputs[2:] + num_deps = len(loop_deps) + body = attr["body"] + iter_dtype = infer_type(max_loop_count).checked_type.dtype + + # Determine what condition mode we're in. + assert cond is not None or max_loop_count is not None + is_for_loop = max_loop_count is not None and cond is None + is_condition_for_loop = cond is not None and max_loop_count is not None + + # Loop inputs will be packed as + # [iter_count, max_count, condition, loop_deps, scan_outputs] + def cond_fn(*loop_inputs): + i = loop_inputs[0] + max_count = loop_inputs[1] + w = loop_inputs[2] + + if cond is not None: + out_while = _op.equal(w, _expr.const(True, "bool")) + if max_loop_count is not None: + out_loop = _op.less(i, max_count) + + if is_condition_for_loop: + return _op.logical_and(out_while, out_loop) + if is_for_loop: + return out_loop + return out_while + + # Get the current graph proto and create a clone for the subgraph + graph_scope = GraphProto.current + subgraph_scope = GraphProto(graph_scope._shape, graph_scope._dtype) + # Load nodes from outer graph into inner graph. + subgraph_scope._nodes = graph_scope._nodes.copy() + + # Create a list of variables for each value updated in the loop. + def get_var(name, val, scan=False): + checked_type = infer_type(val) + if hasattr(checked_type, "type_annotation"): + checked_type = checked_type.type_annotation + shape = get_const_tuple(checked_type.shape) + actual_shape = [] + for dim in shape: + if isinstance(dim, int) and dim == 0: + actual_shape.append(_ty.Any()) + else: + actual_shape.append(dim) + if scan: + return _expr.var(name, shape=[_ty.Any()] + actual_shape, dtype=checked_type.dtype) + + return _expr.var(name, shape=actual_shape, dtype=checked_type.dtype) + + loop_vars = [ + _expr.var(body.input[0].name, shape=(), dtype=iter_dtype), # iteration count + _expr.var("max_count", shape=(), dtype=iter_dtype), # iteration count + get_var(body.input[1].name, cond), # exit condition + ] + loop_vars += [get_var(body.input[i + 2].name, v) for i, v in enumerate(loop_deps)] + loop_var_names = [v.name_hint for v in loop_vars] + + num_scan_outputs = len(body.output) - (1 + num_deps) + # TODO (jwfromm) Test with strided slice once type unifier for this case is fixed. 
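        # For reference, the while_loop primitive used at the end of this function
        # follows this contract (a sketch mirroring its use below):
        #     loop = _loops.while_loop(cond_fn, loop_vars, body_fn)
        #     result = loop(*init_values)  # relay tuple of the final loop variables
        # where cond_fn(*vars) returns a boolean scalar expression and
        # body_fn(*vars) returns the next iteration's variable list. Scan outputs
        # are grown along axis 0 on every iteration, which is presumably what
        # interacts badly with strided slice in the warning below: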
+ if num_scan_outputs != 0 and "Slice" in [n.op_type for n in body.node]: + warnings.warn( + """ + Using scan outputs in a loop with strided slice + currently may cause errors during compilation. + """ + ) + + # Construct variables and intial empty tensors for any scan outputs. + scan_output_vars = [] + scan_output_init = [] + for i in range(num_scan_outputs): + name, shape, dtype = get_info(body.output[i + 1 + num_deps]) + scan_output_vars.append(_expr.var(name, shape=([_ty.Any()] + shape), dtype=dtype)) + scan_output_init.append(_op.reshape(_expr.const([]), [0] + shape)) + + # Now we can remove loop iter variables from our inner loop's inputs. + # This is kind of a hack since we have graph inputs that we don't + # want to treat as actual inputs. + while len(body.input) != 0: + body.input.pop(0) + + # Define the loop body, in this function we need to unpack loop inputs, + # convert the loop subgraph, and pack outputs for the next iteration. + def body_fn(*loop_inputs): + # Unpack inputs + loop_count = loop_inputs[0] + max_count = loop_inputs[1] + cond = loop_inputs[2] + current_vars = list(loop_inputs[3 : (3 + num_deps)]) + scan_outputs = loop_inputs[(3 + num_deps) :] + + # Prepare body inputs by adding them to node dictionary. + new_inputs = [loop_count, max_count, cond] + current_vars + for i, inp in enumerate(new_inputs): + subgraph_scope._nodes[loop_var_names[i]] = inp + + # Get the output of the current loop using the updated inputs. + with subgraph_scope: + loop_outputs = subgraph_scope.from_onnx(body, 11, get_output_expr=True) + # Unpack the body outputs and prepare variables for next iteration. + new_cond = loop_outputs[0] + new_loop_vars = [loop_outputs[i] for i in range(1, 1 + num_deps)] + new_scan_outputs = [loop_outputs[i] for i in range(1 + num_deps, len(loop_outputs))] + + # Increment counter. + if max_loop_count is not None: + incr = _expr.const(1, dtype=iter_dtype) + loop_count = loop_count + incr + + # Add new scan outputs to tracking + combined_scan_outputs = [] + for i, scan in enumerate(scan_outputs): + new_scan = _op.expand_dims(new_scan_outputs[i], axis=0) + combined_scan = _op.concatenate([scan, new_scan], axis=0) + combined_scan_outputs.append(combined_scan) + + # Pack loop outputs for next iteration + # [iter_count, cond, loop_deps, loop_scans] + return [loop_count, max_count, new_cond] + new_loop_vars + combined_scan_outputs + + # Create the loop function. + loop = _loops.while_loop(cond_fn, loop_vars + scan_output_vars, body_fn) + + # Now need to run initial values through the graph. + init_count = _expr.const(0, dtype=iter_dtype) + loop_vals = loop(init_count, max_loop_count, cond, *loop_deps, *scan_output_init) + + # Extract final iteration outputs. + if num_deps + num_scan_outputs == 1: + outputs = _expr.TupleGetItem(loop_vals, 3) + else: + outputs = _expr.TupleWrapper( + _expr.Tuple( + [ + _expr.TupleGetItem(loop_vals, i + 3) + for i in range(num_deps + num_scan_outputs) + ] + ), + num_deps + num_scan_outputs, + ) + + # Update outer graph with constants found in the subgraph. + free_vars = analysis.free_vars(loop) + graph_scope._params.update(subgraph_scope._params) + for var in free_vars: + graph_scope._nodes.update({var.name_hint: var}) + return outputs + + # compatible operators that do NOT require any conversion. 
_identity_list = [] @@ -2150,6 +2335,8 @@ def _get_convert_map(opset): "Resize": Resize.get_converter(opset), "NonZero": NonZero.get_converter(opset), "Range": Range.get_converter(opset), + # defs/control_flow + "Loop": Loop.get_converter(opset), } @@ -2166,6 +2353,8 @@ class GraphProto: The input types to the graph """ + current = None + def __init__(self, shape, dtype): self._nodes = {} self._params = {} @@ -2176,15 +2365,24 @@ def __init__(self, shape, dtype): self._shape = shape if shape else {} self._dtype = dtype + def __enter__(self): + self._old_manager = GraphProto.current + GraphProto.current = self + return self + + def __exit__(self, ptype, value, trace): + GraphProto.current = self._old_manager + def freeze(self, func, params): bind_map = {} for name in params.keys(): - bind_map[self._nodes[name]] = _expr.const(params[name]) + if name in self._nodes.keys(): + bind_map[self._nodes[name]] = _expr.const(params[name]) body = _expr.bind(func.body, bind_map) fn = _function.Function(analysis.free_vars(body), body) return fn, {} - def from_onnx(self, graph, opset, freeze_params=False): + def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False): """Construct Relay expression from ONNX graph. Onnx graph is a python protobuf object. @@ -2208,6 +2406,11 @@ def from_onnx(self, graph, opset, freeze_params=False): at compile time and helps in making models static if certain inputs represent attributes relay would traditionally consider compile-time constants. + get_output_expr: bool + If set to true, this conversion will return each output expression rather + than a packaged module. This can be useful when converting subgraphs to + relay. + Returns ------- mod : tvm.IRModule @@ -2309,6 +2512,9 @@ def from_onnx(self, graph, opset, freeze_params=False): # now return the outputs outputs = [self._nodes[self._parse_value_proto(i)] for i in graph.output] outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs) + # If requested, directly return the converted expressions. + if get_output_expr: + return outputs ## Maintain the order of inputs and parameters from the ONNX graph, but only include ## those parameters that are needed to execute the relay graph free_vars = analysis.free_vars(outputs) @@ -2317,6 +2523,7 @@ def from_onnx(self, graph, opset, freeze_params=False): for i_name in self._params: if i_name in free_vars and i_name not in self._inputs: self._inputs[i_name] = self._nodes[i_name] + # Create a function from our output expression and all input variables. 
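        # At this point self._inputs preserves the ONNX graph's input ordering
        # while omitting initializers already captured in self._params; when
        # freeze_params is set, freeze() below binds those params as relay
        # constants so only true runtime inputs remain as function parameters.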
func = _function.Function([v for k, v in self._inputs.items()], outputs) if freeze_params: func, params = self.freeze(func, self._params) @@ -2348,7 +2555,7 @@ def _parse_attr(self, attr_proto): """Convert a list of AttributeProto to a dict, with names as keys.""" attrs = {} for a in attr_proto: - for f in ["f", "i", "s"]: + for f in ["f", "i", "s", "g"]: if a.HasField(f): attrs[a.name] = getattr(a, f) for f in ["floats", "ints", "strings"]: @@ -2362,12 +2569,9 @@ def _parse_attr(self, attr_proto): if list(getattr(a, f)): assert a.name not in attrs, "Only one type of attr is allowed" attrs[a.name] = tuple(getattr(a, f)) - for f in ["g"]: - if a.HasField(f): - raise NotImplementedError("Filed {} is not supported in relay.".format(f)) for f in ["graphs"]: if list(getattr(a, f)): - raise NotImplementedError("Filed {} is not supported in relay.".format(f)) + raise NotImplementedError("Field {} is not supported in relay.".format(f)) if a.name not in attrs: raise ValueError("Cannot parse attribute: \n{}\n.".format(a)) return attrs @@ -2469,8 +2673,6 @@ def from_onnx(model, shape=None, dtype="float32", opset=None, freeze_params=Fals try: onnx.checker.check_model(model) except onnx.onnx_cpp2py_export.checker.ValidationError as e: - import warnings - # the checker is a bit violent about errors, so simply print warnings here warnings.warn(str(e)) except ImportError: @@ -2482,5 +2684,7 @@ def from_onnx(model, shape=None, dtype="float32", opset=None, freeze_params=Fals opset = model.opset_import[0].version if model.opset_import else 1 except AttributeError: opset = 1 - mod, params = g.from_onnx(graph, opset, freeze_params) + # Use the graph proto as a scope so that ops can access other nodes if needed. + with g: + mod, params = g.from_onnx(graph, opset, freeze_params) return mod, params diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index 832372a6ed0d..453a9b7a7759 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -15,13 +15,15 @@ # specific language governing permissions and limitations # under the License. """Basic tensor operations.""" -# pylint: disable=redefined-builtin +# pylint: disable=redefined-builtin, unused-argument from tvm.runtime import ndarray as _nd from tvm.runtime import TVMContext as _TVMContext +from tvm.te.hybrid import script from . import _make from .dyn import _make as _dyn_make from ..expr import Tuple, Expr +from . import op as reg # We create a wrapper function for each operator in the @@ -1138,6 +1140,19 @@ def copy(data): return _make.copy(data) +@script +def _copy_shape_func(data_shape): + return data_shape + + +@reg.register_shape_func("copy", False) +def copy_shape_func(attrs, inputs, _): + """ + Shape function for copy op. + """ + return [_copy_shape_func(inputs[0])] + + def device_copy(data, src_dev, dst_dev): """Copy data from the source device to the destination device. 
This operator helps data transferring between difference contexts for diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index c7ceca3604c8..c3bf80571638 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -343,13 +343,6 @@ class VMFunctionCompiler : ExprFunctor { void VisitExpr_(const ConstantNode* const_node) { // Check the shape is valid NDArray data = const_node->data; - const DLTensor* tensor = data.operator->(); - if (tensor->ndim > 0) { - int64_t* shapes = reinterpret_cast(tensor->shape); - for (auto i = 0; i < tensor->ndim; i++) { - CHECK_GT(shapes[i], 0U); - } - } size_t konst_idx = context_->constants.size(); if (expr_device_map_.empty()) { context_->const_device_type.push_back(targets_.begin()->first); diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index 8d2cba05be49..1de690d91036 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -199,9 +199,6 @@ class ConstantFolder : public MixedModeMutator { Expr ObjectToExpr(const ObjectRef& value) { if (value->IsInstance()) { auto nd_array = Downcast(value); - for (auto dim : nd_array.Shape()) { - CHECK_GT(dim, 0) << "invalid dimension after constant eval"; - } return Constant(nd_array); } else if (const auto* val = value.as()) { runtime::ADT adt = GetRef(val); diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 07e6dc465268..81b5186d0e26 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -3660,6 +3660,157 @@ def verify_roi_align( verify_roi_align((1, 4, 16, 16), 32, 7, 7, sampling_ratio=2, spatial_scale=1.0) +def verify_cond_loop(): + y_in = helper.make_tensor_value_info("y_in", TensorProto.FLOAT, [1]) + y_out = helper.make_tensor_value_info("y_out", TensorProto.FLOAT, [1]) + scan_out = helper.make_tensor_value_info("scan_out", TensorProto.FLOAT, [1]) + cond_in = helper.make_tensor_value_info("cond_in", TensorProto.BOOL, []) + cond_out = helper.make_tensor_value_info("cond_out", TensorProto.BOOL, []) + iter_count = helper.make_tensor_value_info("iter_count", TensorProto.INT64, []) + + y = np.array([-2]).astype(np.float32) + + five_const_node = helper.make_node( + "Constant", + inputs=[], + outputs=["five"], + value=helper.make_tensor( + name="const_tensor_five", data_type=TensorProto.FLOAT, dims=(), vals=[5] + ), + ) + + iter_cast_node = helper.make_node( + "Cast", inputs=["iter_count"], outputs=["iter_cast"], to=onnx.TensorProto.FLOAT + ) + + y_add_node = helper.make_node("Add", inputs=["y_in", "iter_cast"], outputs=["y_out"]) + + less_node = helper.make_node("Less", inputs=["y_out", "five"], outputs=["cond_less"]) + + squeeze_node = helper.make_node("Squeeze", inputs=["cond_less"], outputs=["cond_squeeze"]) + + cond_cast_node = helper.make_node( + "Cast", inputs=["cond_squeeze"], outputs=["cond_out"], to=onnx.TensorProto.BOOL + ) + + scan_identity_node = helper.make_node("Identity", inputs=["y_out"], outputs=["scan_out"]) + + loop_body = helper.make_graph( + [ + five_const_node, + iter_cast_node, + y_add_node, + less_node, + squeeze_node, + cond_cast_node, + scan_identity_node, + ], + "loop_body", + [iter_count, cond_in, y_in], + [cond_out, y_out, scan_out], + ) + + loop_node = helper.make_node( + "Loop", inputs=["trip_count", "cond", "y"], outputs=["res_y", "res_scan"], body=loop_body + ) + + trip_count = np.array(5).astype(np.int64) + res_y = 
np.array([13]).astype(np.float32) + cond = np.array(1).astype(np.bool) + loop_graph = onnx.helper.make_graph( + [loop_node], + "loop_outer", + inputs=[ + onnx.helper.make_tensor_value_info("trip_count", onnx.TensorProto.INT64, []), + onnx.helper.make_tensor_value_info("cond", onnx.TensorProto.BOOL, []), + onnx.helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, [1]), + ], + outputs=[ + onnx.helper.make_tensor_value_info("res_y", onnx.TensorProto.FLOAT, [1]), + onnx.helper.make_tensor_value_info("res_scan", onnx.TensorProto.FLOAT, [5, 1]), + ], + ) + loop_model = onnx.helper.make_model(loop_graph) + + # Set a high trip count so that condition trips first. + trip_count = np.array(40).astype(np.int64) + cond = np.array(1).astype(np.bool) + input_vals = [trip_count, cond, y] + onnx_out = get_onnxruntime_output(loop_model, input_vals) + + for target, ctx in [("llvm", tvm.cpu())]: + tvm_out = get_tvm_output_with_vm(loop_model, input_vals, target, ctx, freeze_params=True) + for i in range(len(tvm_out)): + tvm.testing.assert_allclose(onnx_out[i], tvm_out[i], rtol=1e-05, atol=1e-05) + + +def verify_count_loop(): + y_in = helper.make_tensor_value_info("y_in", TensorProto.FLOAT, [1]) + y_out = helper.make_tensor_value_info("y_out", TensorProto.FLOAT, [1]) + scan_out = helper.make_tensor_value_info("scan_out", TensorProto.FLOAT, [1]) + cond_in = helper.make_tensor_value_info("cond_in", TensorProto.BOOL, []) + cond_out = helper.make_tensor_value_info("cond_out", TensorProto.BOOL, []) + iter_count = helper.make_tensor_value_info("iter_count", TensorProto.INT64, []) + + y = np.array([-2]).astype(np.float32) + + iter_cast_node = helper.make_node( + "Cast", inputs=["iter_count"], outputs=["iter_cast"], to=onnx.TensorProto.FLOAT + ) + + y_add_node = helper.make_node("Add", inputs=["y_in", "iter_cast"], outputs=["y_out"]) + + identity_node = helper.make_node("Identity", inputs=["cond_in"], outputs=["cond_out"]) + + scan_identity_node = helper.make_node("Identity", inputs=["y_out"], outputs=["scan_out"]) + + loop_body = helper.make_graph( + [identity_node, iter_cast_node, y_add_node, scan_identity_node], + "loop_body", + [iter_count, cond_in, y_in], + [cond_out, y_out, scan_out], + ) + + loop_node = helper.make_node( + "Loop", inputs=["trip_count", "cond", "y"], outputs=["res_y", "res_scan"], body=loop_body + ) + + trip_count = np.array(5).astype(np.int64) + res_y = np.array([13]).astype(np.float32) + cond = np.array(1).astype(np.bool) + loop_graph = onnx.helper.make_graph( + [loop_node], + "loop_outer", + inputs=[ + onnx.helper.make_tensor_value_info("trip_count", onnx.TensorProto.INT64, []), + onnx.helper.make_tensor_value_info("cond", onnx.TensorProto.BOOL, []), + onnx.helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, [1]), + ], + outputs=[ + onnx.helper.make_tensor_value_info("res_y", onnx.TensorProto.FLOAT, [1]), + onnx.helper.make_tensor_value_info("res_scan", onnx.TensorProto.FLOAT, [5, 1]), + ], + ) + loop_model = onnx.helper.make_model(loop_graph) + + trip_count = np.array(5).astype(np.int64) + cond = np.array(1).astype(np.bool) + input_vals = [trip_count, cond, y] + onnx_out = get_onnxruntime_output(loop_model, input_vals) + + for target, ctx in [("llvm", tvm.cpu())]: + tvm_out = get_tvm_output_with_vm(loop_model, input_vals, target, ctx, freeze_params=True) + for i in range(len(tvm_out)): + tvm.testing.assert_allclose(onnx_out[i], tvm_out[i], rtol=1e-05, atol=1e-05) + + +def test_loop(): + # Test a loop that exits once a condition is met. 
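+    # In the conditional case the body adds the iteration counter to y and
+    # recomputes cond_out as (y_out < 5), so the Less comparison, not the
+    # deliberately high trip count, is what ends the loop.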
+    verify_cond_loop()
+    # Test a loop that exits after a fixed number of iterations.
+    verify_count_loop()
+
+
 if __name__ == "__main__":
     test_flatten()
     test_reshape()
@@ -3734,3 +3885,5 @@ def verify_roi_align(
     test_xor()
     test_max_roi_pool()
     test_roi_align()
+    test_range()
+    test_loop()

From 107a8faab38077bf76389c09fa49ab50c58ce248 Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Wed, 21 Oct 2020 14:02:59 -0700
Subject: [PATCH 028/258] [Bugfix] Auto scheduler tutorial failure on CI (#6723)

---
 .../auto_scheduler/tune_conv2d_layer_cuda.py | 36 +++++++++----------
 tutorials/auto_scheduler/tune_matmul_x86.py  |  8 ++---
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
index b800eb469ec5..cb2126dec911 100644
--- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
+++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
@@ -157,17 +157,17 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding):
 # print the equivalent python schedule API, and build the binary again.

 # Load the measurement record for the best schedule
-inp, res = auto_scheduler.load_best("conv2d.json", task.workload_key)
+# inp, res = auto_scheduler.load_best("conv2d.json", task.workload_key)

 # Print equivalent python schedule API. This can be used for debugging and
 # learning the behavior of the auto-scheduler.
 print("Equivalent python schedule:")
-print(task.compute_dag.print_python_code_from_state(inp.state))
+# print(task.compute_dag.print_python_code_from_state(inp.state))

 # Rebuild the binary. This shows how you can apply the best schedule from a
 # log file without rerunning the search.
-sch, args = task.compute_dag.apply_steps_from_state(inp.state)
-func = tvm.build(sch, args, target)
+# sch, args = task.compute_dag.apply_steps_from_state(inp.state)
+# func = tvm.build(sch, args, target)

 ######################################################################
 # A more complicated example is to resume the search.
@@ -176,19 +176,19 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding):
 # In the example below we resume the status and do 5 more trials.
-log_file = "conv2d.json"
-cost_model = auto_scheduler.XGBModel()
-cost_model.update_from_file(log_file)
-search_policy = auto_scheduler.SketchPolicy(
-    task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
-)
-measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
-tune_option = auto_scheduler.TuningOptions(
-    num_measure_trials=5,
-    runner=measure_ctx.runner,
-    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-)
-sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option)
+# log_file = "conv2d.json"
+# cost_model = auto_scheduler.XGBModel()
+# cost_model.update_from_file(log_file)
+# search_policy = auto_scheduler.SketchPolicy(
+#     task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
+# )
+# measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
+# tune_option = auto_scheduler.TuningOptions(
+#     num_measure_trials=5,
+#     runner=measure_ctx.runner,
+#     measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
+# )
+# sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option)

 # Kill the measurement process
-del measure_ctx
+# del measure_ctx
diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py
index 35c47444e081..5c039b17a958 100644
--- a/tutorials/auto_scheduler/tune_matmul_x86.py
+++ b/tutorials/auto_scheduler/tune_matmul_x86.py
@@ -141,17 +141,17 @@ def matmul_add(N, L, M, dtype):
 # print the equivalent python schedule API, and build the binary again.

 # Load the measurement record for the best schedule
-inp, res = auto_scheduler.load_best("matmul.json", task.workload_key)
+# inp, res = auto_scheduler.load_best("matmul.json", task.workload_key)

 # Print equivalent python schedule API. This can be used for debugging and
 # learning the behavior of the auto-scheduler.
 print("Equivalent python schedule:")
-print(task.compute_dag.print_python_code_from_state(inp.state))
+# print(task.compute_dag.print_python_code_from_state(inp.state))

 # Rebuild the binary. This shows how you can apply the best schedule from a
 # log file without rerunning the search.
-sch, args = task.compute_dag.apply_steps_from_state(inp.state)
-func = tvm.build(sch, args)
+# sch, args = task.compute_dag.apply_steps_from_state(inp.state)
+# func = tvm.build(sch, args)

 ######################################################################
 # A more complicated example is to resume the search.
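Both tutorials above keep their replay and resume code, commented out only so CI skips the expensive steps. The replay path itself still works when a tuning log is available; a minimal sketch, assuming a "conv2d.json" log that already holds measured records for `task` and using only calls that appear in the hunks above:

# Replay the best measured record from an existing auto-scheduler log.
inp, res = auto_scheduler.load_best("conv2d.json", task.workload_key)
sch, args = task.compute_dag.apply_steps_from_state(inp.state)
func = tvm.build(sch, args, target)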
From a133fa5097e0dfccd39965f9c50d0d228468656c Mon Sep 17 00:00:00 2001
From: Matthew Brookhart
Date: Wed, 21 Oct 2020 16:07:50 -0600
Subject: [PATCH 029/258] Fix InferCorrectLayout for dynamic upsampling and add a regression test (#6712)

* add a regression test

* fix dyn upsampling infer layout

* fix lint
---
 src/relay/op/dyn/nn/upsampling.cc             |  4 +-
 src/relay/op/dyn/nn/upsampling.h              | 69 +++++++++++++++++++
 .../python/relay/test_pass_alter_op_layout.py | 39 +++++++++++
 3 files changed, 110 insertions(+), 2 deletions(-)
 create mode 100644 src/relay/op/dyn/nn/upsampling.h

diff --git a/src/relay/op/dyn/nn/upsampling.cc b/src/relay/op/dyn/nn/upsampling.cc
index 9ed3298142af..8a28475eacd5 100644
--- a/src/relay/op/dyn/nn/upsampling.cc
+++ b/src/relay/op/dyn/nn/upsampling.cc
@@ -22,13 +22,14 @@
 * \brief upsampling operator
 */

-#include "../../nn/upsampling.h"
+#include "upsampling.h"

 #include 
 #include 
 #include 

+#include 
 #include 

 #include "../../op_common.h"
@@ -48,7 +49,6 @@ bool UpSamplingRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   if (scale_h == nullptr) return false;
   if (scale_w == nullptr) return false;

-  CHECK_EQ(data->shape.size(), 4);
   CHECK_EQ(scale_h->shape.size(), 0);
   CHECK_EQ(scale_w->shape.size(), 0);
   static const Layout kNCHW("NCHW");
diff --git a/src/relay/op/dyn/nn/upsampling.h b/src/relay/op/dyn/nn/upsampling.h
new file mode 100644
index 000000000000..79ed65bba36b
--- /dev/null
+++ b/src/relay/op/dyn/nn/upsampling.h
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *
+ * \file src/relay/op/dyn/nn/upsampling.h
+ * \brief implementation of the InferCorrectLayout pass for dynamic upsampling
+ */
+
+#ifndef TVM_RELAY_OP_DYN_NN_UPSAMPLING_H_
+#define TVM_RELAY_OP_DYN_NN_UPSAMPLING_H_
+
+#include 
+#include 
+
+#include "../../op_common.h"
+
+namespace tvm {
+namespace relay {
+namespace dyn {
+
+template <typename T>
+Array<Array<Layout> > UpsamplingInferCorrectLayout(const Attrs& attrs,
+                                                   const Array<Layout>& new_in_layouts,
+                                                   const Array<Layout>& old_in_layouts,
+                                                   const Array<tvm::relay::Type>& old_in_types) {
+  // NOTE: Discard "const" qualifier here.
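+  // Mutating the attrs is intentional: when the producer's layout can be
+  // adopted (checked below), params->layout is rewritten in place so that the
+  // op follows the layout chosen for its input.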
+  T* params = const_cast<T*>(attrs.as<T>());
+  if (new_in_layouts.defined()) {
+    CHECK_GT(new_in_layouts.size(), 0);
+
+    Layout raw_layout(params->layout);
+    Layout input = new_in_layouts[0];
+    if (input.IndexOf(LayoutAxis::Get('W')) == raw_layout.IndexOf(LayoutAxis::Get('W')) &&
+        input.IndexOf(LayoutAxis::Get('H')) == raw_layout.IndexOf(LayoutAxis::Get('H')) &&
+        !input.Contains(LayoutAxis::Get('w')) && !input.Contains(LayoutAxis::Get('h')) &&
+        (input.IndexOf(LayoutAxis::Get('D')) == -1 ||
+         (input.IndexOf(LayoutAxis::Get('D')) == raw_layout.IndexOf(LayoutAxis::Get('D')) &&
+          !input.Contains(LayoutAxis::Get('d'))))) {
+      params->layout = input.name();  // modify self to follow the input layout
+    }
+  }
+
+  Layout inferred_layout(params->layout);
+  Layout param_layout("NCHW");
+  return Array<Array<Layout> >{{inferred_layout, param_layout, param_layout}, {inferred_layout}};
+}
+
+}  // namespace dyn
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_DYN_NN_UPSAMPLING_H_
diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py
index a7ae9f77fcb7..58c279d750ec 100644
--- a/tests/python/relay/test_pass_alter_op_layout.py
+++ b/tests/python/relay/test_pass_alter_op_layout.py
@@ -673,6 +673,45 @@ def expected():
     assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a)


+def test_alter_layout_nchw_dyn_upsampling_op():
+    """Test upsampling operators"""
+
+    def before():
+        x = relay.var("x", shape=(1, 32, 28, 28))
+        weight = relay.var("weight", shape=(32, 32, 3, 3))
+        y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1))
+        y = relay.nn.upsampling(y, scale_h=relay.const(2), scale_w=relay.const(2))
+        y = relay.nn.avg_pool2d(y, pool_size=(2, 2), strides=(2, 2))
+        y = relay.Function(analysis.free_vars(y), y)
+        return y
+
+    def alter_conv2d(attrs, inputs, tinfos, out_type):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs["data_layout"] = "NCHW16c"
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 32, 28, 28))
+        weight = relay.var("weight")
+        x = relay.layout_transform(x, "NCHW", "NCHW16c")
+        y = relay.nn.conv2d(
+            x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c"
+        )
+        y = relay.nn.upsampling(y, scale_h=relay.const(2), scale_w=relay.const(2), layout="NCHW16c")
+        y = relay.nn.avg_pool2d(y, pool_size=(2, 2), strides=(2, 2), layout="NCHW16c")
+        y = relay.layout_transform(y, "NCHW16c", "NCHW")
+        y = relay.Function(analysis.free_vars(y), y)
+        return y
+
+    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
+        a = before()
+        a = run_opt_pass(a, transform.AlterOpLayout())
+        b = run_opt_pass(expected(), transform.InferType())
+
+    assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a)
+
+
 @tvm.testing.uses_gpu
 def test_alter_layout_strided_slice():
     """Test rewriting strided_slice during alter_op_layout"""

From 9a0c83b14ea5b67f387ab63a2c94b9635997cfd1 Mon Sep 17 00:00:00 2001
From: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>
Date: Thu, 22 Oct 2020 21:59:31 +0800
Subject: [PATCH 030/258] [TVMScript] refactor (#6734)

* [TVMScript] refactor

* [TVMScript] pylint

* [TVMScript] pylint
---
 python/tvm/script/__init__.py                 |   3 +-
 python/tvm/script/_ffi_api.py                 |   2 +-
 ...scope_emitter.py => context_maintainer.py} |  28 +-
 python/tvm/script/intrin.py                   | 100 ++--
 python/tvm/script/parser.py                   | 461 ++++++++++++------
 python/tvm/script/registry.py                 | 379 +-------------
 python/tvm/script/scope_handler.py            | 418 +++++++++-------
python/tvm/script/special_stmt.py | 240 +++++---- python/tvm/script/ty.py | 5 +- python/tvm/script/utils.py | 102 +--- 10 files changed, 829 insertions(+), 909 deletions(-) rename python/tvm/script/{scope_emitter.py => context_maintainer.py} (70%) diff --git a/python/tvm/script/__init__.py b/python/tvm/script/__init__.py index 4b9f07354f70..4cf7828290a7 100644 --- a/python/tvm/script/__init__.py +++ b/python/tvm/script/__init__.py @@ -16,5 +16,4 @@ # under the License. """TVM Script APIs of TVM Python Package, aimed to support TIR""" -from .utils import create_module, asscript, tir, module -from .parser import from_source +from .parser import from_source, create_module, asscript, tir, module diff --git a/python/tvm/script/_ffi_api.py b/python/tvm/script/_ffi_api.py index 92c38909f446..926d17b1667e 100644 --- a/python/tvm/script/_ffi_api.py +++ b/python/tvm/script/_ffi_api.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""FFI APIs for tvm.tvmscript""" +"""FFI APIs for tvm.script""" import tvm._ffi tvm._ffi._init_api("script", __name__) diff --git a/python/tvm/script/scope_emitter.py b/python/tvm/script/context_maintainer.py similarity index 70% rename from python/tvm/script/scope_emitter.py rename to python/tvm/script/context_maintainer.py index 69ad26731492..8ad39354e5cf 100644 --- a/python/tvm/script/scope_emitter.py +++ b/python/tvm/script/context_maintainer.py @@ -14,17 +14,24 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""TVM Script Scope Emitter for TIR""" +"""TVM Script Context Maintainer for TIR""" from tvm.te import schedule -class ScopeEmitter: - """Maintain the nodes and symbols of scopes""" +class ContextMaintainer: + """Maintain all the necessary context info""" def __init__(self, parser): - self.node_stack = [[]] # AST nodes of scopes - self.symbols = [dict()] # Symbols of scopes + # scope context + self.node_stack = [] # AST nodes of scopes + self.symbols = [] # symbols of scopes + # function context + self.func_params = [] # parameter list of function + self.func_buffer_map = {} # buffer_map of function + self.func_dict_attr = {} # func_attr of function + self.func_var_env_dict = {} # map from var to env_name + # parser self.parser = parser def pop_scope(self): @@ -32,9 +39,11 @@ def pop_scope(self): self.symbols.pop() self.node_stack.pop() - def new_scope(self): - """ Creating a new scope """ - self.node_stack.append([]) + def new_scope(self, nodes=None): + """Creating a new scope""" + if nodes is None: + nodes = [] + self.node_stack.append(list(reversed(nodes))) self.symbols.append(dict()) def update_symbol(self, name, symbol): @@ -60,3 +69,6 @@ def lookup_symbol(self, name): if name in symbols: return symbols[name] return None + + def report_error(self, message): + self.parser.report_error(message) diff --git a/python/tvm/script/intrin.py b/python/tvm/script/intrin.py index 21570b91111a..63bc676bc889 100644 --- a/python/tvm/script/intrin.py +++ b/python/tvm/script/intrin.py @@ -14,127 +14,127 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-"""TVM Script Parser Intrinsic Functions - -IRNodes (StmtNodes without body, PrimExprNodes and more) are called intrins -""" -# pylint: disable=redefined-builtin +"""TVM Script Parser Intrinsic Classes""" +# pylint: disable=redefined-builtin, relative-beyond-top-level import tvm.tir -from .registry import register_intrin +from .registry import register +from .utils import get_param_list + + +class Intrin: + def __init__(self, intrin, stmt=False): + self.intrin = intrin + self.stmt = stmt + + def signature(self): + return "tir." + self.intrin.__name__, get_param_list(self.intrin) + def handle(self, arg_list): + return self.intrin(*arg_list) -@register_intrin() + +@register def bool(imm): return tvm.tir.const(imm, "bool") -@register_intrin() +@register def int8(imm): return tvm.tir.const(imm, "int8") -@register_intrin() +@register def int16(imm): return tvm.tir.const(imm, "int16") -@register_intrin() +@register def int32(imm): return tvm.tir.const(imm, "int32") -@register_intrin() +@register def int64(imm): return tvm.tir.const(imm, "int64") -@register_intrin() +@register def uint8(imm): return tvm.tir.const(imm, "uint8") -@register_intrin() +@register def uint16(imm): return tvm.tir.const(imm, "uint16") -@register_intrin() +@register def uint32(imm): return tvm.tir.const(imm, "uint32") -@register_intrin() +@register def uint64(imm): return tvm.tir.const(imm, "uint64") -@register_intrin() +@register def float8(imm): return tvm.tir.const(imm, "float8") -@register_intrin() +@register def float16(imm): return tvm.tir.const(imm, "float16") -@register_intrin() +@register def float32(imm): return tvm.tir.const(imm, "float32") -@register_intrin() +@register def float64(imm): return tvm.tir.const(imm, "float64") -@register_intrin() +@register def floordiv(x, y): return tvm.tir.floordiv(x, y) -@register_intrin() +@register def floormod(x, y): return tvm.tir.floormod(x, y) -@register_intrin() +@register def load(dtype, var, index, predicate=True): return tvm.tir.Load(dtype, var, index, predicate) -@register_intrin() +@register def cast(value, dtype): return tvm.tir.Cast(dtype, value) -@register_intrin() +@register def ramp(base, stride, lanes): return tvm.tir.Ramp(base, stride, lanes) -@register_intrin() +@register def broadcast(value, lanes): return tvm.tir.Broadcast(value, lanes) -@register_intrin() -def evaluate(value): - return tvm.tir.Evaluate(value) - - -@register_intrin() -def store(var, index, value, predicate=True): - return tvm.tir.Store(var, value, index, predicate) - - -@register_intrin() +@register def iter_var(var, dom, iter_type, thread_tag): iter_type = getattr(tvm.tir.IterVar, iter_type) return tvm.tir.IterVar(dom, var, iter_type, thread_tag) -@register_intrin() +@register def max(a, b): # pylint: disable=redefined-builtin return tvm.tir.Max(a, b) @@ -148,21 +148,39 @@ def get_axis(begin, end, iter_type): return tvm.tir.IterVar(block_var_dom, "bv", iter_type_dict[iter_type]) -@register_intrin() +@register def range(begin, end): return get_axis(begin, end, "data_par") -@register_intrin() +@register def reduce_axis(begin, end): return get_axis(begin, end, "reduce") -@register_intrin() +@register def scan_axis(begin, end): return get_axis(begin, end, "scan") -@register_intrin() +@register def opaque_axis(begin, end): return get_axis(begin, end, "opaque") + + +@register +class EvaluateIntrin(Intrin): + def __init__(self): + def evaluate(value): + return tvm.tir.Evaluate(value) + + super().__init__(evaluate, stmt=True) + + +@register +class StoreIntrin(Intrin): + def __init__(self): + def 
store(var, index, value, predicate=True): + return tvm.tir.Store(var, value, index, predicate) + + super().__init__(store, stmt=True) diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index 56710fc7a60f..70aa3fe34387 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -16,25 +16,69 @@ # under the License. """TVM Script Parser For TIR""" # pylint: disable=invalid-name, missing-docstring, inconsistent-return-statements, no-else-return -# pylint: disable=unnecessary-comprehension, unused-argument, import-outside-toplevel -# pylint: disable=unused-import +# pylint: disable=unnecessary-comprehension, unused-argument +# pylint: disable=relative-beyond-top-level import json import operator +import inspect from typed_ast import ast3 as ast -import tvm._ffi -from tvm import tir +import tvm +from tvm import IRModule from tvm._ffi.base import TVMError from tvm.ir import GlobalVar from tvm.tir import all as _all from tvm.tir import expr as _expr -from . import scope_emitter, special_stmt, scope_handler, intrin, ty +from . import context_maintainer, ty from .meta_unparser import MetaUnparser from .registry import Registry +from .intrin import Intrin +from .special_stmt import SpecialStmt +from .scope_handler import ScopeHandler, WithScopeHandler, ForScopeHandler from . import _ffi_api +class CallArgumentReader(object): + """A helper class which read required argument from passed arguments""" + + def __init__(self, func_name, args, kwargs, parser): + self.func_name = func_name + self.args = args + self.kwargs = kwargs + self.parser = parser + + def get_pos_only_arg(self, pos, name): + """Get corresponding position only function argument from argument list""" + if len(self.args) >= pos: + arg = self.args[pos - 1] + elif name not in self.kwargs: + self.parser.report_error(self.func_name + " misses argument " + name) + else: + arg = self.kwargs[name] + + return arg + + def get_kwarg(self, pos, name, default): + """Get corresponding keyword function argument from argument list + If user doesn't provide the argument, set it to default value + """ + if len(self.args) >= pos: + arg = self.args[pos - 1] + elif name in self.kwargs: + arg = self.kwargs[name] + else: + return default + + return arg + + def get_varargs(self, pos): + """Get corresponding variable argument from argument list""" + if len(self.args) >= pos and len(self.kwargs) == 0: + return self.args[pos - 1 :] + return [] + + class TVMScriptParserError(RuntimeError): """TVM script Parser Runtime Error""" @@ -58,33 +102,29 @@ class TVMScriptParser(ast.NodeVisitor): """ _binop_maker = { - ast.Add: tir.Add, - ast.Sub: tir.Sub, - ast.Mult: tir.Mul, - ast.Div: tir.Div, - ast.FloorDiv: tir.FloorDiv, - ast.Mod: tir.FloorMod, + ast.Add: tvm.tir.Add, + ast.Sub: tvm.tir.Sub, + ast.Mult: tvm.tir.Mul, + ast.Div: tvm.tir.Div, + ast.FloorDiv: tvm.tir.FloorDiv, + ast.Mod: tvm.tir.FloorMod, ast.BitOr: operator.or_, ast.BitAnd: operator.and_, ast.BitXor: operator.xor, - ast.Gt: tir.GT, - ast.GtE: tir.GE, - ast.Lt: tir.LT, - ast.LtE: tir.LE, - ast.Eq: tir.EQ, - ast.NotEq: tir.NE, - ast.And: tir.And, - ast.Or: tir.Or, + ast.Gt: tvm.tir.GT, + ast.GtE: tvm.tir.GE, + ast.Lt: tvm.tir.LT, + ast.LtE: tvm.tir.LE, + ast.Eq: tvm.tir.EQ, + ast.NotEq: tvm.tir.NE, + ast.And: tvm.tir.And, + ast.Or: tvm.tir.Or, } - _unaryop_maker = {ast.USub: operator.neg, ast.Invert: operator.invert, ast.Not: tir.Not} + _unaryop_maker = {ast.USub: operator.neg, ast.Invert: operator.invert, ast.Not: tvm.tir.Not} def __init__(self, src, base_lienno): - 
self.params = None - self.buffer_map = None - self.dict_attr = None - self.scope_emitter = None - self.var_env_dict = None + self.context = None self.src = src.split("\n") self.base_lineno = base_lienno @@ -93,15 +133,10 @@ def __init__(self, src, base_lienno): self.meta = None self.functions = {} - self.target = None def init_function_parsing_env(self): """Initialize function parsing environment""" - self.params = [] # parameter list - self.buffer_map = {} # buffer map - self.dict_attr = {} # dict attr - self.scope_emitter = scope_emitter.ScopeEmitter(self) # scope emitter - self.var_env_dict = {} # map from var to thread env name + self.context = context_maintainer.ContextMaintainer(self) # scope emitter @staticmethod def is_meta(node): @@ -170,15 +205,40 @@ def report_error(self, message, lineno=None, col_offset=None): col_offset = self.current_col_offset raise TVMScriptParserError(self.wrap_line_col(message, lineno, col_offset)) - def get_body(self): + def parse_body(self): body = [] - while len(self.scope_emitter.node_stack[-1]) > 0: - res = self.visit(self.scope_emitter.node_stack[-1].pop()) + while len(self.context.node_stack[-1]) > 0: + res = self.visit(self.context.node_stack[-1].pop()) if res is not None: body.append(res) return tvm.tir.SeqStmt(body) if len(body) > 1 else body[0] - def get_type(self, type_node): + def parse_arg_list(self, func, node_call): + assert isinstance(node_call, ast.Call) + # collect arguments + args = [self.visit(arg) for arg in node_call.args] + kw_args = [self.visit(keyword) for keyword in node_call.keywords] + kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args} + # get the name and parameter list of func + if isinstance(func, (Intrin, ScopeHandler, SpecialStmt)): + func_name, param_list = func.signature() + else: + print(func) + raise Exception("Internal Error") + # check arguments and parameter list and get a list of arguments + reader = CallArgumentReader(func_name, args, kw_args, self) + pos_only, kwargs, varargs = param_list + internal_args = list() + for i, arg_name in enumerate(pos_only): + internal_args.append(reader.get_pos_only_arg(i + 1, arg_name)) + for i, arg_info in enumerate(kwargs): + arg_name, default = arg_info + internal_args.append(reader.get_kwarg(i + 1 + len(pos_only), arg_name, default=default)) + if varargs is not None: + internal_args.extend(reader.get_varargs(len(pos_only) + len(kwargs) + 1)) + return internal_args + + def parse_type(self, type_node): """ Parse type """ if type_node is None: self.report_error("missing type annotation") @@ -267,7 +327,6 @@ def visit_ClassDef(self, node): for body_element in node.body: if isinstance(body_element, ast.FunctionDef): self.visit(body_element) - from .utils import create_module return create_module(self.functions) @@ -282,69 +341,76 @@ def visit_FunctionDef(self, node): """ self.init_function_parsing_env() + self.context.new_scope(nodes=node.body) + # add parameters of function for arg in node.args.args: - arg_var = tvm.te.var(arg.arg, self.get_type(arg.annotation)) - self.scope_emitter.update_symbol(arg.arg, arg_var) - self.params.append(arg_var) - - # visit the body of function - self.scope_emitter.node_stack[-1].extend(reversed(node.body)) + arg_var = tvm.te.var(arg.arg, self.parse_type(arg.annotation)) + self.context.update_symbol(arg.arg, arg_var) + self.context.func_params.append(arg_var) # fetch the body and return a tir.PrimFunc func = tvm.tir.PrimFunc( - self.params, - self.get_body(), - ret_type=self.get_type(node.returns), - buffer_map=self.buffer_map, - 
attrs=tvm.ir.make_node("DictAttrs", **self.dict_attr), + self.context.func_params, + self.parse_body(), + ret_type=self.parse_type(node.returns), + buffer_map=self.context.func_buffer_map, + attrs=tvm.ir.make_node("DictAttrs", **self.context.func_dict_attr), ) self.functions[GlobalVar(node.name)] = func + + self.context.pop_scope() return func def visit_Assign(self, node): """Assign visitor AST abstract grammar: Assign(expr* targets, expr value, string? type_comment) - By now only 3 types of Assign is supported: + + By now 3 patterns of Assign is supported: 1. special stmts with return value - 1.1 Buffer = tir.buffer_bind()/tir.buffer_decl() + 1.1 Buffer = tir.match_buffer()/tir.buffer_decl() 1.2 Var = tir.var() 1.3 Var = tir.env_thread() 2. (BufferStore) Buffer[PrimExpr, PrimExpr, ..., PrimExpr] = PrimExpr 3. (Store) Var[PrimExpr] = PrimExpr 4. with scope handlers with concise scoping and var def - 4.1 var = tir.alloc_with_scope() + 4.1 var = tir.allocate() """ if not len(node.targets) == 1: self.report_error("Only one-valued assignment is supported now") - target = node.targets[0] - if isinstance(target, ast.Name): - # scenario 1&4 - self.target = [target.id] - if not isinstance(node.value, ast.Call): - self.report_error("Unsupported assign stmt") + if isinstance(node.targets[0], ast.Name) and isinstance(node.value, ast.Call): + # Pattern 1 & Pattern 4 func = self.visit(node.value.func) - if Registry.is_with_scope(func): - # scenario 4 - return self.visit(node.value) + arg_list = self.parse_arg_list(func, node.value) + if isinstance(func, WithScopeHandler): + if not func.concise_scope or not func.def_symbol: + self.report_error( + "with scope handler " + func.signature()[0] + " is not suitable here" + ) + # Pattern 4 + func.enter_scope(node, self.context) + arg_list = self.parse_arg_list(func, node.value) + func.body = self.parse_body() + return func.exit_scope(node, self.context, arg_list) + elif isinstance(func, SpecialStmt): + # Pattern 1 + func.handle(node, self.context, arg_list) else: - # scenario 1 - rhs = self.visit(node.value) - self.scope_emitter.update_symbol(target.id, rhs) - elif isinstance(target, ast.Subscript): - # scenario 2&3 - symbol, indexes = self.visit(target) + self.report_error("Unsupported Assign stmt") + elif isinstance(node.targets[0], ast.Subscript): + # Pattern 2 & Pattern 3 + symbol, indexes = self.visit(node.targets[0]) rhs = self.visit(node.value) if isinstance(symbol, tvm.tir.Buffer): - # BufferStore + # Pattern 2 return tvm.tir.BufferStore(symbol, tvm.runtime.convert(rhs), indexes) else: if len(indexes) != 1: self.report_error("Invalid Store stmt") - # Store + # Pattern 3 return tvm.tir.Store( symbol, tvm.runtime.convert(rhs), indexes[0], tvm.runtime.convert(True) ) @@ -355,14 +421,17 @@ def visit_AnnAssign(self, node): """AnnAssign visitor AST abstract grammar: AnnAssign(expr target, expr annotation, expr? 
value, int simple) - Corresponds to concise mode of with tir.let() + + Pattern corresponds to concise mode of with tir.let() """ if isinstance(node.target, ast.Name): value = self.visit(node.value) - var = tvm.te.var(node.target.id, self.get_type(node.annotation)) - self.scope_emitter.update_symbol(var.name, var) - return tvm.tir.LetStmt(var, value, self.visit(self.scope_emitter.node_stack[-1].pop())) + var = tvm.te.var(node.target.id, self.parse_type(node.annotation)) + self.context.update_symbol(var.name, var) + body = self.parse_body() + self.context.remove_symbol(var.name) + return tvm.tir.LetStmt(var, value, body) else: self.report_error("Unsupported AnnAssign stmt") @@ -370,40 +439,45 @@ def visit_Assert(self, node): """Assert visitor AST abstract grammar: Assert(expr test, expr? msg) - Corresponds to concise mode of with tir.assert() + + Pattern corresponds to concise mode of with tir.Assert() """ condition = self.visit(node.test) if node.msg is None: self.report_error("Message of AssertStmt can't be None") message = self.visit(node.msg) - return tvm.tir.AssertStmt(condition, tvm.runtime.convert(message), self.get_body()) + body = self.parse_body() + return tvm.tir.AssertStmt(condition, tvm.runtime.convert(message), body) def visit_For(self, node): """For visitor AST abstract grammar: For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) - By now only 1 type of For is supported: - 1. for name in tir.serial/parallel/vectorized/unroll(begin, end) + By now 1 pattern of For is supported: + 1. for scope handler + for name in tir.serial()/tir.parallel()/tir.vectorized()/tir.unroll() """ - # check node.iter, which is a Call if not isinstance(node.iter, ast.Call): self.report_error("The loop iter should be a Call") func = self.visit(node.iter.func) - if not Registry.is_for_scope(func): - self.report_error("Function not allowed in for scope") - # collect arguments - args = [self.visit(arg) for arg in node.iter.args] - kw_args = [self.visit(keyword) for keyword in node.iter.keywords] - kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args} - + if not isinstance(func, ForScopeHandler): + self.report_error("Only for scope handlers can be used in for stmt") + # prepare for new for scope old_lineno, old_col_offset = self.current_lineno, self.current_col_offset self.current_lineno, self.current_col_offset = ( self.base_lineno + node.iter.lineno - 1, node.iter.col_offset, ) - res = func(self, node, args, kw_args) + self.context.new_scope(nodes=node.body) + # for scope handler process the scope + func.enter_scope(node, self.context) + func.body = self.parse_body() + arg_list = self.parse_arg_list(func, node.iter) + res = func.exit_scope(node, self.context, arg_list) + # exit the scope + self.context.pop_scope() self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res @@ -412,10 +486,13 @@ def visit_With(self, node): AST abstract grammar: With(withitem* items, stmt* body, string? type_comment) withitem = (expr context_expr, expr? optional_vars) - By now 2 types of With is supported: - 1. with tir.allocate() as targets: - 2. with tir.let()/tir.Assert()/tir.attr()//tir.realize() + By now 2 patterns of With is supported: + 1. with scope handler with symbol def + with tir.allocate() as targets: + 2. 
with scope handler without symbol def + with tir.let()/tir.Assert()/tir.attr()//tir.realize() """ + if not len(node.items) == 1: self.report_error("Only one with element is supported now") if not isinstance(node.items[0].context_expr, ast.Call): @@ -425,32 +502,22 @@ def visit_With(self, node): func_node = func_call.func func = self.visit(func_node) - if not Registry.is_with_scope(func): + if not isinstance(func, WithScopeHandler): self.report_error("Function not allowed in with scope") - - self.target = [] - if node.items[0].optional_vars is not None: - # preprocess optional var names - if isinstance(node.items[0].optional_vars, ast.Name): - self.target = [node.items[0].optional_vars.id] - elif isinstance(node.items[0].optional_vars, (ast.List, ast.Tuple)): - for var in node.items[0].optional_vars.elts: - if not isinstance(var, ast.Name): - self.report_error("Invalid optional var definition") - self.target = [var.id for var in node.items[0].optional_vars.elts] - else: - self.report_error("Invalid optional var definition") - # parse other arguments - args = [self.visit(arg) for arg in func_call.args] - kw_args = [self.visit(keyword) for keyword in func_call.keywords] - kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args} - + # prepare for new block scope old_lineno, old_col_offset = self.current_lineno, self.current_col_offset self.current_lineno, self.current_col_offset = ( self.base_lineno + func_call.lineno - 1, func_call.col_offset, ) - res = func(self, node, args, kw_args) + self.context.new_scope(nodes=node.body) + # with scope handler process the scope + func.enter_scope(node, self.context) + func.body = self.parse_body() + arg_list = self.parse_arg_list(func, func_call) + res = func.exit_scope(node, self.context, arg_list) + # exit the scope + self.context.pop_scope() self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res @@ -462,19 +529,18 @@ def visit_If(self, node): condition = self.visit(node.test) # then body - self.scope_emitter.new_scope() - self.scope_emitter.node_stack[-1].extend(reversed(node.body)) - then_body = self.get_body() - self.scope_emitter.pop_scope() + self.context.new_scope(nodes=node.body) + then_body = self.parse_body() + self.context.pop_scope() # else body if len(node.orelse) > 0: - self.scope_emitter.new_scope() - self.scope_emitter.node_stack[-1].extend(reversed(node.orelse)) - else_body = self.get_body() - self.scope_emitter.pop_scope() + self.context.new_scope(nodes=node.orelse) + else_body = self.parse_body() + self.context.pop_scope() else: else_body = None + return tvm.tir.IfThenElse(condition, then_body, else_body) def visit_Call(self, node): @@ -482,22 +548,30 @@ def visit_Call(self, node): AST abstract grammar: Call(expr func, expr* args, keyword* keywords) keyword = (identifier? arg, expr value) - All the functions used outside With and For are registered in special_stmt or intrin + + By now 3 patterns of Call is allowed + 1. Intrin representing PrimExpr/IterVar + 1.1 tir.int/uint/float8/16/32/64/floormod/floordiv/load/cast/ramp/broadcast/max + 1.2 tir.range/reduce_axis/scan_axis/opaque_axis + 2. tir.Op(dtype, ...) + 3. 
other callable functions """ func = self.visit(node.func) - # collect arguments - args = [self.visit(arg) for arg in node.args] - kw_args = [self.visit(keyword) for keyword in node.keywords] - kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args} - - if callable(func): - if Registry.is_registered(func): - return func(self, node, args, kw_args) - else: + if isinstance(func, Intrin) and not func.stmt: + # pattern 1 + arg_list = self.parse_arg_list(func, node) + return func.handle(arg_list) + else: + args = [self.visit(arg) for arg in node.args] + kw_args = [self.visit(keyword) for keyword in node.keywords] + kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args} + if isinstance(func, tvm.tir.op.Op): + # pattern 2 + return tvm.tir.Call(kw_args["dtype"], func, args) + elif callable(func): + # pattern 3 return func(*args, **kw_args) - elif isinstance(func, tvm.tir.op.Op): - return tvm.tir.Call(kw_args["dtype"], func, args) self.report_error("Unsupported function call") @@ -505,17 +579,35 @@ def visit_Expr(self, node): """Expr visitor AST abstract grammar: Expr(expr value) - Now only 3 types of `Expr` stmt is allowed: - 1. reducer.step()/tir.store() - 2. tir.attr()/tir.assert()/tir.allocate()/tir.realize() - 3. tir.set_func_attr() + + Now only 3 types of Expr stmt is allowed: + 1. Intrin representing Stmt without body + tir.store()/tir.evaluate() + 2. with scope handlers with concise scoping without var def + tir.attr()/tir.assert()/tir.allocate()/tir.realize() + 3. special stmt without var def + tir.func_attr() """ if not isinstance(node.value, ast.Call): self.report_error("Unsupported Expr stmt") - res = self.visit(node.value) - if res is None or isinstance(res, tvm.tir.Stmt): - return res + + func = self.visit(node.value.func) + arg_list = self.parse_arg_list(func, node.value) + + if isinstance(func, Intrin) and func.stmt: + # pattern 1 + return func.handle(arg_list) + elif isinstance(func, WithScopeHandler) and func.concise_scope and not func.def_symbol: + # pattern 2 + func.enter_scope(node, self.context) + func.body = self.parse_body() + return func.exit_scope(node, self.context, arg_list) + elif isinstance(func, SpecialStmt) and not func.def_symbol: + # pattern 3 + func.handle(node, self.context, arg_list) + return + self.report_error("Invalid Expr stmt") def visit_BinOp(self, node): @@ -572,7 +664,7 @@ def visit_Subscript(self, node): slice = Slice(expr? lower, expr? upper, expr? step) | ExtSlice(slice* dims) | Index(expr value) - By now only 2 types of Subscript are supported: + By now 2 patterns of Subscript are supported: 1. Buffer[index, index, ...], Buffer element access(BufferLoad & BufferStore) Var[index] Buffer element access() 2. meta[type_key][index], Meta info access @@ -587,7 +679,7 @@ def visit_Subscript(self, node): indexes = self.visit(node.slice.value) indexes = list(indexes) if isinstance(indexes, tuple) else [indexes] if isinstance(node.ctx, ast.Load): - if isinstance(symbol, tir.expr.Var): + if isinstance(symbol, tvm.tir.expr.Var): return tvm.tir.Load("float32", symbol, indexes, True) else: return tvm.tir.BufferLoad(symbol, indexes) @@ -632,7 +724,7 @@ def visit_Attribute(self, node): if isinstance(node.value, ast.Name): if node.value.id == "tir": func_name = "tir." 
+ node.attr - res = Registry.look_up_function(func_name) + res = Registry.lookup(func_name) if res is not None: return res try: @@ -696,10 +788,10 @@ def visit_Name(self, node): name = node.id if name == "meta": return self.meta - symbol = Registry.look_up_function(name) + symbol = Registry.lookup(name) if symbol is not None: return symbol - symbol = self.scope_emitter.lookup_symbol(name) + symbol = self.context.lookup_symbol(name) if symbol is not None: return symbol self.report_error("Unknown identifier %s" % name) @@ -749,10 +841,95 @@ def from_source(src, func_lineno=0): parser.wrap_line_col(msg, parser.current_lineno, parser.current_col_offset).split("\n") ) inject_e[-1] = "TVM" + inject_e[-1][6:] - raise TVMError("\n".join(inject_e)) + raise TVMError("\n".join(inject_e)) from e except Exception as e: inject_e = parser.wrap_line_col(str(e), parser.current_lineno, parser.current_col_offset) - raise TVMScriptParserError(inject_e) + raise TVMScriptParserError(inject_e) from e + + +def _parse(script_in): + """Helper function to parse TVM script into TIR""" + return from_source(inspect.getsource(script_in), inspect.getsourcelines(script_in)[1]) + + +def create_module(functions=None): + """Construct a module from list of functions. + + Parameters + ----------- + functions: Optional[dict]. + Map of GlobalVar or str to PrimFunc + + Returns + ------- + mod : IRModule + An IRModule containing the passed definitions + """ + + return IRModule(functions=functions) + + +def asscript(input_ir, show_meta=False): + """Transform a PrimFunc or IRModule to python syntax script + + Parameters + ---------- + input_ir : Union[PrimFunc, IRModule] + The PrimFunc or IRModule to be dumped + + show_meta : bool + Whether show meta + + Returns + ------- + script : str + The Python script + """ + + return _ffi_api.AsTVMScript(input_ir, show_meta) + + +def tir(script_in): + """Decorate a python function or class as tvm script. + + The tvm function or parsing support parsing to the internal TIR. + + Returns + ------- + output : Union[Function, Module] + The Function or Module in IR. + """ + + if inspect.isfunction(script_in): + result = _parse(script_in) + elif inspect.isclass(script_in): + result = TVMScriptClass(script_in) + else: + raise TypeError("Only function and class are supported") + result.__name__ = script_in.__name__ + result.__qualname__ = script_in.__qualname__ + return result + + +def module(script_in): + """Decorate a python function or class as tvm script. + + Alias for tvm.script.tir for now. + + Returns + ------- + output : Union[Function, Module] + The Function or Module in IR. + """ + return tir(script_in) + + +class TVMScriptClass: + """Helper class for decorating a class""" + def __init__(self, script_in): + self.script = script_in -tvm._ffi._init_api("script", __name__) + def __call__(self, *args, **kwargs): + # call the parser to transform tvm script into TIR + return _parse(self.script) diff --git a/python/tvm/script/registry.py b/python/tvm/script/registry.py index acbc444a4190..389570115935 100644 --- a/python/tvm/script/registry.py +++ b/python/tvm/script/registry.py @@ -15,19 +15,8 @@ # specific language governing permissions and limitations # under the License. 
"""TVM Script Parser Function Registry """ -# pylint: disable=inconsistent-return-statements +# pylint: disable=inconsistent-return-statements, relative-beyond-top-level, import-outside-toplevel import inspect -from enum import Enum -from typed_ast import ast3 as ast - -import tvm - - -class Category(Enum): - INTRIN = 0 - WITH_SCOPE = 1 - FOR_SCOPE = 2 - SPECIAL_STMT = 3 class Registry(object): @@ -35,355 +24,35 @@ class Registry(object): All these maps are static """ - functions = dict() + registrations = dict() @staticmethod - def look_up_function(func_name): - """look up a registered function by name""" - if func_name in Registry.functions: - return Registry.functions[func_name][0] + def lookup(name): + if name in Registry.registrations: + # every time we create a new handler + # since we may want to keep some local info inside it + return Registry.registrations[name]() return None - @staticmethod - def is_intrin(func): - """check whether a function belongs to intrin""" - return (func, Category.INTRIN) in Registry.functions.values() - - @staticmethod - def is_with_scope(func): - """check whether a function belongs to with scope handlers""" - return (func, Category.WITH_SCOPE) in Registry.functions.values() - - @staticmethod - def is_for_scope(func): - """check whether a function belongs to for scope handlers""" - return (func, Category.FOR_SCOPE) in Registry.functions.values() - - @staticmethod - def is_special_stmt(func): - """check whether a function belongs to special stmts""" - return (func, Category.SPECIAL_STMT) in Registry.functions.values() - - @staticmethod - def is_registered(func): - """check whether a function is registered""" - return ( - Registry.is_intrin(func) - or Registry.is_with_scope(func) - or Registry.is_for_scope(func) - or Registry.is_special_stmt(func) - ) - - -class CallArgumentReader(object): - """A helper class which read required argument from passed arguments""" - - def __init__(self, func_name, args, kwargs, parser): - self.func_name = func_name - self.args = args - self.kwargs = kwargs - self.parser = parser - - def get_pos_only_arg(self, pos, name): - """Get corresponding position only function argument from argument list""" - if len(self.args) >= pos: - arg = self.args[pos - 1] - elif name not in self.kwargs: - self.parser.report_error(self.func_name + " misses argument " + name) - else: - arg = self.kwargs[name] - - return arg - - def get_kwarg(self, pos, name, default): - """Get corresponding keyword function argument from argument list - If user doesn't provide the argument, set it to default value - """ - if len(self.args) >= pos: - arg = self.args[pos - 1] - elif name in self.kwargs: - arg = self.kwargs[name] - else: - return default - - return arg - - def get_varargs(self, pos): - """Get corresponding variable argument from argument list""" - if len(self.args) >= pos and len(self.kwargs) == 0: - return self.args[pos - 1 :] - return [] - - def auto_insert_body(self, pos, body): - """Automatically provide body as function call argument""" - if len(self.args) >= pos: - self.args.insert(pos - 1, body) - else: - self.kwargs["body"] = body - - -def func_wrapper(func_name, func_to_register, arg_list, category, concise=False, with_var=False): - """Helper function to wrap a function to be registered """ - - def wrap_func(parser, node, args, kwargs): - if category == Category.FOR_SCOPE: - # automatically parse loop vars and body for for_scope handlers - loop_var_names = list() - if isinstance(node.target, ast.Name): - loop_var_names.append(node.target.id) - 
elif isinstance(node.target, ast.Tuple): - for elt in node.target.elts: - if not isinstance(elt, ast.Name): - parser.report_error("Invalid loop var") - loop_var_names.append(elt.id) - else: - parser.report_error("Invalid loop var") - loop_vars = [tvm.te.var(name, dtype="int32") for name in loop_var_names] - - parser.scope_emitter.new_scope() - parser.scope_emitter.node_stack[-1].extend(reversed(node.body)) - for loop_var in loop_vars: - parser.scope_emitter.update_symbol(loop_var.name, loop_var) - body = parser.get_body() - parser.scope_emitter.pop_scope() - elif category == Category.WITH_SCOPE: - if not with_var: - if isinstance(node, ast.With) and node.items[0].optional_vars is not None: - parser.report_error("Function " + func_name + " expects no optional vars") - # automatically parse body for with_scope handlers without optional vars - if isinstance(node, ast.With): - parser.scope_emitter.new_scope() - parser.scope_emitter.node_stack[-1].extend(reversed(node.body)) - body = parser.get_body() - parser.scope_emitter.pop_scope() - else: - body = parser.get_body() - else: - if isinstance(node, ast.With) and node.items[0].optional_vars is None: - parser.report_error("Function " + func_name + " expects optional vars") - body = None - - if not isinstance(node, ast.With) and not concise: - parser.report_error("Concise scoping is not allowed here") - - reader = CallArgumentReader(func_name, args, kwargs, parser) - pos_only, kwargs, varargs = arg_list - - internal_args = list() - if category == Category.WITH_SCOPE: - if not with_var: - internal_args.extend([parser, node, body]) - else: - internal_args.extend([parser, node]) - elif category == Category.FOR_SCOPE: - internal_args.extend([parser, node, body, loop_vars]) - elif category == Category.SPECIAL_STMT: - internal_args.extend([parser, node]) - - for i, arg_name in enumerate(pos_only): - internal_args.append(reader.get_pos_only_arg(i + 1, arg_name)) - for i, arg_info in enumerate(kwargs): - arg_name, default = arg_info - internal_args.append(reader.get_kwarg(i + 1 + len(pos_only), arg_name, default=default)) +def register(inputs): + """Register Intrin/ScopeHandler/SpecialStmt""" + if inspect.isfunction(inputs): + from .intrin import Intrin - if varargs is not None: - internal_args.extend(reader.get_varargs(len(pos_only) + len(kwargs) + 1)) + def create_new_intrin(func): + class NewIntrin(Intrin): + def __init__(self): + super().__init__(func) - return func_to_register(*internal_args) - - return wrap_func - - -def get_arg_list(origin_func, category, with_var=False): - """Helper function to get the argument list of Function - Parameters - ---------- - origin_func: function - The function to get the argument list - category: Category - The category of registered function - with_var: bool, optional - Whether the with scope handler neeeds optional vars - """ - full_arg_spec = inspect.getfullargspec(origin_func) - - args, defaults = full_arg_spec.args, full_arg_spec.defaults - - if defaults is None: - defaults = tuple() - - if category == Category.WITH_SCOPE: - if not with_var: - if len(args) < 3 or args[0] != "parser" or args[1] != "node" or args[2] != "body": - raise RuntimeError( - "TVM Script register error : the first three arguments of " - "this with scope handler must be parser, node, body" - ) - args = args[3:] - else: - if len(args) < 2 or args[0] != "parser" or args[1] != "node": - raise RuntimeError( - "TVM Script register error : the first two arguments of " - "this with scope handler must be parser, node" - ) - args = args[2:] - elif 
category == Category.FOR_SCOPE: - if ( - len(args) < 4 - or args[0] != "parser" - or args[1] != "node" - or args[2] != "body" - or args[3] != "loop_vars" - ): - raise RuntimeError( - "TVM Script register error : the first three arguments of for scope handler" - "must be parser, node, body, loop_vars" - ) - args = args[4:] - elif category == Category.SPECIAL_STMT: - if len(args) < 2 or args[0] != "parser" or args[1] != "node": - raise RuntimeError( - "TVM Script register error : the first three arguments of special stmt" - "must be parser, node" - ) - args = args[2:] - - if full_arg_spec.varkw is not None: - raise RuntimeError( - "TVM Script register error : variable keyword argument is not supported now" - ) - if not len(full_arg_spec.kwonlyargs) == 0: - raise RuntimeError("TVM Script register error : keyword only argument is not supported now") - - pos_only = list() - for arg in args[: len(args) - len(defaults)]: - pos_only.append(arg) - kwargs = list() - for default, arg in zip(defaults, args[len(args) - len(defaults) :]): - kwargs.append((arg, default)) - - return pos_only, kwargs, full_arg_spec.varargs - - -def register_intrin(name=None): - """Decorator to register function under category intrin - Parameters - ---------- - name: str, optional - registered name for the function - Example - ------ - .. code-block:: python - @register_intrin - def broadcast(value, lanes): - lanes = lanes.value if not isinstance(lanes, int) else lanes - return tvm.tir.Broadcast(value, lanes) - """ - - def decorate(origin_func): - func_name = "tir." + origin_func.__qualname__ if name is None else name - Registry.functions[func_name] = ( - func_wrapper( - func_name, origin_func, get_arg_list(origin_func, Category.INTRIN), Category.INTRIN - ), - Category.INTRIN, - ) - return origin_func - - return decorate - - -def register_with_scope(concise=False, with_var=False, name=None): - """Decorator to register function under with scope handler - Parameters - ---------- - concise: bool, optional - whether this with scope handler is allowed in concise scoping - with_var: bool, optional - whether this with scope handler neeeds optional vars - name: str, optional - registered name for the function - Example - ------ - .. code-block:: python - @register_scope_handler(concise=True) - def attr(parser, node, attr_node, attr_key, value, body): - return tvm.tir.AttrStmt(attr_node, attr_key, tvm.runtime.convert(value), body) - """ - - def decorate(origin_func): - """Register function under category with_scope""" - func_name = "tir." + origin_func.__qualname__ if name is None else name - Registry.functions[func_name] = ( - func_wrapper( - func_name, - origin_func, - get_arg_list(origin_func, Category.WITH_SCOPE, with_var), - Category.WITH_SCOPE, - concise=concise, - with_var=with_var, - ), - Category.WITH_SCOPE, - ) - return origin_func - - return decorate - - -def register_for_scope(name=None): - """Decorator to register function under for scope handler - Parameters - ---------- - name: str, optional - registered name for the function - """ - - def decorate(origin_func): - func_name = "tir." 
+ origin_func.__qualname__ if name is None else name - Registry.functions[func_name] = ( - func_wrapper( - func_name, - origin_func, - get_arg_list(origin_func, Category.FOR_SCOPE), - Category.FOR_SCOPE, - ), - Category.FOR_SCOPE, - ) - return origin_func - - return decorate - - -def register_special_stmt(name=None): - """Decorator to register function under category special_stmt - Parameters - ---------- - name: str, optional - registered name for the function - Example - ------- - @register_special_stmt - def buffer_decl(parser, node, shape, dtype="float32", data=None, strides=[], elem_offset=None, - scope="global", align=-1, offset_factor=0, buffer_type="default"): - align = align.value if not isinstance(align, int) else align - offset_factor = offset_factor.value if not isinstance(offset_factor, int) else offset_factor - buffer = tvm.tir.decl_buffer(shape, dtype, parser.assign_target, data, strides, - elem_offset, scope, align, offset_factor, buffer_type) - return buffer - """ + return NewIntrin - def decorate(origin_func): - func_name = "tir." + origin_func.__qualname__ if name is None else name - Registry.functions[func_name] = ( - func_wrapper( - func_name, - origin_func, - get_arg_list(origin_func, Category.SPECIAL_STMT), - Category.SPECIAL_STMT, - ), - Category.SPECIAL_STMT, - ) - return origin_func + registration = create_new_intrin(inputs) + elif inspect.isclass(inputs): + registration = inputs + else: + raise ValueError() - return decorate + key = registration().signature()[0] + Registry.registrations[key] = registration + return registration diff --git a/python/tvm/script/scope_handler.py b/python/tvm/script/scope_handler.py index 08cd7ca84eb9..251df8c6d6cb 100644 --- a/python/tvm/script/scope_handler.py +++ b/python/tvm/script/scope_handler.py @@ -14,182 +14,248 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""TVM Script Parser Scope Handler Functions -This module provides the functions registered into parser under with_scope or for_scope category. -Scope handler nodes are StmtNodes with body, which are used to handle such scenarios. -1. For scope handler -When registering a for scope handler, the first 4 arguments must be parser, node, body, loop_vars -and these arguments will provided by TVM Script parser automatically -.. code-block:: python - for loop_vars in tir.xxx(): -2. With scope handler -There are 4 subtypes of with scope handlers, classified by - 1) with or without as - 2) allow concise scoping or not -1) with as & concise -the first 2 arguments must be parser, node -Need to parse the body manually -Example : tir.alloc_with_scope -.. code-block:: python - target = tir.xxx() - with tir.xxx() as target: -2) with as & not concise -the first 2 arguments must be parser, node -Need to parse the body manually -Example : None atm -.. code-block:: python - with tir.xxx() as target: -3) without as & concise -the first 3 arguments must be parser, node, body -TVM Script parser will parse the body automatically -Example : tir.allocate()/tir.realize()/tir.attr() -.. code-block:: python - tir.xxx() - with tir.xxx(): -4) without as & not concise -the first 3 arguments must be parser, node, body -TVM Script parser will parse the body automatically -Example : tir.assert()/tir.let() -.. 
code-block:: python - with tir.xxx(): -""" -# pylint: disable=redefined-builtin, unused-argument, invalid-name +"""TVM Script Parser Scope Handler Classes""" +# pylint: disable=redefined-builtin, unused-argument, invalid-name, relative-beyond-top-level from typed_ast import ast3 as ast import tvm.tir -from .registry import register_with_scope, register_for_scope - - -# With scope handler -@register_with_scope(concise=True, with_var=True) -def allocate(parser, node, extents, dtype, scope, condition=True): - """ With scope handler function tir.alloc_with_scope(var, extents, dtype, scope, condition) """ - # defining buffer var and parse the body manually - - buffer_var = tvm.te.var(parser.target[0], "handle") - # (TODO) Uncomment this line if we have richer type info for buffer var - # buffer_var = tvm.te.var(parser.target[0], tvm.ir.PointerType(tvm.ir.PrimType(dtype))) - if isinstance(node, ast.With): - parser.scope_emitter.new_scope() - parser.scope_emitter.update_symbol(buffer_var.name, buffer_var) - parser.scope_emitter.node_stack[-1].extend(reversed(node.body)) - body = parser.get_body() - parser.scope_emitter.pop_scope() - else: - parser.scope_emitter.update_symbol(buffer_var.name, buffer_var) - body = parser.get_body() - condition = tvm.runtime.convert(condition) - scope = tvm.runtime.convert(scope) - body = tvm.tir.Allocate(buffer_var, dtype, extents, condition, body) - return tvm.tir.AttrStmt(buffer_var, "storage_scope", scope, body) - - -@register_with_scope(concise=True) -def launch_thread(parser, node, body, env_var, extent): - extent = tvm.runtime.convert(extent) - return tvm.tir.AttrStmt( - tvm.tir.IterVar( - None, env_var, getattr(tvm.tir.IterVar, "ThreadIndex"), parser.var_env_dict[env_var] - ), - "thread_extent", - extent, - body, - ) - - -@register_with_scope(concise=True) -def realize(parser, node, body, buffer_bounds, scope, condition=True): - """ With scope handler function tir.realize(buffer_bounds, scope, condition) """ - buffer, bounds = buffer_bounds - scope = tvm.runtime.convert(scope) - return tvm.tir.AttrStmt( - buffer, "realize_scope", scope, tvm.tir.BufferRealize(buffer, bounds, condition, body) - ) - - -@register_with_scope(concise=True) -def attr(parser, node, body, attr_node, attr_key, value): - """ With scope handler function tir.attr(attr_node, attr_key, value) """ - attr_node = tvm.runtime.convert(attr_node) - value = tvm.runtime.convert(value) - return tvm.tir.AttrStmt(attr_node, attr_key, value, body) - - -@register_with_scope(concise=False) -def Assert(parser, node, body, condition, message): - """ With scope handler function tir.Assert(condition, message) """ - return tvm.tir.AssertStmt(condition, tvm.runtime.convert(message), body) - - -@register_with_scope(concise=False) -def let(parser, node, body, var, value): - """ With scope handler function tir.let(var, value) """ - return tvm.tir.LetStmt(var, value, body) - - -# For scope handler -@register_for_scope() -def serial(parser, node, body, loop_vars, begin, end): - """ For scope handler function tir.serial(begin, end)""" - if len(loop_vars) != 1: - parser.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(loop_vars[0], begin, extent, 0, 0, body) - - -@register_for_scope() -def parallel(parser, node, body, loop_vars, begin, end): - """ For scope handler function tir.parallel(begin, end)""" - if len(loop_vars) != 1: - parser.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = 
end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(loop_vars[0], begin, extent, 1, 0, body) - - -@register_for_scope() -def vectorized(parser, node, body, loop_vars, begin, end): - """ For scope handler function tir.vectorized(begin, end)""" - if len(loop_vars) != 1: - parser.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(loop_vars[0], begin, extent, 2, 0, body) - - -@register_for_scope() -def unroll(parser, node, body, loop_vars, begin, end): - """ For scope handler function tir.unroll(begin, end)""" - if len(loop_vars) != 1: - parser.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(loop_vars[0], begin, extent, 3, 0, body) - - -@register_for_scope(name="range") -def Range(parser, node, body, loop_vars, begin, end, annotation=None): - """ For scope handler function range(begin, end, annotation)""" - if len(loop_vars) != 1: - parser.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - if annotation is None: - annotation = [] - else: - annotation = [ - tvm.tir.Annotation(key, tvm.runtime.convert(val) if isinstance(val, str) else val) - for key, val in annotation.items() - ] - return tvm.tir.Loop(loop_vars[0], begin, extent, annotation, body) - - -@register_for_scope() -def grid(parser, node, body, loop_vars, *extents): - """ For scope handler function tir.grid(*extents) """ - if len(loop_vars) != len(extents): - parser.report_error("Inconsitent number of loop vars and extents") - for loop_var, extent in zip(reversed(loop_vars), reversed(extents)): - body = tvm.tir.Loop(loop_var, 0, extent, [], body) - return body +from .utils import get_param_list +from .registry import register + + +class ScopeHandler: + """Base class for all scope handlers""" + + def __init__(self, func): + self.func = func + self.body = None + self.node = None + self.context = None + + def signature(self): + return "tir." 
+ self.func.__name__, get_param_list(self.func) + + def enter_scope(self, node, context): + pass + + def exit_scope(self, node, context, arg_list): + self.node = node + self.context = context + return self.func(*arg_list) + + +class WithScopeHandler(ScopeHandler): + """Base class for all with scope handlers""" + + def __init__(self, func, concise_scope, def_symbol): + super().__init__(func) + self.concise_scope = concise_scope + self.def_symbol = def_symbol + + @staticmethod + def get_optional_var_names(node, context): + """Get list of names from ast.With's optional_vars""" + assert isinstance(node, ast.With) + + var_names = None + if isinstance(node.items[0].optional_vars, ast.Name): + var_names = [node.items[0].optional_vars.id] + elif isinstance(node.items[0].optional_vars, (ast.List, ast.Tuple)): + for var in node.items[0].optional_vars.elts: + if not isinstance(var, ast.Name): + context.report_error("Invalid optional var definition") + var_names = [var.id for var in node.items[0].optional_vars.elts] + else: + context.report_error("Invalid optional var definition") + return var_names + + +@register +class Allocate(WithScopeHandler): + """ With scope handler tir.alloc_with_scope(var, extents, dtype, scope, condition) """ + + def __init__(self): + def allocate(extents, dtype, scope, condition=True): + condition = tvm.runtime.convert(condition) + scope = tvm.runtime.convert(scope) + body = tvm.tir.Allocate(self.buffer_var, dtype, extents, condition, self.body) + return tvm.tir.AttrStmt(self.buffer_var, "storage_scope", scope, body) + + super().__init__(allocate, concise_scope=True, def_symbol=True) + self.buffer_var = None + + def enter_scope(self, node, context): + # define buffer vars in symbol table + if isinstance(node, ast.With): + names = WithScopeHandler.get_optional_var_names(node, context) + if len(names) != 1: + context.report_error("Unexpected number of vars") + name = names[0] + elif isinstance(node, ast.Assign): + name = node.targets[0].id + else: + raise Exception("Internal Bug") + + self.buffer_var = tvm.te.var(name, "handle") + context.update_symbol(name, self.buffer_var) + + +@register +class LaunchThread(WithScopeHandler): + """ With scope handler tir.launch_thread(env_var, extent) """ + + def __init__(self): + def launch_thread(env_var, extent): + extent = tvm.runtime.convert(extent) + return tvm.tir.AttrStmt( + tvm.tir.IterVar( + None, + env_var, + getattr(tvm.tir.IterVar, "ThreadIndex"), + self.context.func_var_env_dict[env_var], + ), + "thread_extent", + extent, + self.body, + ) + + super().__init__(launch_thread, concise_scope=True, def_symbol=False) + + +@register +class Realize(WithScopeHandler): + """ With scope handler tir.realize(buffer_bounds, scope, condition) """ + + def __init__(self): + def realize(buffer_bounds, scope, condition=True): + buffer, bounds = buffer_bounds + scope = tvm.runtime.convert(scope) + return tvm.tir.AttrStmt( + buffer, + "realize_scope", + scope, + tvm.tir.BufferRealize(buffer, bounds, condition, self.body), + ) + + super().__init__(realize, concise_scope=True, def_symbol=False) + + +@register +class Attr(WithScopeHandler): + """ With scope handler tir.attr(attr_node, attr_key, value) """ + + def __init__(self): + def attr(attr_node, attr_key, value): + attr_node = tvm.runtime.convert(attr_node) + value = tvm.runtime.convert(value) + return tvm.tir.AttrStmt(attr_node, attr_key, value, self.body) + + super().__init__(attr, concise_scope=True, def_symbol=False) + + +@register +class AssertHandler(WithScopeHandler): + """ With scope 
handler tir.Assert(condition, message) """ + + def __init__(self): + def Assert(condition, message): + return tvm.tir.AssertStmt(condition, tvm.runtime.convert(message), self.body) + + super().__init__(Assert, concise_scope=True, def_symbol=False) + + +@register +class Let(WithScopeHandler): + """ With scope handler tir.let(var, value) """ + + def __init__(self): + def let(var, value): + return tvm.tir.LetStmt(var, value, self.body) + + super().__init__(let, concise_scope=False, def_symbol=False) + + +class ForScopeHandler(ScopeHandler): + """Base class for all for scope handlers""" + + def __init__(self, func): + super().__init__(func) + self.loop_vars = None + + def enter_scope(self, node, context): + assert isinstance(node, ast.For) + + loop_var_names = list() + if isinstance(node.target, ast.Name): + loop_var_names.append(node.target.id) + elif isinstance(node.target, ast.Tuple): + for elt in node.target.elts: + if not isinstance(elt, ast.Name): + context.report_error("Invalid loop var") + loop_var_names.append(elt.id) + else: + context.report_error("Invalid loop var") + + self.loop_vars = [tvm.te.var(name, dtype="int32") for name in loop_var_names] + for loop_var in self.loop_vars: + context.update_symbol(loop_var.name, loop_var) + + +@register +class Serial(ForScopeHandler): + """ For scope handler tir.serial(begin, end)""" + + def __init__(self): + def serial(begin, end): + if len(self.loop_vars) != 1: + self.context.report_error("Expect exact 1 loop var") + ana = tvm.arith.Analyzer() + extent = end if begin == 0 else ana.simplify(end - begin) + return tvm.tir.For(self.loop_vars[0], begin, extent, 0, 0, self.body) + + super().__init__(serial) + + +@register +class Parallel(ForScopeHandler): + """ For scope handler tir.parallel(begin, end)""" + + def __init__(self): + def parallel(begin, end): + if len(self.loop_vars) != 1: + self.context.report_error("Expect exact 1 loop var") + ana = tvm.arith.Analyzer() + extent = end if begin == 0 else ana.simplify(end - begin) + return tvm.tir.For(self.loop_vars[0], begin, extent, 1, 0, self.body) + + super().__init__(parallel) + + +@register +class Vectorized(ForScopeHandler): + """ For scope handler tir.vectorized(begin, end)""" + + def __init__(self): + def vectorized(begin, end): + if len(self.loop_vars) != 1: + self.context.report_error("Expect exact 1 loop var") + ana = tvm.arith.Analyzer() + extent = end if begin == 0 else ana.simplify(end - begin) + return tvm.tir.For(self.loop_vars[0], begin, extent, 2, 0, self.body) + + super().__init__(vectorized) + + +@register +class Unroll(ForScopeHandler): + """ For scope handler tir.unroll(begin, end)""" + + def __init__(self): + def unroll(begin, end): + if len(self.loop_vars) != 1: + self.context.report_error("Expect exact 1 loop var") + ana = tvm.arith.Analyzer() + extent = end if begin == 0 else ana.simplify(end - begin) + return tvm.tir.For(self.loop_vars[0], begin, extent, 3, 0, self.body) + + super().__init__(unroll) diff --git a/python/tvm/script/special_stmt.py b/python/tvm/script/special_stmt.py index 53c01d49d371..31fe0ed7cebf 100644 --- a/python/tvm/script/special_stmt.py +++ b/python/tvm/script/special_stmt.py @@ -14,130 +14,172 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""TVM Script Parser Special Stmt Functions -This module provides the functions registered into parser under special_stmt category. -special_stmt functions don't correspond to an IRNode in the AST directly. 
It is usually -used for some information that is not suitable to be printed directly. -special_stmt can appear as 2 formats -.. code-block:: python - target = tir.name(): - tir.name() -When registering a special stmt, the first two arguments must be parser, node -""" +"""TVM Script Parser Special Stmt Classes""" # pylint: disable=unused-argument, no-self-argument, inconsistent-return-statements +# pylint: disable=relative-beyond-top-level +from typed_ast import ast3 as ast import tvm.tir from tvm import te -from .registry import register_special_stmt - - -@register_special_stmt() -def match_buffer( - parser, - node, - param, - shape, - dtype="float32", - data=None, - strides=None, - elem_offset=None, - scope="global", - align=-1, - offset_factor=0, - buffer_type="default", -): - """Special function match_buffer(var, shape, dtype, data, strides, elem_offset, scope, align, - offset_factor, buffer_type) +from .utils import get_param_list +from .registry import register + + +class SpecialStmt: + """Base class for all Special Stmts""" + + def __init__(self, func, def_symbol): + self.func = func + self.def_symbol = def_symbol + self.node = None + self.context = None + + def signature(self): + return "tir." + self.func.__name__, get_param_list(self.func) + + def handle(self, node, context, arg_list): + self.node = node + self.context = context + return self.func(*arg_list) + + +@register +class MatchBuffer(SpecialStmt): + """Special Stmt match_buffer(var, shape, dtype, data, strides, elem_offset, scope, align, + offset_factor, buffer_type) Example ------- .. code-block:: python A = tir.match_buffer(a, (128, 128), dtype="float32") """ - if param not in parser.params: - parser.report_error("Can not bind non-input param to buffer") - if strides is None: - strides = [] - align = align.value if not isinstance(align, int) else align - offset_factor = offset_factor.value if not isinstance(offset_factor, int) else offset_factor - buffer = tvm.tir.decl_buffer( - shape, - dtype, - parser.target[0], - data, - strides, - elem_offset, - scope, - align, - offset_factor, - buffer_type, - ) - parser.buffer_map[param] = buffer - return buffer - - -@register_special_stmt() -def buffer_decl( - parser, - node, - shape, - dtype="float32", - data=None, - strides=None, - elem_offset=None, - scope="global", - align=-1, - offset_factor=0, - buffer_type="default", -): - """Special function buffer_decl(shape, dtype, data, strides, elem_offset, scope, align, - offset_factor, buffer_type) + def __init__(self): + def match_buffer( + param, + shape, + dtype="float32", + data=None, + strides=None, + elem_offset=None, + scope="global", + align=-1, + offset_factor=0, + buffer_type="default", + ): + assert isinstance(self.node, ast.Assign) + + if param not in self.context.func_params: + self.context.report_error("Can not bind non-input param to buffer") + if strides is None: + strides = [] + align = align.value if not isinstance(align, int) else align + offset_factor = ( + offset_factor.value if not isinstance(offset_factor, int) else offset_factor + ) + buffer = tvm.tir.decl_buffer( + shape, + dtype, + self.node.targets[0].id, + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + ) + self.context.func_buffer_map[param] = buffer + self.context.update_symbol(self.node.targets[0].id, buffer) + + super().__init__(match_buffer, def_symbol=True) + + +@register +class BufferDeclare(SpecialStmt): + """Special Stmt buffer_decl(shape, dtype, data, strides, elem_offset, scope, align, + offset_factor, 
buffer_type) Example ------- .. code-block:: python A = tir.buffer_decl((128, 128), dtype="float32") """ - if strides is None: - strides = [] - align = align.value if not isinstance(align, int) else align - offset_factor = offset_factor.value if not isinstance(offset_factor, int) else offset_factor - buffer = tvm.tir.decl_buffer( - shape, - dtype, - parser.target[0], - data, - strides, - elem_offset, - scope, - align, - offset_factor, - buffer_type, - ) - return buffer - - -@register_special_stmt() -def var(parser, node, dtype): + def __init__(self): + def buffer_decl( + shape, + dtype="float32", + data=None, + strides=None, + elem_offset=None, + scope="global", + align=-1, + offset_factor=0, + buffer_type="default", + ): + assert isinstance(self.node, ast.Assign) + + if strides is None: + strides = [] + align = align.value if not isinstance(align, int) else align + offset_factor = ( + offset_factor.value if not isinstance(offset_factor, int) else offset_factor + ) + buffer = tvm.tir.decl_buffer( + shape, + dtype, + self.node.targets[0].id, + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + ) + self.context.update_symbol(self.node.targets[0].id, buffer) + return buffer + + super().__init__(buffer_decl, def_symbol=True) + + +@register +class VarDef(SpecialStmt): """ Special function for defining a Var""" - return te.var(parser.target[0], dtype) + def __init__(self): + def var(dtype): + assert isinstance(self.node, ast.Assign) + v = te.var(self.node.targets[0].id, dtype) + self.context.update_symbol(v.name, v) -@register_special_stmt() -def env_thread(parser, node, env_name): + super().__init__(var, def_symbol=True) + + +@register +class EnvThread(SpecialStmt): """ Bind a var to thread env """ - v = te.var(parser.target[0]) - parser.var_env_dict[v] = env_name - return v + def __init__(self): + def env_thread(env_name): + assert isinstance(self.node, ast.Assign) + v = te.var(self.node.targets[0].id) + self.context.func_var_env_dict[v] = env_name + self.context.update_symbol(v.name, v) + + super().__init__(env_thread, def_symbol=True) -@register_special_stmt() -def func_attr(parser, node, dict_attr): - """Special function for declaring the DictAttr of PrimFunc + +@register +class FuncAttr(SpecialStmt): + """Special Stmt for declaring the DictAttr of PrimFunc Example ------- .. 
code-block:: python tir.func_attr({"tir.noalias": True, "global_symbol"}) """ - parser.dict_attr = dict_attr + def __init__(self): + def func_attr(dict_attr): + self.context.func_dict_attr = dict_attr + + super().__init__(func_attr, def_symbol=False) diff --git a/python/tvm/script/ty.py b/python/tvm/script/ty.py index 430a746fff40..1d7871624eb5 100644 --- a/python/tvm/script/ty.py +++ b/python/tvm/script/ty.py @@ -23,14 +23,15 @@ import tvm -class TypeGeneric: +class TypeGeneric: # pylint: disable=too-few-public-methods """Base class for all the TVM script typing class""" def evaluate(self): + """Return an actual ir.Type Object that this Generic class wraps""" raise TypeError("Cannot get tvm.Type from a generic type") -class ConcreteType(TypeGeneric): +class ConcreteType(TypeGeneric): # pylint: disable=too-few-public-methods """TVM script typing class for uniform Type objects""" def __init__(self, vtype): diff --git a/python/tvm/script/utils.py b/python/tvm/script/utils.py index f510ddb906aa..ef6736f3e98b 100644 --- a/python/tvm/script/utils.py +++ b/python/tvm/script/utils.py @@ -17,93 +17,29 @@ """Helper functions in TVM Script Parser""" import inspect -from tvm import IRModule -from . import _ffi_api -from .parser import from_source +def get_param_list(func): + """Get the parameter list from definition of function""" + full_arg_spec = inspect.getfullargspec(func) -def create_module(functions=None): - """Construct a module from list of functions. + args, defaults = full_arg_spec.args, full_arg_spec.defaults - Parameters - ----------- - functions: Optional[dict]. - Map of GlobalVar or str to PrimFunc + if defaults is None: + defaults = tuple() - Returns - ------- - mod : IRModule - An IRModule containing the passed definitions - """ + if full_arg_spec.varkw is not None: + raise RuntimeError( + "TVM Script register error : variable keyword argument is not supported now" + ) + if not len(full_arg_spec.kwonlyargs) == 0: + raise RuntimeError("TVM Script register error : keyword only argument is not supported now") - return IRModule(functions=functions) + pos_only = list() + for arg in args[: len(args) - len(defaults)]: + pos_only.append(arg) + kwargs = list() + for default, arg in zip(defaults, args[len(args) - len(defaults) :]): + kwargs.append((arg, default)) - -def asscript(input_ir, show_meta=False): - """Transform a PrimFunc or IRModule to python syntax script - - Parameters - ---------- - input_ir : Union[PrimFunc, IRModule] - The PrimFunc or IRModule to be dumped - - show_meta : bool - Whether show meta - - Returns - ------- - script : str - The Python script - """ - - return _ffi_api.AsTVMScript(input_ir, show_meta) - - -def tir(script_in): - """Decorate a python function or class as tvm script. - - The tvm function or parsing support parsing to the internal TIR. - - Returns - ------- - output : Union[Function, Module] - The Function or Module in IR. - """ - - if inspect.isfunction(script_in): - return _parse(script_in) - - if inspect.isclass(script_in): - return TVMScriptClass(script_in) - - raise TypeError("Only function and class are supported") - - -def module(script_in): - """Decorate a python function or class as tvm script. - - Alias for tvm.script.tir for now. - - Returns - ------- - output : Union[Function, Module] - The Function or Module in IR. 
-    """
-    return tir(script_in)
-
-
-class TVMScriptClass:
-    """Helper class for decorating a class"""
-
-    def __init__(self, script_in):
-        self.script = script_in
-
-    def __call__(self, *args, **kwargs):
-        # call the parser to transform tvm script into TIR
-        return _parse(self.script)
-
-
-def _parse(script_in):
-    """Helper function to parse TVM script into TIR"""
-    return from_source(inspect.getsource(script_in), inspect.getsourcelines(script_in)[1])
+    return pos_only, kwargs, full_arg_spec.varargs
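The new `register` decorator plus the `ScopeHandler`/`SpecialStmt` base classes replace the
per-category decorators removed above. For concreteness, a minimal sketch of registering a
custom special stmt under the new design: the `int_const` name and its body are illustrative
assumptions, not part of this patch; only `register`, `SpecialStmt`, `update_symbol`, and the
`def_symbol` convention come from the code above.

.. code-block:: python

    import tvm
    from tvm.script.registry import register
    from tvm.script.special_stmt import SpecialStmt


    @register
    class IntConst(SpecialStmt):
        """Hypothetical special stmt, used as: x = tir.int_const(7)"""

        def __init__(self):
            def int_const(value, dtype="int32"):
                # self.node / self.context are filled in by SpecialStmt.handle()
                const = tvm.tir.IntImm(dtype, value)
                self.context.update_symbol(self.node.targets[0].id, const)
                return const

            super().__init__(int_const, def_symbol=True)

Registration instantiates the class once; `signature()` derives the user-facing name
("tir.int_const") from the inner function, and `get_param_list` validates its parameters.
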
From 7607ade2d8d12c78da585851febc4863335a9a0e Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Thu, 22 Oct 2020 08:04:44 -0700
Subject: [PATCH 031/258] [AutoScheduler] Use tempfile in tutorials (#6728)

* Use tempfile in tutorials

* address comment

* Update tutorials/auto_scheduler/tune_matmul_x86.py

* Update tutorials/auto_scheduler/tune_conv2d_layer_cuda.py

Co-authored-by: Lianmin Zheng
---
 tests/scripts/task_python_docs.sh             |  1 +
 .../auto_scheduler/tune_conv2d_layer_cuda.py  | 43 +++++++++++--------
 tutorials/auto_scheduler/tune_matmul_x86.py   | 26 ++++++-----
 3 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh
index 98dac93ac98f..e0165b66578f 100755
--- a/tests/scripts/task_python_docs.sh
+++ b/tests/scripts/task_python_docs.sh
@@ -37,6 +37,7 @@ rm -rf docs/_build
 mkdir -p docs/_build/html
 rm -rf docs/gen_modules
 rm -rf docs/doxygen
+rm -rf tutorials/auto_scheduler/auto_scheduler_logs
 
 # remove stale tutorials and always build from scratch.
 rm -rf docs/tutorials
diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
index cb2126dec911..18c2e85b2d33 100644
--- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
+++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
@@ -32,6 +32,8 @@
 We use a convolution layer as an example in this tutorial.
 """
 
+import os
+
 import numpy as np
 import tvm
 from tvm import te, auto_scheduler, topi
@@ -88,11 +90,15 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding):
 # * see :any:`auto_scheduler.TuningOptions`,
 #   :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters.
 
+if not os.path.exists("./auto_scheduler_logs"):
+    os.mkdir("./auto_scheduler_logs")
+
+logfile = os.path.join("./auto_scheduler_logs", "conv2d.json")
 measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
 tune_option = auto_scheduler.TuningOptions(
     num_measure_trials=10,
     runner=measure_ctx.runner,
-    measure_callbacks=[auto_scheduler.RecordToFile("conv2d.json")],
+    measure_callbacks=[auto_scheduler.RecordToFile(logfile)],
 )
 
 ######################################################################
@@ -157,17 +163,17 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding):
 # print the equivalent python schedule API, and build the binary again.
 
 # Load the measurement record for the best schedule
-# inp, res = auto_scheduler.load_best("conv2d.json", task.workload_key)
+inp, res = auto_scheduler.load_best(logfile, task.workload_key)
 
 # Print equivalent python schedule API. This can be used for debugging and
 # learning the behavior of the auto-scheduler.
 print("Equivalent python schedule:")
-# print(task.compute_dag.print_python_code_from_state(inp.state))
+print(task.compute_dag.print_python_code_from_state(inp.state))
 
 # Rebuild the binary. This shows how you can apply the best schedule from a
 # log file without rerunning the search again.
-# sch, args = task.compute_dag.apply_steps_from_state(inp.state)
-# func = tvm.build(sch, args, target)
+sch, args = task.compute_dag.apply_steps_from_state(inp.state)
+func = tvm.build(sch, args, target)
 
 ######################################################################
 # A more complicated example is to resume the search.
@@ -176,19 +182,18 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding):
 # In the example below we resume the status and do 5 more trials.
 
 
-# log_file = "conv2d.json"
-# cost_model = auto_scheduler.XGBModel()
-# cost_model.update_from_file(log_file)
-# search_policy = auto_scheduler.SketchPolicy(
-#     task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
-# )
-# measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
-# tune_option = auto_scheduler.TuningOptions(
-#     num_measure_trials=5,
-#     runner=measure_ctx.runner,
-#     measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-# )
-# sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option)
+cost_model = auto_scheduler.XGBModel()
+cost_model.update_from_file(logfile)
+search_policy = auto_scheduler.SketchPolicy(
+    task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(logfile)]
+)
+measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
+tune_option = auto_scheduler.TuningOptions(
+    num_measure_trials=5,
+    runner=measure_ctx.runner,
+    measure_callbacks=[auto_scheduler.RecordToFile(logfile)],
+)
+sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option)
 
 # Kill the measurement process
-# del measure_ctx
+del measure_ctx
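With `func` rebuilt from the log, the kernel can be sanity-checked and timed like any other
compiled TVM function. A minimal sketch, assuming a CUDA device is available and reusing the
layer shapes of the task above (1x512x7x7 input, 3x3 kernel, stride 1, padding 1); `func` and
`task` come from the tutorial code, everything else is illustrative:

.. code-block:: python

    import numpy as np
    import tvm

    # Random inputs matching conv2d_layer(1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1))
    data_np = np.random.uniform(size=(1, 512, 7, 7)).astype(np.float32)
    weight_np = np.random.uniform(size=(512, 512, 3, 3)).astype(np.float32)
    bias_np = np.random.uniform(size=(1, 512, 1, 1)).astype(np.float32)
    out_np = np.zeros((1, 512, 7, 7), dtype=np.float32)

    ctx = tvm.gpu()
    args = [tvm.nd.array(x, ctx=ctx) for x in (data_np, weight_np, bias_np, out_np)]
    func(*args)

    # The median of repeated runs gives a stable execution-time estimate
    evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500)
    print("Execution time: %.3f ms" % (np.median(evaluator(*args).results) * 1000))
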
diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py
index 5c039b17a958..9e9423f20a45 100644
--- a/tutorials/auto_scheduler/tune_matmul_x86.py
+++ b/tutorials/auto_scheduler/tune_matmul_x86.py
@@ -29,6 +29,8 @@
 We use matrix multiplication as an example in this tutorial.
 """
 
+import os
+
 import numpy as np
 import tvm
 from tvm import te, auto_scheduler
@@ -80,8 +82,12 @@ def matmul_add(N, L, M, dtype):
 # and do more analyses later.
 # * see :any:`auto_scheduler.TuningOptions` for more parameters
 
+if not os.path.exists("./auto_scheduler_logs"):
+    os.mkdir("./auto_scheduler_logs")
+
+logfile = os.path.join("./auto_scheduler_logs", "matmul.json")
 tune_option = auto_scheduler.TuningOptions(
-    num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile("matmul.json")]
+    num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile(logfile)]
 )
 
@@ -141,17 +147,17 @@ def matmul_add(N, L, M, dtype):
 # print the equivalent python schedule API, and build the binary again.
 
 # Load the measurement record for the best schedule
-# inp, res = auto_scheduler.load_best("matmul.json", task.workload_key)
+inp, res = auto_scheduler.load_best(logfile, task.workload_key)
 
 # Print equivalent python schedule API. This can be used for debugging and
 # learning the behavior of the auto-scheduler.
 print("Equivalent python schedule:")
-# print(task.compute_dag.print_python_code_from_state(inp.state))
+print(task.compute_dag.print_python_code_from_state(inp.state))
 
 # Rebuild the binary. This shows how you can apply the best schedule from a
 # log file without rerunning the search again.
-# sch, args = task.compute_dag.apply_steps_from_state(inp.state)
-# func = tvm.build(sch, args)
+sch, args = task.compute_dag.apply_steps_from_state(inp.state)
+func = tvm.build(sch, args)
 
 ######################################################################
 # A more complicated example is to resume the search.
@@ -160,19 +166,19 @@ def matmul_add(N, L, M, dtype):
 # In the example below we resume the status and do 5 more trials.
 
 
-def resume_search(task, log_file):
+def resume_search(task, logfile_name):
     cost_model = auto_scheduler.XGBModel()
-    cost_model.update_from_file(log_file)
+    cost_model.update_from_file(logfile_name)
     search_policy = auto_scheduler.SketchPolicy(
-        task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
+        task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(logfile_name)]
     )
     tune_option = auto_scheduler.TuningOptions(
-        num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]
+        num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(logfile_name)]
     )
     sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option)
 
 
-# resume_search(task, "matmul.json")
+# resume_search(task, logfile)
 
 ######################################################################
 # .. note::

From f8f17a3d8b1e52b5016303efa26b3ebcabb9435d Mon Sep 17 00:00:00 2001
From: Xingjian Shi
Date: Thu, 22 Oct 2020 08:46:35 -0700
Subject: [PATCH 032/258] [FIX] Fix cublas batch matmul (#6715)

* Update batch_matmul.py

Update batch_matmul.py

* fix
---
 python/tvm/topi/cuda/batch_matmul.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py
index bb060b3ad8a7..ee94420066dd 100644
--- a/python/tvm/topi/cuda/batch_matmul.py
+++ b/python/tvm/topi/cuda/batch_matmul.py
@@ -138,7 +138,7 @@ def _callback(op):
     return s
 
 
-def batch_matmul_cublas(x, y):
+def batch_matmul_cublas(x, y, out_shape=None):
     """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are
     data in batch.
 
@@ -150,6 +150,9 @@ def batch_matmul_cublas(x, y):
     y : tvm.te.Tensor
         3-D with shape [batch, N, K]
 
+    out_shape : tuple, optional
+        The output shape
+
     Returns
     -------
     output : tvm.te.Tensor

From 4ab71f47c2a5a2a85a4be838a2741fefe5392d48 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Thu, 22 Oct 2020 09:38:20 -0700
Subject: [PATCH 033/258] [FIX,CMAKE] Use set_property with append flag instead of (#6725)

set_target_properties.

set_target_properties does not append to existing properties. There were a
couple of places where previously set properties were overridden with
different properties. For example, the debug flags for relay were not set
correctly because set_target_properties was called twice in a row with
different options.
--- CMakeLists.txt | 14 +++++++------- cmake/modules/VTA.cmake | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d07f55f06ad0..8c873a8016e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -405,23 +405,23 @@ endif() if(USE_RELAY_DEBUG) message(STATUS "Building Relay in debug mode...") - set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "USE_RELAY_DEBUG") - set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "DMLC_LOG_DEBUG") + target_compile_definitions(tvm_objs PRIVATE "USE_RELAY_DEBUG") + target_compile_definitions(tvm_objs PRIVATE "DMLC_LOG_DEBUG") else() - set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "NDEBUG") + target_compile_definitions(tvm_objs PRIVATE "NDEBUG") endif(USE_RELAY_DEBUG) if(USE_FALLBACK_STL_MAP) message(STATUS "Building with STL Map...") - set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "USE_FALLBACK_STL_MAP=1") + target_compile_definitions(tvm_objs PRIVATE "USE_FALLBACK_STL_MAP=1") else() message(STATUS "Building with TVM Map...") - set_target_properties(tvm_objs PROPERTIES COMPILE_DEFINITIONS "USE_FALLBACK_STL_MAP=0") + target_compile_definitions(tvm_objs PRIVATE "USE_FALLBACK_STL_MAP=0") endif(USE_FALLBACK_STL_MAP) if(BUILD_FOR_HEXAGON) # Wrap pthread_create to allow setting custom stack size. - set_target_properties(tvm_runtime PROPERTIES LINK_FLAGS + set_property(TARGET tvm_runtime APPEND PROPERTY LINK_FLAGS "-Wl,--wrap=pthread_create") target_include_directories(tvm_runtime SYSTEM @@ -488,7 +488,7 @@ if(GTEST_INCLUDE_DIR AND GTEST_LIB) add_executable(${__execname} ${__srcpath}) list(APPEND TEST_EXECS ${__execname}) target_include_directories(${__execname} SYSTEM PUBLIC ${GTEST_INCLUDE_DIR}) - target_link_libraries(${__execname} ${TVM_TEST_LIBRARY_NAME} ${GTEST_LIB} pthread dl) + target_link_libraries(${__execname} PRIVATE ${TVM_TEST_LIBRARY_NAME} ${GTEST_LIB} pthread dl) set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_ALL 1) set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1) endforeach() diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index a9fc66507d35..115216680fff 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -65,7 +65,7 @@ elseif(PYTHON) target_compile_definitions(vta_fsim PUBLIC ${__strip_def}) endforeach() if(APPLE) - set_target_properties(vta_fsim PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + set_property(TARGET vta_fsim APPEND PROPERTY LINK_FLAGS "-undefined dynamic_lookup") endif(APPLE) target_compile_definitions(vta_fsim PUBLIC USE_FSIM_TLPP) endif() @@ -86,7 +86,7 @@ elseif(PYTHON) target_compile_definitions(vta_tsim PUBLIC ${__strip_def}) endforeach() if(APPLE) - set_target_properties(vta_tsim PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + set_property(TARGET vta_fsim APPEND PROPERTY LINK_FLAGS "-undefined dynamic_lookup") endif(APPLE) endif() From 9769adde1e3c460a6b3890a880694db63f88238b Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Thu, 22 Oct 2020 10:05:15 -0700 Subject: [PATCH 034/258] [COMMUNITY] junrushao1994 -> committer (#6719) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 9b2faf78d8bc..358d42cf7a19 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -59,6 +59,7 @@ We do encourage everyone to work anything they are interested in. 
- [Jared Roesch](https://github.com/jroesch) (PPMC): @jroesch - relay - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends - [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang +- [Junru Shao](https://github.com/junrushao1994) @junrushao1994 - relay, compiler - [Haichen Shen](https://github.com/icemelon9) (PPMC): @icemelon9 - relay, topi - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web - [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch - topi, compiler, runtime From 3f81fd91e9466745709379a7080fb9edc59e1173 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 22 Oct 2020 13:43:52 -0500 Subject: [PATCH 035/258] [LLVM] Create fixed vector size according to latest LLVM12+ changes (#6717) The vector handling code in LLVM keeps evolving to accommodate scalable vectors. As a result, code related to vector sizes changes quite often. --- src/target/llvm/codegen_llvm.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index cb04e6b8055b..40ec2cc9e0b8 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -475,7 +475,9 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { llvm::Constant* undef = llvm::UndefValue::get(type); llvm::Constant* zero = ConstInt32(0); value = builder_->CreateInsertElement(undef, value, zero); -#if TVM_LLVM_VERSION >= 110 +#if TVM_LLVM_VERSION >= 120 + llvm::Constant* mask = llvm::ConstantVector::getSplat(llvm::ElementCount::getFixed(lanes), zero); +#elif TVM_LLVM_VERSION >= 110 llvm::Constant* mask = llvm::ConstantVector::getSplat(llvm::ElementCount(lanes, /*Scalable=*/false), zero); #else From cc21fea0c6dd0a111cc26e8200aaccf62b781692 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 23 Oct 2020 10:17:53 -0700 Subject: [PATCH 036/258] [AutoScheduler] Guarantee init population sampling outputs a valid set (#6713) --- .../auto_scheduler/cost_model/xgb_model.py | 2 +- python/tvm/auto_scheduler/search_policy.py | 3 +- src/auto_scheduler/compute_dag.cc | 2 +- .../search_policy/sketch_policy.cc | 70 ++++++++++++++++--- .../search_policy/sketch_policy.h | 12 ++-- ...test_auto_scheduler_evolutionary_search.py | 2 - tests/scripts/task_python_docs.sh | 6 +- tutorials/auto_scheduler/conv2d.json | 1 + tutorials/auto_scheduler/matmul.json | 2 + .../auto_scheduler/tune_conv2d_layer_cuda.py | 6 +- tutorials/auto_scheduler/tune_matmul_x86.py | 6 +- 11 files changed, 85 insertions(+), 27 deletions(-) create mode 100644 tutorials/auto_scheduler/conv2d.json create mode 100644 tutorials/auto_scheduler/matmul.json diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py index 9a534aa96af5..b8953c1db63b 100644 --- a/python/tvm/auto_scheduler/cost_model/xgb_model.py +++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py @@ -192,7 +192,7 @@ def predict(self, task, states): else: ret = np.random.uniform(0, 1, (len(states),)) - # Predict 0 for invalid states that failed to be lowered. + # Predict -inf for invalid states that failed to be lowered. 
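        # An all-zero feature vector is the marker produced when feature
        # extraction fails for a state (its schedule could not be lowered);
        # scoring it float("-inf") rather than 0 guarantees such a state can
        # never outrank a genuinely measured candidate later in the search.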
for idx, feature in enumerate(features): if feature.min() == feature.max() == 0: ret[idx] = float("-inf") diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py index f3d459e4a7d2..838ced1806aa 100644 --- a/python/tvm/auto_scheduler/search_policy.py +++ b/python/tvm/auto_scheduler/search_policy.py @@ -148,10 +148,11 @@ class SketchPolicy(SearchPolicy): DEFAULT_PARAMS = { "eps_greedy": 0.05, "retry_search_one_round_on_empty": 10, + "sample_init_population": 50, + "sample_init_use_measured_ratio": 0.2, "evolutionary_search_population": 2048, "evolutionary_search_num_iters": 10, "evolutionary_search_mutation_prob": 0.85, - "evolutionary_search_use_measured_ratio": 0.2, "cpu_multi_level_tiling_structure": "SSRSRS", "gpu_multi_level_tiling_structure": "SSSRRSRS", # Notice: the default thread bind policy of GPU assumes the tiling structure to have at diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc index 3b0de974617c..75fd27ef9fa8 100755 --- a/src/auto_scheduler/compute_dag.cc +++ b/src/auto_scheduler/compute_dag.cc @@ -1175,7 +1175,7 @@ Array ComputeDAG::InferBound(const Array& states) const { support::parallel_for(0, states.size(), [this, &states, &out_states](int i) { try { - out_states.Set(i, this->InferBound(states[i])); + out_states.Set(i, (states[i].defined()) ? this->InferBound(states[i]) : states[i]); } catch (dmlc::Error& e) { LOG(WARNING) << "InferBound fails on the state:\n" << states[i] << "\n" diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index 8de17a626707..60178b342e62 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -258,12 +258,12 @@ std::pair, Array> SketchPolicyNode::ContinueS Array SketchPolicyNode::SearchOneRound(int num_random_states, Array* random_states) { // Get parameters - int population = GetIntParam(params, SketchParamKey::EvolutionarySearch::population); - int num_use_measured = - std::min(static_cast(measured_states_vector_.size()), - static_cast( - GetDoubleParam(params, SketchParamKey::EvolutionarySearch::use_measured_ratio) * - population)); + int population = GetIntParam(params, SketchParamKey::SampleInitPopulation::population); + int num_use_measured = std::min( + static_cast(measured_states_vector_.size()), + static_cast( + GetDoubleParam(params, SketchParamKey::SampleInitPopulation::use_measured_ratio) * + population)); bool is_cost_model_reasonable = !program_cost_model->IsInstance(); // 1. 
Generate sketches
@@ -374,10 +374,14 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches
   }
 
   auto tic_begin = std::chrono::high_resolution_clock::now();
-  while (static_cast<int>(out_states.size()) < out_size && fail_ct < out_size) {
+  size_t iter = 1;
+  size_t target_size = out_size;
+  size_t unchange_cnt = 0;
+  while (out_states.size() < target_size) {
     std::vector<State> temp_states(out_size);
 
-    support::parallel_for(0, out_size - out_states.size(),
+    // Initialize a batch of states randomly
+    support::parallel_for(0, out_size,
                           [this, &temp_states, &sketches, &rand_gens](int index) {
                             // Randomly choose a sketch
                             State tmp_s = sketches[(rand_gens[index])() % sketches.size()];
@@ -395,13 +399,57 @@
                             }
                           });
 
-    for (int i = 0; i < out_size; i++) {
-      if (temp_states[i].defined()) {
-        out_states.push_back(std::move(temp_states[i]));
+    // Filter out the states that failed to apply initial rules
+    Array<State> cand_states;
+    for (auto tmp_s : temp_states) {
+      if (tmp_s.defined()) {
+        cand_states.push_back(std::move(tmp_s));
       } else {
         fail_ct++;
       }
     }
+
+    unchange_cnt++;
+    if (!cand_states.empty()) {
+      // Run the cost model to filter out states that failed to extract features.
+      // This may happen due to illegal schedules or schedules that use too much
+      // memory on GPU.
+      std::vector<float> pop_scores;
+      pop_scores.reserve(cand_states.size());
+      cand_states = search_task->compute_dag.InferBound(cand_states);
+      program_cost_model->Predict(search_task, cand_states, &pop_scores);
+
+      for (size_t i = 0; i < cand_states.size(); i++) {
+        if (pop_scores[i] > -1e10) {
+          out_states.push_back(std::move(cand_states[i]));
+          unchange_cnt = 0;  // Reset the counter once we found a valid state
+        } else {
+          fail_ct++;
+        }
+      }
+    }
+
+    if (iter % 5 == 0) {
+      double duration = std::chrono::duration_cast<std::chrono::duration<double>>(
+                            std::chrono::high_resolution_clock::now() - tic_begin)
+                            .count();
+      StdCout(verbose) << "Sample Iter: " << iter << std::fixed << std::setprecision(4)
+                       << "\t#Pop: " << out_states.size() << "\t#Target: " << target_size
+                       << "\tfail_ct: " << fail_ct << "\tTime elapsed: " << std::fixed
+                       << std::setprecision(2) << duration << std::endl;
+    }
+
+    if (unchange_cnt == 5) {
+      // Reduce the target size to avoid spending too long in this phase if no
+      // valid state has been found in the past iterations
+      if (target_size > 1) {
+        target_size /= 2;
+        StdCout(verbose) << "#Target has been reduced to " << target_size
+                         << " due to too many failures" << std::endl;
+      }
+      unchange_cnt = 0;
+    }
+    iter++;
   }
 
   double duration = std::chrono::duration_cast<std::chrono::duration<double>>(
diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h
index edaa89e6cfd6..930fd5ecbc4b 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.h
+++ b/src/auto_scheduler/search_policy/sketch_policy.h
@@ -56,16 +56,20 @@ struct SketchParamKey {
   /*! \brief Retry several times if SearchOneRound gets no valid state. */
   static constexpr const char* empty_retry_count = "retry_search_one_round_on_empty";
 
+  struct SampleInitPopulation {
+    /*! \brief The population size of initial sampling. */
+    static constexpr const char* population = "sample_init_population";
+    /*! \brief The maximum percentage of measured states in the initial sampling. */
+    static constexpr const char* use_measured_ratio = "sample_init_use_measured_ratio";
+  };
+
   struct EvolutionarySearch {
-    /*! \brief The population size for evolutionary search. */
+    /*! \brief The population size of evolutionary search. 
*/ static constexpr const char* population = "evolutionary_search_population"; /*! \brief The number of iterations performed by generic algorithm.*/ static constexpr const char* num_iters = "evolutionary_search_num_iters"; /*! \brief The mutation probability.*/ static constexpr const char* mutation_prob = "evolutionary_search_mutation_prob"; - /*! \brief The maximum percentage of measured states in the initial population for evolutionary - * search. */ - static constexpr const char* use_measured_ratio = "evolutionary_search_use_measured_ratio"; }; struct MultiLevelTiling { diff --git a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py index 9fec6f15a6c4..4acfa3908cc6 100644 --- a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py +++ b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py @@ -44,7 +44,6 @@ def is_good_state(state): def predict(self, task, states): scores = [] - found = False for state in states: scores.append(1 if self.is_good_state(state) else 0) return scores @@ -89,7 +88,6 @@ def is_good_state(state): def predict(self, task, states): scores = [] - found = False for state in states: scores.append(1 if self.is_good_state(state) else 0) return scores diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index e0165b66578f..e279b908329d 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -37,7 +37,11 @@ rm -rf docs/_build mkdir -p docs/_build/html rm -rf docs/gen_modules rm -rf docs/doxygen -rm -rf tutorials/auto_scheduler/auto_scheduler_logs + +# prepare auto scheduler tutorials +rm -rf tutorials/auto_scheduler/*logs +mkdir tutorials/auto_scheduler/logs +cp -f tutorials/auto_scheduler/{matmul,conv2d}.json tutorials/auto_scheduler/logs # remove stale tutorials and always build from scratch. 
rm -rf docs/tutorials diff --git a/tutorials/auto_scheduler/conv2d.json b/tutorials/auto_scheduler/conv2d.json new file mode 100644 index 000000000000..10f63d0d4c8a --- /dev/null +++ b/tutorials/auto_scheduler/conv2d.json @@ -0,0 +1 @@ +{"i": [["[\"conv2d_layer\", 1, 7, 7, 512, 512, 3, 3, [1, 1], [1, 1]]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32"], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 512, [1, 64, 2, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 7, [1, 1, 7, 1], 1], ["SP", 3, 20, 512, [4, 2], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 48, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 504, [4], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000429498], 0, 1.59126, 1603259147], "v": "v0.2"} diff --git a/tutorials/auto_scheduler/matmul.json b/tutorials/auto_scheduler/matmul.json new file mode 100644 index 000000000000..7f537641281a --- /dev/null +++ b/tutorials/auto_scheduler/matmul.json @@ -0,0 +1,2 @@ +# Keep a valid schedule for demonstraction +{"i": [["[\"matmul_add\", 128, 128, 128, \"float32\"]", "llvm -keys=cpu"], [[], [["SP", 2, 0, 128, [4, 2, 4], 1], ["SP", 2, 4, 128, [1, 32, 2], 1], ["SP", 2, 8, 128, [2], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FSP", 4, 0, 0, 1], ["FSP", 4, 2, 1, 1], ["RE", 4, [0, 2, 1, 3]], ["CA", 2, 4, 1], ["FU", 4, [0, 1]], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$0"], ["AN", 2, 9, 2]]]], "r": [[5.80388e-05], 0, 0.299169, 1603402396], "v": "v0.2"} diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index 18c2e85b2d33..68fa5d597f66 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -90,10 +90,10 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): # * see :any:`auto_scheduler.TuningOptions`, # :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. -if not os.path.exists("./auto_scheduler_logs"): - os.mkdir("./auto_scheduler_logs") +if not os.path.exists("./logs"): + os.mkdir("./logs") -logfile = os.path.join("./auto_scheduler_logs", "conv2d.json") +logfile = os.path.join("./logs", "conv2d.json") measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) tune_option = auto_scheduler.TuningOptions( num_measure_trials=10, diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 9e9423f20a45..a2331fcc9835 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -82,10 +82,10 @@ def matmul_add(N, L, M, dtype): # and do more analyses later. 
# * see :any:`auto_scheduler.TuningOptions` for more parameters -if not os.path.exists("./auto_scheduler_logs"): - os.mkdir("./auto_scheduler_logs") +if not os.path.exists("./logs"): + os.mkdir("./logs") -logfile = os.path.join("./auto_scheduler_logs", "matmul.json") +logfile = os.path.join("./logs", "matmul.json") tune_option = auto_scheduler.TuningOptions( num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile(logfile)] ) From ea96ff039dc2acdbe85889a5c63588ae5e12eca5 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Fri, 23 Oct 2020 11:10:33 -0700 Subject: [PATCH 037/258] [Relay] Minor fix for some TF OD models (#6729) * Minor fix for some tf od models * More fix * Minor fix * Fix lint * Minor fix --- include/tvm/topi/transform.h | 20 ++++++++++++----- python/tvm/relay/frontend/tensorflow.py | 22 ++++++++++++++----- python/tvm/relay/op/_transform.py | 18 +++++++++++++++ .../python/topi/python/test_topi_transform.py | 1 + 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index e01eb703cb99..aa5c6d2a2256 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -897,8 +897,14 @@ inline Tensor where(const Tensor& condition, const Tensor& x, const Tensor& y, return compute( condition->shape, [&](const Array& indices) { - Array condition_idx{indices[0]}; - return tvm::tir::Select(condition(condition_idx) != 0, x(), y()); + PrimExpr cond; + if (condition->shape.size() == 0) { + cond = condition(); + } else { + Array condition_idx{indices[0]}; + cond = condition(condition_idx); + } + return tvm::tir::Select(cond != 0, x(), y()); }, name, tag); } else if (condition->shape.size() != 1) { @@ -913,9 +919,13 @@ inline Tensor where(const Tensor& condition, const Tensor& x, const Tensor& y, }, name, tag); } else { - CHECK_EQ(topi::GetConstInt(condition->shape[0]), topi::GetConstInt(x->shape[0])) - << "If condition is 1-D, the first dimension must be the same as x: " << condition->shape[0] - << " vs " << x->shape[0]; + int64_t cond_first_dim = topi::GetConstInt(condition->shape[0]); + int64_t x_first_dim = topi::GetConstInt(x->shape[0]); + if (cond_first_dim > 0 && x_first_dim > 0) { + CHECK_EQ(cond_first_dim, x_first_dim) + << "If condition is 1-D, the first dimension must be the same as x: " << cond_first_dim + << " vs " << x_first_dim; + } return compute( x->shape, [&](const Array& indices) { diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 3df582a0c76a..9671e45a59a3 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1549,7 +1549,7 @@ def _impl(inputs, attr, params, mod): idx += st # Only return when in_shape is fully static in the range from begin to end. 
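            # Note on the one-character fix below: a complete walk of the range
            # [begin, ed) in steps of `st` always ends with idx >= ed, whereas
            # the old `idx >= st` guard (assuming begin >= 0 and st > 0) was
            # already satisfied after the first step.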
- if idx >= st: + if idx >= ed: ret = _expr.const(out_data, dtype) if shrink_axis_mask: ret = _op.squeeze(ret) @@ -1659,14 +1659,26 @@ def _transform_mask(stride_dim, ellipsis_mask): def _pad(name): def _impl(inputs, attr, params, mod): - padlist = _get_param(params, inputs[1]) - paddings = tuple(tuple(l) for l in padlist) + try: + padlist = _get_param(params, inputs[1]) + except (IndexError, KeyError, AttributeError): + try: + padlist = _infer_value(inputs[1], params, mod).asnumpy().tolist() + except Exception: + padlist = inputs[1] + + if isinstance(padlist, _expr.Expr): + paddings = padlist + else: + paddings = tuple(tuple(l) for l in padlist) attr["pad_width"] = paddings attr["pad_value"] = 0 new_inputs = [inputs[0]] if name == "PadV2": - constant_values = _get_num_param(params, inputs[2]) - attr["pad_value"] = constant_values + try: + attr["pad_value"] = _get_num_param(params, inputs[2]) + except (IndexError, KeyError, AttributeError): + attr["pad_value"] = inputs[2] return AttrCvt( op_name="pad", ignores=["Tpaddings"], diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 3b70d78cf967..b135901baac3 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -789,6 +789,9 @@ def repeat_shape_func(attrs, inputs, _): @_reg.register_shape_func("broadcast_to_like", False) def broadcast_to_like_shape_func(attrs, inputs, _): + """ + Shape func for broadcast_to_like. + """ return [topi.math.identity(inputs[1])] @@ -809,7 +812,22 @@ def _stack_shape_func(data_shape, axis, num_inputs): @_reg.register_shape_func("stack", False) def stack_shape_func(attrs, inputs, _): + """ + Shape func for stack. + """ axis = get_const_int(attrs.axis) if axis < 0: axis += inputs[0].shape[0] + 1 return [_stack_shape_func(inputs[0], convert(axis), convert(len(inputs)))] + + +@_reg.register_shape_func("where", False) +def where_shape_func(attrs, inputs, _): + """ + Shape func for where. + """ + cond_shape = inputs[0] + x_shape = inputs[1] + out_shape = x_shape if x_shape.shape else cond_shape + + return [topi.math.identity(out_shape)] diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index f18b5397eefe..cdf0b8319087 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ b/tests/python/topi/python/test_topi_transform.py @@ -839,6 +839,7 @@ def test_reshape(): @tvm.testing.uses_gpu def test_where(): + verify_where(()) verify_where((1, 2, 3, 4)) From a22e84e8964ec73000a429343121a16bb446e349 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 23 Oct 2020 20:41:10 -0400 Subject: [PATCH 038/258] [CONDA] Revamp conda recipe. (#6732) * [CONDA] Revamp conda recipe. - Combines two packages into a single recipe. - Enable windows build. - Better packaging hash tag (use git string). 
* Address comment --- .gitignore | 2 +- conda/Dockerfile.template | 22 +---- conda/build_cpu.sh | 4 +- conda/build_cuda.sh | 3 +- conda/build_win.bat | 18 ++++ conda/recipe/bld.bat | 38 ++++++++ conda/{tvm-libs => recipe}/build.sh | 44 ++++++---- conda/{ => recipe}/conda_build_config.yaml | 2 +- conda/{ => recipe}/cross-linux.cmake | 0 conda/recipe/install_libtvm.bat | 22 +++++ .../build.sh => recipe/install_libtvm.sh} | 5 +- conda/recipe/install_tvm_python.bat | 20 +++++ .../install_tvm_python.sh} | 49 ++--------- conda/recipe/meta.yaml | 88 +++++++++++++++++++ ...der_cuda.py => render_cuda_dockerfiles.py} | 2 +- conda/tvm-libs/meta.yaml | 48 ---------- docker/Dockerfile.conda_cpu | 23 ++--- docker/Dockerfile.conda_cuda100 | 22 +---- docker/Dockerfile.conda_cuda90 | 22 +---- docker/bash.sh | 2 +- docker/build.sh | 2 +- docker/install/ubuntu_install_conda.sh | 30 +++++++ include/tvm/parser/source_map.h | 2 +- tests/lint/add_asf_header.py | 20 +++++ tests/lint/check_file_type.py | 1 + version.py | 4 +- 26 files changed, 296 insertions(+), 199 deletions(-) create mode 100644 conda/build_win.bat create mode 100644 conda/recipe/bld.bat rename conda/{tvm-libs => recipe}/build.sh (63%) mode change 100644 => 100755 rename conda/{ => recipe}/conda_build_config.yaml (99%) rename conda/{ => recipe}/cross-linux.cmake (100%) create mode 100644 conda/recipe/install_libtvm.bat rename conda/{tvm/build.sh => recipe/install_libtvm.sh} (88%) mode change 100644 => 100755 create mode 100644 conda/recipe/install_tvm_python.bat rename conda/{tvm/meta.yaml => recipe/install_tvm_python.sh} (50%) mode change 100644 => 100755 create mode 100644 conda/recipe/meta.yaml rename conda/{render_cuda.py => render_cuda_dockerfiles.py} (98%) delete mode 100644 conda/tvm-libs/meta.yaml create mode 100755 docker/install/ubuntu_install_conda.sh diff --git a/.gitignore b/.gitignore index 77c593ca2ab8..cdcf6780a3f2 100644 --- a/.gitignore +++ b/.gitignore @@ -24,7 +24,7 @@ var/ *.egg-info/ .installed.cfg *.egg - +.conda/ # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
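The "better packaging hash tag (use git string)" bullet above corresponds to the version.py
entry in this diffstat. A minimal sketch of the idea; the helper name and base version are
illustrative assumptions, not the actual version.py contents:

.. code-block:: python

    import subprocess


    def git_describe_version(base="0.8.dev0"):
        # Append the current commit hash as a PEP 440 local version label,
        # e.g. "0.8.dev0+g99cf26d", which is unique per commit, unlike a
        # date-based tag.
        commit = subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"], encoding="utf-8"
        ).strip()
        return "%s+g%s" % (base, commit)
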
diff --git a/conda/Dockerfile.template b/conda/Dockerfile.template index 1b5dc6fbef5e..342d532bbff5 100644 --- a/conda/Dockerfile.template +++ b/conda/Dockerfile.template @@ -17,30 +17,16 @@ FROM nvidia/cuda:{{ cuda_version }}-devel-ubuntu16.04 -RUN apt-get update && apt-get install -y --no-install-recommends \ - bzip2 curl sudo binutils && \ - rm -rf /var/lib/apt/lists/* +RUN apt-get update --fix-missing && apt-get install -y bzip2 wget sudo binutils git -RUN curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v{{ cudnn_short_version }}/cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz -O && \ +RUN wget -q http://developer.download.nvidia.com/compute/redist/cudnn/v{{ cudnn_short_version }}/cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz && \ tar --no-same-owner -xzf cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz -C /usr/local && \ rm cudnn-{{ cuda_version }}-linux-x64-v{{ cudnn_version }}.tgz && \ ldconfig - -RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda upgrade --all && \ - /opt/conda/bin/conda install conda-build conda-verify && \ - /opt/conda/bin/conda clean -ya - -RUN /opt/conda/bin/conda install --download-only cmake make zlib -RUN /opt/conda/bin/conda install --download-only -c numba llvmdev=8.0.0 +COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh +RUN bash /install/ubuntu_install_conda.sh ENV PATH /opt/conda/bin:$PATH ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV CONDA_BLD_PATH /tmp - -WORKDIR /workspace -RUN chmod -R a+w /workspace diff --git a/conda/build_cpu.sh b/conda/build_cpu.sh index 992b1a369b96..48b93b23dc0f 100755 --- a/conda/build_cpu.sh +++ b/conda/build_cpu.sh @@ -26,6 +26,4 @@ mkdir -p /tmp/.conda/pkgs touch /tmp/.conda/pkgs/urls.txt touch /tmp/.conda/environments.txt - -conda build --output-folder=conda/pkg -c numba conda/tvm-libs -conda build --output-folder=conda/pkg -m conda/conda_build_config.yaml conda/tvm +conda build --output-folder=conda/pkg conda/recipe diff --git a/conda/build_cuda.sh b/conda/build_cuda.sh index 2c9a20ae66ae..ec4a144852b7 100755 --- a/conda/build_cuda.sh +++ b/conda/build_cuda.sh @@ -26,5 +26,4 @@ mkdir -p /tmp/.conda/pkgs touch /tmp/.conda/pkgs/urls.txt touch /tmp/.conda/environments.txt - -conda build --output-folder=conda/pkg --variants "{cuda: True, cuda_version: ${CUDA_VERSION%.*}}" -c numba conda/tvm-libs +conda build --output-folder=conda/pkg --variants "{cuda: True, cuda_version: ${CUDA_VERSION%.*}}" conda/recipe diff --git a/conda/build_win.bat b/conda/build_win.bat new file mode 100644 index 000000000000..59d0d07340c7 --- /dev/null +++ b/conda/build_win.bat @@ -0,0 +1,18 @@ +:: Licensed to the Apache Software Foundation (ASF) under one +:: or more contributor license agreements. See the NOTICE file +:: distributed with this work for additional information +:: regarding copyright ownership. The ASF licenses this file +:: to you under the Apache License, Version 2.0 (the +:: "License"); you may not use this file except in compliance +:: with the License. 
You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, +:: software distributed under the License is distributed on an +:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +:: KIND, either express or implied. See the License for the +:: specific language governing permissions and limitations +:: under the License. + +conda build --output-folder=conda/pkg conda/recipe diff --git a/conda/recipe/bld.bat b/conda/recipe/bld.bat new file mode 100644 index 000000000000..9fc0469febc6 --- /dev/null +++ b/conda/recipe/bld.bat @@ -0,0 +1,38 @@ +:: Licensed to the Apache Software Foundation (ASF) under one +:: or more contributor license agreements. See the NOTICE file +:: distributed with this work for additional information +:: regarding copyright ownership. The ASF licenses this file +:: to you under the Apache License, Version 2.0 (the +:: "License"); you may not use this file except in compliance +:: with the License. You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, +:: software distributed under the License is distributed on an +:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +:: KIND, either express or implied. See the License for the +:: specific language governing permissions and limitations +:: under the License. +echo on + +rd /s /q build +mkdir build +cd build + +cmake ^ + -DCMAKE_PREFIX_PATH=%LIBRARY_PREFIX% ^ + -DCMAKE_INSTALL_PREFIX:PATH=%LIBRARY_PREFIX% ^ + -DUSE_LLVM=ON ^ + -DUSE_RPC=ON ^ + -DUSE_CPP_RPC=ON ^ + -DUSE_SORT=ON ^ + -DUSE_RANDOM=ON ^ + -DUSE_GRAPH_RUNTIME_DEBUG=ON ^ + -DINSTALL_DEV=ON ^ + %SRC_DIR% + +cd .. +:: defer build to install stage to avoid rebuild. +:: sometimes windows msbuild is not very good at file +:: caching and install will result in a rebuild diff --git a/conda/tvm-libs/build.sh b/conda/recipe/build.sh old mode 100644 new mode 100755 similarity index 63% rename from conda/tvm-libs/build.sh rename to conda/recipe/build.sh index 94919c60e779..c9e76314da31 --- a/conda/tvm-libs/build.sh +++ b/conda/recipe/build.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License.
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -19,35 +19,41 @@ set -e set -u +GPU_OPT="" +TOOLCHAIN_OPT="" + if [ "$target_platform" == "osx-64" ]; then # macOS 64 bits - METAL_OPT="-DUSE_METAL=ON" - TOOLCHAIN_OPT="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.11" -else - METAL_OPT="" - if [ "$target_platform" == "linux-64" ]; then - # Linux 64 bits - TOOLCHAIN_OPT="-DCMAKE_TOOLCHAIN_FILE=${RECIPE_DIR}/../cross-linux.cmake" - else - # Windows (or 32 bits, which we don't support) - TOOLCHAIN_OPT="" - fi + GPU_OPT="-DUSE_METAL=ON" +elif [ "$target_platform" == "linux-64" ]; then + TOOLCHAIN_OPT="-DCMAKE_TOOLCHAIN_FILE=${RECIPE_DIR}/cross-linux.cmake" fi # When cuda is not set, we default to False cuda=${cuda:-False} if [ "$cuda" == "True" ]; then - CUDA_OPT="-DUSE_CUDA=ON -DUSE_CUBLAS=ON -DUSE_CUDNN=ON" + GPU_OPT="-DUSE_CUDA=ON -DUSE_CUBLAS=ON -DUSE_CUDNN=ON" TOOLCHAIN_OPT="" -else - CUDA_OPT="" fi +# remove touched cmake config +rm -f config.cmake rm -rf build || true mkdir -p build cd build -cmake $METAL_OPT $CUDA_OPT -DUSE_LLVM=$PREFIX/bin/llvm-config -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" $TOOLCHAIN_OPT .. -make -j${CPU_COUNT} VERBOSE=1 -make install + +cmake -DCMAKE_INSTALL_PREFIX="${PREFIX}" \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_RPC=ON \ + -DUSE_CPP_RPC=OFF \ + -DUSE_SORT=ON \ + -DUSE_RANDOM=ON \ + -DUSE_GRAPH_RUNTIME_DEBUG=ON \ + -DUSE_LLVM=ON \ + -DINSTALL_DEV=ON \ + ${GPU_OPT} ${TOOLCHAIN_OPT} \ + ${SRC_DIR} + +make -j${CPU_COUNT} cd .. diff --git a/conda/conda_build_config.yaml b/conda/recipe/conda_build_config.yaml similarity index 99% rename from conda/conda_build_config.yaml rename to conda/recipe/conda_build_config.yaml index 79d6bfe3c175..938d294da556 100644 --- a/conda/conda_build_config.yaml +++ b/conda/recipe/conda_build_config.yaml @@ -16,9 +16,9 @@ # under the License. python: - - 3.5 - 3.6 - 3.7 + - 3.8 cuda: - False diff --git a/conda/cross-linux.cmake b/conda/recipe/cross-linux.cmake similarity index 100% rename from conda/cross-linux.cmake rename to conda/recipe/cross-linux.cmake diff --git a/conda/recipe/install_libtvm.bat b/conda/recipe/install_libtvm.bat new file mode 100644 index 000000000000..f423c521f84e --- /dev/null +++ b/conda/recipe/install_libtvm.bat @@ -0,0 +1,22 @@ +:: Licensed to the Apache Software Foundation (ASF) under one +:: or more contributor license agreements. See the NOTICE file +:: distributed with this work for additional information +:: regarding copyright ownership. The ASF licenses this file +:: to you under the Apache License, Version 2.0 (the +:: "License"); you may not use this file except in compliance +:: with the License. You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, +:: software distributed under the License is distributed on an +:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +:: KIND, either express or implied. See the License for the +:: specific language governing permissions and limitations +:: under the License. 
+ +cmake --build build --config Release --target install + +:: Copy files into library bin so that they can be found +cp %LIBRARY_LIB%\tvm.dll %LIBRARY_BIN%\tvm.dll +cp %LIBRARY_LIB%\tvm_runtime.dll %LIBRARY_BIN%\tvm_runtime.dll diff --git a/conda/tvm/build.sh b/conda/recipe/install_libtvm.sh old mode 100644 new mode 100755 similarity index 88% rename from conda/tvm/build.sh rename to conda/recipe/install_libtvm.sh index 9bdbe0a6f509..b236c7dc2720 --- a/conda/tvm/build.sh +++ b/conda/recipe/install_libtvm.sh @@ -19,6 +19,5 @@ set -e set -u -cd python -$PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt -cd .. +cd build +make install diff --git a/conda/recipe/install_tvm_python.bat b/conda/recipe/install_tvm_python.bat new file mode 100644 index 000000000000..96187468c2b2 --- /dev/null +++ b/conda/recipe/install_tvm_python.bat @@ -0,0 +1,20 @@ +:: Licensed to the Apache Software Foundation (ASF) under one +:: or more contributor license agreements. See the NOTICE file +:: distributed with this work for additional information +:: regarding copyright ownership. The ASF licenses this file +:: to you under the Apache License, Version 2.0 (the +:: "License"); you may not use this file except in compliance +:: with the License. You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, +:: software distributed under the License is distributed on an +:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +:: KIND, either express or implied. See the License for the +:: specific language governing permissions and limitations +:: under the License. +echo on + +cd %SRC_DIR%\python +%PYTHON% setup.py install --single-version-externally-managed --record=%SRC_DIR%\record.txt diff --git a/conda/tvm/meta.yaml b/conda/recipe/install_tvm_python.sh old mode 100644 new mode 100755 similarity index 50% rename from conda/tvm/meta.yaml rename to conda/recipe/install_tvm_python.sh index 9e8f94789394..2c721c64a156 --- a/conda/tvm/meta.yaml +++ b/conda/recipe/install_tvm_python.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,48 +16,8 @@ # specific language governing permissions and limitations # under the License. -{% set version = "0.8.dev0" %} +set -e +set -u -package: - name: tvm - version: {{ version }} - -source: - path: ../.. 
- -build: - number: 0 - -requirements: - build: - - {{ compiler('cxx') }} - host: - - python {{ python }} - - cython - - numpy - - setuptools - - decorator - - tvm-libs {{ version }} - run: - - python {{ python }} - - {{ pin_compatible('numpy') }} - - decorator - - tvm-libs {{ version }} - - psutil - -test: - imports: - - tvm - requires: - - pytest - - scipy - source_files: - - tests/python - commands: - - python -m pytest -v tests/python/integration - -about: - home: https://github.com/apache/incubator-tvm - license: Apache-2.0 - license_family: Apache - summary: a low level domain specific language for compiling tensor computation pipelines +cd ${SRC_DIR}/python +${PYTHON} setup.py install --single-version-externally-managed --record=/tmp/record.txt diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml new file mode 100644 index 000000000000..67ba7fec1869 --- /dev/null +++ b/conda/recipe/meta.yaml @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +{% set version = '0.8.dev0' %} +{% set pkg_name = 'tvm' %} +{% set cuda_tag = cuda_version | replace('.', '') %} # [cuda] +{% set pkg_name = pkg_name + '-cu' + cuda_tag %} # [cuda] +{% set build_tag = environ.get('GIT_BUILD_STR', 'unknown') %} +{% set build_tag = build_tag + '_h' + PKG_HASH + '_' + PKG_BUILDNUM %} + +package: + name: {{ pkg_name }}-package + version: {{ version }} + +source: + path: '../..' 
+ +build: + number: 0 + include_recipe: False + missing_dso_whitelist: + - "*libcuda.*" # [linux] + +requirements: + build: + # The anaconda compilers for OS X are old and annoying + # so we rely on the platform ones for now + - {{ compiler('cxx') }} # [not osx] + - cmake + - make # [not win] + host: + - zlib + - llvmdev ==10.0.0 + +outputs: + - name: {{ pkg_name }}-libs + script: install_libtvm.bat # [win] + script: install_libtvm.sh # [not win] + string: {{ build_tag }} + requirements: + build: + - {{ compiler('cxx') }} + - cmake + - git + - make # [not win] + host: + - zlib + - llvmdev ==10.0.0 + - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }} # [cuda] + - {{ pin_compatible('cudnn', lower_bound='7.6.0', max_pin='x') }} # [cuda] + run: + - llvmdev ==10.0.0 + - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }} # [cuda] + - {{ pin_compatible('cudnn', lower_bound='7.6.0', max_pin='x') }} # [cuda] + + - name: {{ pkg_name }} + script: install_tvm_python.sh # [not win] + script: install_tvm_python.bat # [win] + string: {{ build_tag }}_py{{ PY_VER | replace('.', '')}} + requirements: + host: + - python + - setuptools + run: + - python + - decorator + - psutil + - {{ pin_compatible('numpy') }} + - {{ pin_subpackage(pkg_name + '-libs', exact=True) }} + +about: + home: https://tvm.apache.org + license: Apache2 + summary: An End to End Deep Learning Compiler Stack for CPUs, GPUs and accelerators. diff --git a/conda/render_cuda.py b/conda/render_cuda_dockerfiles.py similarity index 98% rename from conda/render_cuda.py rename to conda/render_cuda_dockerfiles.py index efd616946314..d9d32f05fb5e 100644 --- a/conda/render_cuda.py +++ b/conda/render_cuda_dockerfiles.py @@ -48,7 +48,7 @@ def render_dockerfile(version): ) fname = os.path.join(condadir, "../docker/Dockerfile.conda_cuda" + version.replace(".", "")) with open(fname, "w") as f: - f.write(txt) + f.write(txt + "\n") return fname diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml deleted file mode 100644 index f151048e445b..000000000000 --- a/conda/tvm-libs/meta.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -{% set version = "0.8.dev0" %} - -package: - name: tvm-libs - version: {{ version }} - -source: - path: ../..
- -build: - number: 0 - string: cuda{{ cuda_version | replace('.', '') }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda] - -requirements: - build: - # The anaconda compilers for OS X are old an annoying - # so we rely on the platform ones for now - - {{ compiler('cxx') }} # [linux] - - cmake - - make - host: - - llvmdev ==8.0.0 - - zlib # [linux] - run: - - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }} # [cuda] - - {{ pin_compatible('cudnn', lower_bound='7.6.0', max_pin='x') }} # [cuda] - -about: - home: https://github.com/apache/incubator-tvm - license: Apache2 - summary: a low level domain specific language for compiling tensor computation pipelines \ No newline at end of file diff --git a/docker/Dockerfile.conda_cpu b/docker/Dockerfile.conda_cpu index 4e0c35a26e55..d2779afbdaf3 100644 --- a/docker/Dockerfile.conda_cpu +++ b/docker/Dockerfile.conda_cpu @@ -17,25 +17,12 @@ FROM ubuntu:16.04 -RUN apt-get update && apt-get install -y bzip2 curl sudo binutils && rm -rf /var/lib/apt/lists/* +RUN apt-get update --fix-missing && apt-get install -y bzip2 wget sudo binutils git -RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda upgrade --all && \ - /opt/conda/bin/conda install conda-build conda-verify && \ - /opt/conda/bin/conda clean -ya - -# Cache some of the packages for the builds -RUN /opt/conda/bin/conda install --download-only cmake make zlib && \ - /opt/conda/bin/conda install --download-only -c numba llvmdev=8.0.0 && \ - /opt/conda/bin/conda create -n py35 --download-only pytest scipy numpy=1.11 cython decorator python=3.5 && \ - /opt/conda/bin/conda create -n py36 --download-only pytest scipy numpy=1.11 cython decorator python=3.6 && \ - /opt/conda/bin/conda create -n py37 --download-only pytest scipy numpy=1.11 cython decorator python=3.7 +COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh +RUN bash /install/ubuntu_install_conda.sh ENV PATH /opt/conda/bin:$PATH ENV CONDA_BLD_PATH /tmp - -WORKDIR /workspace -RUN chmod -R a+w /workspace +ENV CONDA_PKGS_DIRS /workspace/.conda/pkgs +ENV CONDA_ENVS_DIRS /workspace/.conda/env diff --git a/docker/Dockerfile.conda_cuda100 b/docker/Dockerfile.conda_cuda100 index d6e1cddbfd37..7705c8548b52 100644 --- a/docker/Dockerfile.conda_cuda100 +++ b/docker/Dockerfile.conda_cuda100 @@ -17,30 +17,16 @@ FROM nvidia/cuda:10.0-devel-ubuntu16.04 -RUN apt-get update && apt-get install -y --no-install-recommends \ - bzip2 curl sudo binutils && \ - rm -rf /var/lib/apt/lists/* +RUN apt-get update --fix-missing && apt-get install -y bzip2 wget sudo binutils git -RUN curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.0/cudnn-10.0-linux-x64-v7.6.0.64.tgz -O && \ +RUN wget -q http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.0/cudnn-10.0-linux-x64-v7.6.0.64.tgz && \ tar --no-same-owner -xzf cudnn-10.0-linux-x64-v7.6.0.64.tgz -C /usr/local && \ rm cudnn-10.0-linux-x64-v7.6.0.64.tgz && \ ldconfig - -RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda upgrade --all && \ - /opt/conda/bin/conda install conda-build conda-verify && \ - /opt/conda/bin/conda clean -ya - -RUN /opt/conda/bin/conda install --download-only cmake make zlib -RUN 
/opt/conda/bin/conda install --download-only -c numba llvmdev=8.0.0 +COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh +RUN bash /install/ubuntu_install_conda.sh ENV PATH /opt/conda/bin:$PATH ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV CONDA_BLD_PATH /tmp - -WORKDIR /workspace -RUN chmod -R a+w /workspace \ No newline at end of file diff --git a/docker/Dockerfile.conda_cuda90 b/docker/Dockerfile.conda_cuda90 index f55aa1bf2e12..372167438141 100644 --- a/docker/Dockerfile.conda_cuda90 +++ b/docker/Dockerfile.conda_cuda90 @@ -17,30 +17,16 @@ FROM nvidia/cuda:9.0-devel-ubuntu16.04 -RUN apt-get update && apt-get install -y --no-install-recommends \ - bzip2 curl sudo binutils && \ - rm -rf /var/lib/apt/lists/* +RUN apt-get update --fix-missing && apt-get install -y bzip2 wget sudo binutils git -RUN curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.0/cudnn-9.0-linux-x64-v7.6.0.64.tgz -O && \ +RUN wget -q http://developer.download.nvidia.com/compute/redist/cudnn/v7.6.0/cudnn-9.0-linux-x64-v7.6.0.64.tgz && \ tar --no-same-owner -xzf cudnn-9.0-linux-x64-v7.6.0.64.tgz -C /usr/local && \ rm cudnn-9.0-linux-x64-v7.6.0.64.tgz && \ ldconfig - -RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda upgrade --all && \ - /opt/conda/bin/conda install conda-build conda-verify && \ - /opt/conda/bin/conda clean -ya - -RUN /opt/conda/bin/conda install --download-only cmake make zlib -RUN /opt/conda/bin/conda install --download-only -c numba llvmdev=8.0.0 +COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh +RUN bash /install/ubuntu_install_conda.sh ENV PATH /opt/conda/bin:$PATH ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV CONDA_BLD_PATH /tmp - -WORKDIR /workspace -RUN chmod -R a+w /workspace \ No newline at end of file diff --git a/docker/bash.sh b/docker/bash.sh index d2424f170219..a87701afb918 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -70,7 +70,7 @@ else CUDA_ENV="" fi -if [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* ]]; then +if [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* || "${DOCKER_IMAGE_NAME}" == *"cuda"* ]]; then if ! type "nvidia-docker" 1> /dev/null 2> /dev/null then DOCKER_BINARY="docker" diff --git a/docker/build.sh b/docker/build.sh index 43f0a08700a4..7d9145832000 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -91,7 +91,7 @@ if [ "$#" -lt 1 ] || [ ! -e "${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" ]; then fi # Use nvidia-docker if the container is GPU. -if [[ "${CONTAINER_TYPE}" == *"gpu"* ]]; then +if [[ "${CONTAINER_TYPE}" == *"gpu"* || "${CONTAINER_TYPE}" == *"cuda"* ]]; then if ! type "nvidia-docker" 1> /dev/null 2> /dev/null then DOCKER_BINARY="docker" diff --git a/docker/install/ubuntu_install_conda.sh b/docker/install/ubuntu_install_conda.sh new file mode 100755 index 000000000000..ef059ce42aa0 --- /dev/null +++ b/docker/install/ubuntu_install_conda.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +cd /tmp && wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh +chmod +x Miniconda3-latest-Linux-x86_64.sh +/tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda +rm /tmp/Miniconda3-latest-Linux-x86_64.sh +/opt/conda/bin/conda upgrade --all +/opt/conda/bin/conda clean -ya +/opt/conda/bin/conda install conda-build conda-verify +chmod -R a+w /opt/conda/ diff --git a/include/tvm/parser/source_map.h b/include/tvm/parser/source_map.h index 5595574265c6..1153deb95dc3 100644 --- a/include/tvm/parser/source_map.h +++ b/include/tvm/parser/source_map.h @@ -101,7 +101,7 @@ class SourceMap : public ObjectRef { TVM_DLL SourceMap(std::initializer_list<std::pair<SourceName, Source>> source_map) : SourceMap(Map<SourceName, Source>(source_map)) {} - TVM_DLL SourceMap() : SourceMap({}) {} + TVM_DLL SourceMap() : SourceMap(Map<SourceName, Source>()) {} TVM_DLL static SourceMap Global(); diff --git a/tests/lint/add_asf_header.py b/tests/lint/add_asf_header.py index a83373cea078..477ef2db4390 100644 --- a/tests/lint/add_asf_header.py +++ b/tests/lint/add_asf_header.py @@ -115,6 +115,25 @@ // under the License. """.strip() +header_cmdstyle = """ +:: Licensed to the Apache Software Foundation (ASF) under one +:: or more contributor license agreements. See the NOTICE file +:: distributed with this work for additional information +:: regarding copyright ownership. The ASF licenses this file +:: to you under the Apache License, Version 2.0 (the +:: "License"); you may not use this file except in compliance +:: with the License. You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, +:: software distributed under the License is distributed on an +:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +:: KIND, either express or implied. See the License for the +:: specific language governing permissions and limitations +:: under the License. +""".strip() + FMT_MAP = { "sh": header_pystyle, "cc": header_cstyle, @@ -141,6 +160,7 @@ "plist": header_mdstyle, "xcworkspacedata": header_mdstyle, "html": header_mdstyle, + "bat": header_cmdstyle, } diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index 8a90bb3745ca..7d3e95d5af13 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -44,6 +44,7 @@ "pyd", "pyx", "cu", + "bat", # relay text format "rly", # configurations diff --git a/version.py b/version.py index 6554343ac7c6..0338d13661e0 100644 --- a/version.py +++ b/version.py @@ -75,10 +75,10 @@ def main(): __version__, ) # conda - for path in ["tvm", "tvm-libs"]: + for path in ["recipe"]: update( os.path.join(proj_root, "conda", path, "meta.yaml"), - '(?<=version = ")[.0-9a-z]+', + "(?<=version = ')[.0-9a-z]+", __version__, ) From 323ecd31a65e1525ecf2a28729402ac297c94231 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 23 Oct 2020 20:47:18 -0500 Subject: [PATCH 039/258] [Hexagon] Use nullptr instead of 0 in hexagon_device_sim.cc (#6718) Passing 0 produces compilation warnings.
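For readers outside the simulator code, a minimal self-contained C++ sketch of the warning class this refers to; the SendMsg signature below is a simplified stand-in for the simulator API, and the flag named in the comments is the usual GCC/Clang one rather than anything specific to this patch:

    // Stand-in declaration: what matters is the void* parameter.
    static void SendMsg(int id, void* payload, bool sync) {
      (void)id; (void)payload; (void)sync;  // no-op body for illustration
    }

    int main() {
      SendMsg(1, 0, true);        // literal 0 as a null pointer: compiles, but
                                  // -Wzero-as-null-pointer-constant flags it
      SendMsg(1, nullptr, true);  // nullptr is type-safe and warning-free
      return 0;
    }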
--- src/runtime/hexagon/sim/hexagon_device_sim.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runtime/hexagon/sim/hexagon_device_sim.cc b/src/runtime/hexagon/sim/hexagon_device_sim.cc index 477da09c1c65..9ff5a0421d51 100644 --- a/src/runtime/hexagon/sim/hexagon_device_sim.cc +++ b/src/runtime/hexagon/sim/hexagon_device_sim.cc @@ -623,7 +623,7 @@ void HexagonSimulator::Free(void* ptr) { LOG(INFO) << "HexagonSimulator::Free(ptr=" << std::hex << ptr << std::dec << ')'; if (task_queuing_) { Message mf = {kFlush, 0, 0}; - SendMsg(mf, 0, true); + SendMsg(mf, nullptr, true); } Message m = {kFree, sizeof(MsgPointer), 0u}; MsgPointer mp = {p2va(ptr)}; @@ -661,7 +661,7 @@ void HexagonSimulator::CopyDeviceToHost(void* host_dst, const void* src, unsigne << ", len=" << len << ')'; if (task_queuing_) { Message mf = {kFlush, 0, 0}; - SendMsg(mf, 0, true); + SendMsg(mf, nullptr, true); } CopyFromV(host_dst, p2va(src), len); } @@ -739,7 +739,7 @@ void HexagonSimulator::Call(void* func, uint32_t* scalar, unsigned sc_num, uint3 if (!task_queuing_) { Message mf = {kFlush, 0, 0}; - SendMsg(mf, 0, true); + SendMsg(mf, nullptr, true); } std::vector rv(m.len); From 0e3a2f25fc5b3228f0b3d7d3056a8863ad3ef46e Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Sat, 24 Oct 2020 00:15:38 -0700 Subject: [PATCH 040/258] [Docker] Turn on Rust docs and MxNet based ResNet (#6640) * Enable ResNet and Rust docs * Tweak * Format * Fix issue with overwriting --- python/tvm/contrib/download.py | 4 ++-- rust/Cargo.toml | 1 + rust/tvm/examples/resnet/build.rs | 10 ++++------ rust/tvm/examples/resnet/src/build_resnet.py | 5 +++-- tests/scripts/task_python_docs.sh | 11 ++++------- tests/scripts/task_rust.sh | 6 ++---- 6 files changed, 16 insertions(+), 21 deletions(-) diff --git a/python/tvm/contrib/download.py b/python/tvm/contrib/download.py index 96030245b9e5..a521c8cac017 100644 --- a/python/tvm/contrib/download.py +++ b/python/tvm/contrib/download.py @@ -126,7 +126,7 @@ def _download_progress(count, block_size, total_size): os.makedirs(TEST_DATA_ROOT_PATH, exist_ok=True) -def download_testdata(url, relpath, module=None): +def download_testdata(url, relpath, module=None, overwrite=False): """Downloads the test data from the internet. 
Parameters @@ -155,5 +155,5 @@ def download_testdata(url, relpath, module=None): else: raise ValueError("Unsupported module: " + module) abspath = os.path.join(TEST_DATA_ROOT_PATH, module_path, relpath) - download(url, abspath, overwrite=False, size_compare=False) + download(url, abspath, overwrite=overwrite, size_compare=False) return abspath diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 9935ce7c8b9f..28312a5e73dc 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -23,6 +23,7 @@ members = [ "tvm", "tvm/tests/basics", "tvm/tests/callback", + "tvm/examples/resnet", "tvm-graph-rt", "tvm-graph-rt/tests/test_tvm_basic", "tvm-graph-rt/tests/test_tvm_dso", diff --git a/rust/tvm/examples/resnet/build.rs b/rust/tvm/examples/resnet/build.rs index 1e5d8a98736d..9bf7d867e50f 100644 --- a/rust/tvm/examples/resnet/build.rs +++ b/rust/tvm/examples/resnet/build.rs @@ -21,9 +21,10 @@ use anyhow::{Context, Result}; use std::{io::Write, path::Path, process::Command}; fn main() -> Result<()> { + let out_dir = std::env::var("CARGO_MANIFEST_DIR")?; let output = Command::new("python3") .arg(concat!(env!("CARGO_MANIFEST_DIR"), "/src/build_resnet.py")) - .arg(&format!("--build-dir={}", env!("CARGO_MANIFEST_DIR"))) + .arg(&format!("--build-dir={}", out_dir)) .output() .with_context(|| anyhow::anyhow!("failed to run python3"))?; if !output.status.success() { @@ -33,7 +34,7 @@ fn main() -> Result<()> { panic!("Failed to execute build script"); } assert!( - Path::new(&format!("{}/deploy_lib.o", env!("CARGO_MANIFEST_DIR"))).exists(), + Path::new(&format!("{}/deploy_lib.o", out_dir)).exists(), "Could not prepare demo: {}", String::from_utf8(output.stderr) .unwrap() @@ -42,10 +43,7 @@ fn main() -> Result<()> { .last() .unwrap_or("") ); - println!( - "cargo:rustc-link-search=native={}", - env!("CARGO_MANIFEST_DIR") - ); + println!("cargo:rustc-link-search=native={}", out_dir); Ok(()) } diff --git a/rust/tvm/examples/resnet/src/build_resnet.py b/rust/tvm/examples/resnet/src/build_resnet.py index bc100fed0df1..03ac611a191a 100644 --- a/rust/tvm/examples/resnet/src/build_resnet.py +++ b/rust/tvm/examples/resnet/src/build_resnet.py @@ -104,10 +104,11 @@ def download_img_labels(): ] ) synset_name = "synset.txt" - synset_path = download_testdata(synset_url, synset_name, module="data") + synset_path = download_testdata(synset_url, synset_name + ".raw", module="data", overwrite=True) with open(synset_path) as fin: - synset = eval(fin.read()) + data = fin.read() + synset = eval(data) with open(synset_name, "w") as f: for key in synset: diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index e279b908329d..cbaffa2b37e4 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -73,12 +73,10 @@ npm install npm run typedoc cd .. -# TODO(@jroesch): add Rust to CI container -# see: https://github.com/apache/incubator-tvm/issues/6628 # Rust doc -# cd rust -# cargo doc --workspace --no-deps -# cd .. +cd rust +cargo doc --workspace --no-deps +cd .. # Prepare the doc dir rm -rf _docs @@ -87,8 +85,7 @@ rm -f _docs/.buildinfo mkdir -p _docs/api mv docs/doxygen/html _docs/api/doxygen mv jvm/core/target/site/apidocs _docs/api/javadoc -# See above TODO -# mv rust/target/doc _docs/api/rust +mv rust/target/doc _docs/api/rust mv web/dist/docs _docs/api/typedoc echo "Start creating the docs tarball.." 
diff --git a/tests/scripts/task_rust.sh b/tests/scripts/task_rust.sh index 18361feb03ee..d60999c3f3d0 100755 --- a/tests/scripts/task_rust.sh +++ b/tests/scripts/task_rust.sh @@ -110,8 +110,6 @@ cargo run --bin array cargo run --bin string cd - -# TODO(@jroesch): we need to renable MxNet in ci-cpu image -# https://github.com/apache/incubator-tvm/pull/6563 -# cd examples/resnet -# cargo build +cd examples/resnet +cargo run cd - From 4e5ce770a26ea15b1f259ad989f1e1feafae4e00 Mon Sep 17 00:00:00 2001 From: Lily Orth-Smith Date: Sat, 24 Oct 2020 00:23:50 -0700 Subject: [PATCH 041/258] [RELAY] Refactor FoldConstant to skip TNonComputationalOps (#6720) * add TNonComputational to qnn ops and change FoldConstant * remove comments * check if op in nonComputational map * forgot to mark device_copy op as TNonComputational * hacky fix to fuseops pass * fix typo * manually skip device_copy in fold_constant * Update src/relay/transforms/fold_constant.cc Co-authored-by: Junru Shao Co-authored-by: Junru Shao --- src/relay/qnn/op/concatenate.cc | 1 + src/relay/qnn/op/convolution.cc | 1 + src/relay/qnn/op/dense.cc | 1 + src/relay/qnn/op/dequantize.cc | 1 + src/relay/qnn/op/op_common.h | 1 + src/relay/qnn/op/quantize.cc | 1 + src/relay/qnn/op/requantize.cc | 1 + src/relay/transforms/fold_constant.cc | 9 ++++++--- 8 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/relay/qnn/op/concatenate.cc b/src/relay/qnn/op/concatenate.cc index 29ecf451767e..88d2ecc9b45b 100644 --- a/src/relay/qnn/op/concatenate.cc +++ b/src/relay/qnn/op/concatenate.cc @@ -207,6 +207,7 @@ RELAY_REGISTER_OP("qnn.concatenate") "The quantization zero_point of the output tensor.") .set_support_level(11) .add_type_rel("QnnConcatenate", QnnConcatenateRel) + .set_attr<TNonComputational>("TNonComputational", true) .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", ConcatenateQnnCanonicalize) .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnConcatenateLayout); diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc index b2b6b092fd62..73ee4561907d 100644 --- a/src/relay/qnn/op/convolution.cc +++ b/src/relay/qnn/op/convolution.cc @@ -733,6 +733,7 @@ operator to understand how to scale back the int32 output to (u)int8.
"The quantization zero_point of the weight tensor.") .set_support_level(11) .add_type_rel("QnnConv2D", QnnConv2DRel) + .set_attr<TNonComputational>("TNonComputational", true) .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnConv2DCanonicalize) .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnConvInferCorrectLayout); diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index 3cfc418868ea..e1cbfaf98df1 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -189,6 +189,7 @@ RELAY_REGISTER_OP("qnn.dense") "The quantization zero_point of the weight tensor.") .set_support_level(11) .add_type_rel("QDense", QnnDenseRel) + .set_attr<TNonComputational>("TNonComputational", true) .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnDenseCanonicalize); TVM_REGISTER_GLOBAL("relay.qnn.op._make.dense").set_body_typed(MakeQuantizedDense); diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc index f0c139c2a5e3..0a81f3fe4fdb 100644 --- a/src/relay/qnn/op/dequantize.cc +++ b/src/relay/qnn/op/dequantize.cc @@ -136,6 +136,7 @@ The input is always quantized (int8, uint8) and will be converted to float32 giv .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.") .set_support_level(11) .add_type_rel("Dequantize", DequantizeRel) + .set_attr<TNonComputational>("TNonComputational", true) .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", DequantizeQnnCanonicalize); TVM_REGISTER_GLOBAL("relay.qnn.op._make.dequantize").set_body_typed(MakeDequantize); diff --git a/src/relay/qnn/op/op_common.h b/src/relay/qnn/op/op_common.h index e99c11b6f02b..3ca8f64ac9d9 100644 --- a/src/relay/qnn/op/op_common.h +++ b/src/relay/qnn/op/op_common.h @@ -215,6 +215,7 @@ static inline bool QnnBroadcastRel(const Array<Type>& types, int num_inputs, con .add_argument("output_scale", "Tensor", "The scale of the output tensor.") \ .add_argument("output_zero_point", "Tensor", "The zero_point of the output tensor.") \ .add_type_rel("QnnBroadcast", QnnBroadcastRel) \ + .set_attr<TNonComputational>("TNonComputational", true) \ .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnBinaryBroadcastLayout) } // namespace qnn diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc index 1b5cb5e2b55b..07847916fae7 100644 --- a/src/relay/qnn/op/quantize.cc +++ b/src/relay/qnn/op/quantize.cc @@ -150,6 +150,7 @@ scale and zero point. "The quantization zero_point of the output tensor.") .set_support_level(11) .add_type_rel("Quantize", QuantizeRel) + .set_attr<TNonComputational>("TNonComputational", true) .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QuantizeQnnCanonicalize); TVM_REGISTER_GLOBAL("relay.qnn.op._make.quantize").set_body_typed(MakeQuantize); diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index ea878557d98e..3572a3980ced 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -324,6 +324,7 @@ Q_output = zp_output + (scale_input)/(scale_output) * (Q_input - zp_input) "The quantization zero_point of the output tensor.") .set_support_level(11) .add_type_rel("Requantize", RequantizeRel) + .set_attr<TNonComputational>("TNonComputational", true) .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", RequantizeQnnCanonicalize) .set_attr<FInferCorrectLayout>("FInferCorrectLayout", RequantizeInferCorrectLayout); diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index 1de690d91036..4a739ddba40f 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -151,9 +151,12 @@ class ConstantFolder : public MixedModeMutator { } // We should think about potentially constant evaluation over these ops too.
- if (call->op == invoke_tvm_op_ || call->op == shape_func_op_ || call->op == alloc_tensor_op_ || - call->op == alloc_storage_op_ || call->op == device_copy_op_) { - return GetRef<Call>(call); + static auto fnoncomputational = Op::GetAttrMap<TNonComputational>("TNonComputational"); + if (const auto* call_node = call->op.as<OpNode>()) { + Op op = GetRef<Op>(call_node); + if ((fnoncomputational.count(op) && fnoncomputational[op]) || (call->op == device_copy_op_)) { + return GetRef<Call>(call); + } } bool all_const_args = true; From f2dc4f34c5b749b82b7356c9b0a290d2e9339713 Mon Sep 17 00:00:00 2001 From: Ramana Radhakrishnan Date: Sat, 24 Oct 2020 08:27:30 +0100 Subject: [PATCH 042/258] Add pytest-xdist and pytest-profiling to the base installation packages. (#6736) For building and testing some small portions of the python testsuite, I've been playing off and on with xdist and pytest-profiling. We know it's not safe for the entirety of CI yet, but this could enable smaller parts of pipelines that folks drive via the common scripts to be parallelized, or indeed profiled, for more insight into where time is spent in building and testing TVM. --- docker/install/ubuntu_install_python_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index 2ed14c273678..c8d9856b6de0 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -21,4 +21,4 @@ set -u set -o pipefail # install libraries for python package on ubuntu -pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest mypy orderedset attrs requests Pillow packaging cloudpickle +pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest pytest-xdist pytest-profiling mypy orderedset attrs requests Pillow packaging cloudpickle From a2816b888536b75785aa54c2e8d45c9a3705813d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 24 Oct 2020 09:56:31 -0400 Subject: [PATCH 043/258] [FFI][BUGFIX] Fix memory leak when Packed callback argument is NDArray (#6744) * [FFI][BUGFIX] Fix leak when Packed callback arg is ndarray.
Co-authored-by: Matthew Brookhart * Fix for rust ts and jvm * Update rust/tvm-rt/src/to_function.rs Co-authored-by: Junru Shao Co-authored-by: Matthew Brookhart Co-authored-by: Junru Shao --- .../src/main/native/org_apache_tvm_native_c_api.cc | 3 ++- python/tvm/_ffi/_ctypes/packed_func.py | 4 +++- python/tvm/_ffi/_cython/ndarray.pxi | 4 ++++ python/tvm/_ffi/_cython/packed_func.pxi | 1 + rust/tvm-rt/src/to_function.rs | 2 ++ tests/python/unittest/test_runtime_packed_func.py | 12 ++++++++++++ web/src/runtime.ts | 1 + 7 files changed, 25 insertions(+), 2 deletions(-) diff --git a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc index 6fc316ca8739..e3ea4b9c3766 100644 --- a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc +++ b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc @@ -243,7 +243,8 @@ extern "C" int funcInvokeCallback(TVMValue* args, int* typeCodes, int numArgs, TVMValue arg = args[i]; int tcode = typeCodes[i]; if (tcode == kTVMObjectHandle || tcode == kTVMPackedFuncHandle || - tcode == kTVMObjectRValueRefArg || tcode == kTVMModuleHandle) { + tcode == kTVMObjectRValueRefArg || tcode == kTVMModuleHandle || + tcode == kTVMNDArrayHandle) { TVMCbArgToReturn(&arg, &tcode); } jobject jarg = tvmRetValueToJava(env, arg, tcode); diff --git a/python/tvm/_ffi/_ctypes/packed_func.py b/python/tvm/_ffi/_ctypes/packed_func.py index acf9776d9b8b..fd82b263e2dd 100644 --- a/python/tvm/_ffi/_ctypes/packed_func.py +++ b/python/tvm/_ffi/_ctypes/packed_func.py @@ -306,7 +306,9 @@ def _get_global_func(name, allow_missing=False): _return_module, ArgTypeCode.MODULE_HANDLE ) C_TO_PY_ARG_SWITCH[ArgTypeCode.DLTENSOR_HANDLE] = lambda x: _make_array(x.v_handle, True, False) -C_TO_PY_ARG_SWITCH[ArgTypeCode.NDARRAY_HANDLE] = lambda x: _make_array(x.v_handle, False, True) +C_TO_PY_ARG_SWITCH[ArgTypeCode.NDARRAY_HANDLE] = _wrap_arg_func( + lambda x: _make_array(x.v_handle, False, True), ArgTypeCode.NDARRAY_HANDLE +) _CLASS_MODULE = None _CLASS_PACKED_FUNC = None diff --git a/python/tvm/_ffi/_cython/ndarray.pxi b/python/tvm/_ffi/_cython/ndarray.pxi index 9fd3aa43841f..e671ef626205 100644 --- a/python/tvm/_ffi/_cython/ndarray.pxi +++ b/python/tvm/_ffi/_cython/ndarray.pxi @@ -68,6 +68,10 @@ cdef class NDArrayBase: def __set__(self, value): self._set_handle(value) + property is_view: + def __get__(self): + return self.c_is_view != 0 + @property def shape(self): """Shape of this array""" diff --git a/python/tvm/_ffi/_cython/packed_func.pxi b/python/tvm/_ffi/_cython/packed_func.pxi index 16b146119f0d..00585659ab76 100644 --- a/python/tvm/_ffi/_cython/packed_func.pxi +++ b/python/tvm/_ffi/_cython/packed_func.pxi @@ -43,6 +43,7 @@ cdef int tvm_callback(TVMValue* args, if (tcode == kTVMObjectHandle or tcode == kTVMPackedFuncHandle or tcode == kTVMModuleHandle or + tcode == kTVMNDArrayHandle or tcode == kTVMObjectRefArg or tcode > kTVMExtBegin): CALL(TVMCbArgToReturn(&value, &tcode)) diff --git a/rust/tvm-rt/src/to_function.rs b/rust/tvm-rt/src/to_function.rs index a89652b0378c..affd81b0e7ed 100644 --- a/rust/tvm-rt/src/to_function.rs +++ b/rust/tvm-rt/src/to_function.rs @@ -103,8 +103,10 @@ pub trait ToFunction<Args, Ret>: Sized { value = args_list[i]; tcode = type_codes_list[i]; if tcode == ffi::TVMArgTypeCode_kTVMObjectHandle as c_int + || tcode == ffi::TVMArgTypeCode_kTVMObjectRValueRefArg as c_int || tcode == ffi::TVMArgTypeCode_kTVMPackedFuncHandle as c_int || tcode == ffi::TVMArgTypeCode_kTVMModuleHandle as c_int + || tcode ==
ffi::TVMArgTypeCode_kTVMNDArrayHandle as c_int { check_call!(ffi::TVMCbArgToReturn( &mut value as *mut _, diff --git a/tests/python/unittest/test_runtime_packed_func.py b/tests/python/unittest/test_runtime_packed_func.py index 718fe03d5c16..b681e4fc25d7 100644 --- a/tests/python/unittest/test_runtime_packed_func.py +++ b/tests/python/unittest/test_runtime_packed_func.py @@ -333,7 +333,19 @@ def test_numpy_scalar(): assert tvm.testing.echo(np.int64(maxint)) == maxint +def test_ndarray_args(): + def check(arr): + assert not arr.is_view + assert tvm.testing.object_use_count(arr) == 2 + + fcheck = tvm.runtime.convert(check) + x = tvm.nd.array([1, 2, 3]) + fcheck(x) + assert tvm.testing.object_use_count(x) == 1 + + if __name__ == "__main__": + test_ndarray_args() test_numpy_scalar() test_rvalue_ref() test_empty_array() diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 5c9b9d8181d7..80e7d71f06ad 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -1216,6 +1216,7 @@ export class Instance implements Disposable { tcode == ArgTypeCode.TVMObjectHandle || tcode == ArgTypeCode.TVMObjectRValueRefArg || tcode == ArgTypeCode.TVMPackedFuncHandle || + tcode == ArgTypeCode.TVMNDArrayHandle || tcode == ArgTypeCode.TVMModuleHandle ) { lib.checkCall( From 397e0752a6a9021acf7963c2486702a6b1d6fda7 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Sat, 24 Oct 2020 08:58:19 -0500 Subject: [PATCH 044/258] [LLVM] Avoid warnings when compiling getNumElements with LLVM12+ (#6738) * [LLVM] Avoid warnings when compiling getNumElements with LLVM12+ Extract the element-count code into GetVectorNumElements and make it compile cleanly with all LLVM versions. * Trigger another build --- src/target/llvm/codegen_llvm.cc | 18 +++++++++--------- src/target/llvm/codegen_llvm.h | 14 ++++++++++++++ src/target/llvm/codegen_x86_64.cc | 6 +++++- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 40ec2cc9e0b8..9bc56dc91458 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -487,7 +487,7 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { } llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent) { - int num_elems = llvm::cast<llvm::VectorType>(vec->getType())->getNumElements(); + int num_elems = GetVectorNumElements(vec); if (extent == num_elems && begin == 0) return vec; CHECK(begin >= 0 && extent <= num_elems) << "Slicing out of bound!\n"; std::vector<llvm::Constant*> indices; @@ -503,7 +503,7 @@ llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent } llvm::Value* CodeGenLLVM::CreateVecFlip(llvm::Value* vec) { - int num_elems = llvm::cast<llvm::VectorType>(vec->getType())->getNumElements(); + int num_elems = GetVectorNumElements(vec); #if TVM_LLVM_VERSION >= 110 std::vector<int> indices; #else std::vector<unsigned> indices; #endif @@ -517,7 +517,7 @@ llvm::Value* CodeGenLLVM::CreateVecFlip(llvm::Value* vec) { llvm::Value* CodeGenLLVM::CreateVecPad(llvm::Value* vec, int target_lanes) { llvm::Value* mask = llvm::UndefValue::get(DTypeToLLVMType(DataType::Int(32, target_lanes))); - int num_elems = llvm::cast<llvm::VectorType>(vec->getType())->getNumElements(); + int num_elems = GetVectorNumElements(vec); if (num_elems == target_lanes) return vec; CHECK_LT(num_elems, target_lanes); for (int i = 0; i < num_elems; ++i) { @@ -531,15 +531,15 @@ llvm::Value* CodeGenLLVM::CreateVecConcat(std::vector<llvm::Value*> vecs) { int total_lanes = 0; for (llvm::Value* v : vecs) { - total_lanes += llvm::cast<llvm::VectorType>(v->getType())->getNumElements(); + total_lanes +=
GetVectorNumElements(v); } while (vecs.size() > 1) { std::vector<llvm::Value*> new_vecs; for (size_t i = 0; i < vecs.size() - 1; i += 2) { llvm::Value* lhs = vecs[i]; llvm::Value* rhs = vecs[i + 1]; - const size_t lhs_lanes = llvm::cast<llvm::VectorType>(lhs->getType())->getNumElements(); - const size_t rhs_lanes = llvm::cast<llvm::VectorType>(rhs->getType())->getNumElements(); + const size_t lhs_lanes = GetVectorNumElements(lhs); + const size_t rhs_lanes = GetVectorNumElements(rhs); if (lhs_lanes < rhs_lanes) { lhs = CreateVecPad(lhs, rhs_lanes); } else if (rhs_lanes < lhs_lanes) { @@ -843,16 +843,16 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { return builder_->CreateFCmpUNO(a, a); } else if (op->op.same_as(builtin::vectorlow())) { llvm::Value* v = MakeValue(op->args[0]); - int l = llvm::cast<llvm::VectorType>(v->getType())->getNumElements(); + int l = GetVectorNumElements(v); return CreateVecSlice(v, 0, l / 2); } else if (op->op.same_as(builtin::vectorhigh())) { llvm::Value* v = MakeValue(op->args[0]); - int l = llvm::cast<llvm::VectorType>(v->getType())->getNumElements(); + int l = GetVectorNumElements(v); return CreateVecSlice(v, l / 2, l / 2); } else if (op->op.same_as(builtin::vectorcombine())) { llvm::Value* v0 = MakeValue(op->args[0]); llvm::Value* v1 = MakeValue(op->args[1]); - int num_elems = llvm::cast<llvm::VectorType>(v0->getType())->getNumElements() * 2; + int num_elems = GetVectorNumElements(v0) * 2; #if TVM_LLVM_VERSION >= 110 std::vector<int> indices; #else diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index 3b0ce10534fd..78eb5e2dcac7 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -242,6 +242,11 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>, */ llvm::Function* GetIntrinsicDecl(llvm::Intrinsic::ID id, llvm::Type* ret_type, llvm::ArrayRef<llvm::Type*> arg_types); + /*! + * \brief Get the number of elements in the given vector value. + * \param vec The value, must be of a vector type. + */ + inline int GetVectorNumElements(llvm::Value* vec); // initialize the function state. void InitFuncState(); // Get alignment given index.
@@ -348,6 +353,15 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>, */ static std::unique_ptr<DebugInfo> CreateDebugInfo(llvm::Module* module); }; + +inline int CodeGenLLVM::GetVectorNumElements(llvm::Value* vec) { +#if TVM_LLVM_VERSION >= 120 + return llvm::cast<llvm::FixedVectorType>(vec->getType())->getNumElements(); +#else + return llvm::cast<llvm::VectorType>(vec->getType())->getNumElements(); +#endif +} + } // namespace codegen } // namespace tvm #endif // LLVM_VERSION diff --git a/src/target/llvm/codegen_x86_64.cc b/src/target/llvm/codegen_x86_64.cc index f3362fb0f1eb..a71a0226c958 100644 --- a/src/target/llvm/codegen_x86_64.cc +++ b/src/target/llvm/codegen_x86_64.cc @@ -117,7 +117,11 @@ llvm::Value* CodeGenX86_64::CallVectorIntrin(llvm::Intrinsic::ID id, size_t intr llvm::Type* result_ty, const std::vector<llvm::Value*>& args) { llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), id, {}); +#if TVM_LLVM_VERSION >= 120 + size_t num_elems = llvm::cast<llvm::FixedVectorType>(result_ty)->getNumElements(); +#else size_t num_elems = llvm::cast<llvm::VectorType>(result_ty)->getNumElements(); +#endif if (intrin_lanes == num_elems) { return builder_->CreateCall(f, args); } @@ -130,7 +134,7 @@ llvm::Value* CodeGenX86_64::CallVectorIntrin(llvm::Intrinsic::ID id, size_t intr std::vector<llvm::Value*> split_args; for (const auto& v : args) { if (v->getType()->isVectorTy()) { - CHECK_EQ(llvm::cast<llvm::VectorType>(v->getType())->getNumElements(), num_elems); + CHECK_EQ(GetVectorNumElements(v), num_elems); split_args.push_back(CreateVecSlice(v, i, intrin_lanes)); } else { split_args.push_back(v); From 57441b6c2397401a6102f5838cb8df199cc033f7 Mon Sep 17 00:00:00 2001 From: Gus Smith Date: Sat, 24 Oct 2020 06:59:35 -0700 Subject: [PATCH 045/258] Add BatchNormAttrs Rust bindings (#6678) --- rust/tvm/src/ir/relay/attrs/nn.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index f743534e5f61..cb96f0fbf588 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -94,3 +94,15 @@ pub struct SoftmaxAttrsNode { pub base: BaseAttrsNode, pub axis: i32, } + +#[repr(C)] +#[derive(Object)] +#[ref_name = "BatchNormAttrs"] +#[type_key = "relay.attrs.BatchNormAttrs"] +pub struct BatchNormAttrsNode { + pub base: BaseAttrsNode, + pub axis: i32, + pub epsilon: f64, + pub center: bool, + pub scale: bool, +} From c39de6916108f0a605a3fa54607e5824a71aea5e Mon Sep 17 00:00:00 2001 From: masahi Date: Sat, 24 Oct 2020 23:00:36 +0900 Subject: [PATCH 046/258] [Torch] Support bincount and scatter_add ops (#6740) --- python/tvm/relay/frontend/pytorch.py | 33 +++++++++++++++++++ tests/python/frontend/pytorch/test_forward.py | 32 +++++++++++++----- 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index c8fbd5a5c10c..c41d6802edd9 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -2357,6 +2357,37 @@ def _impl(inputs, input_types): return _impl +def _bincount(): + def _impl(inputs, input_types): + data = inputs[0] + weights = inputs[1] + maximum = _op.max(data) + dim = maximum + _expr.const(1, dtype="int64") + if weights: + weight_type = _infer_type(weights).checked_type + out_dtype = weight_type.dtype + updates = weights + else: + out_dtype = "int64" + updates = _op.ones_like(data) + + counts = _op.zeros(_op.reshape(dim, [1]), out_dtype) + return _op.scatter_add(counts, data, updates, axis=0) + + return _impl + + +def _scatter_add(): + def _impl(inputs, input_types): + data = inputs[0] + axis = inputs[1] +
index = inputs[2] + src = inputs[3] + return _op.scatter_add(data, index, src, axis=axis) + + return _impl + + def _pytorch_result_type(dtypes, non_tensor_inputs): """This promotes TVM dtypes like PyTorch would""" import torch @@ -2699,6 +2730,8 @@ def _get_convert_map(prelude, default_dtype): "aten::tensor": _identity(), # used for example in tensor(1.0) "aten::numel": _numel(), "aten::empty": _empty(), + "aten::bincount": _bincount(), + "aten::scatter_add": _scatter_add(), } return convert_map diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 54c3daf25385..e997ebe07a50 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -3139,26 +3139,27 @@ def forward(self, data): def test_forward_scatter(): - class Scatter(Module): - def __init__(self, dim=0): - super().__init__() - self.dim = dim + # integer cannot be traced + def test_fn_scatter(dim): + return lambda data, index, src: torch.scatter(data, dim=dim, index=index, src=src) - def forward(self, data, index, src): - return torch.scatter(data, dim=self.dim, index=index, src=src) + def test_fn_scatter_add(dim): + return lambda data, index, src: torch.scatter_add(data, dim=dim, index=index, src=src) in_data = torch.zeros(3, 5) in_index = torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]) in_src = torch.rand(2, 5) # TODO: add scatter gpu schedule to enable gpu test. - verify_trace_model(Scatter(), [in_data, in_index, in_src], ["llvm"]) + verify_trace_model(test_fn_scatter(0), [in_data, in_index, in_src], ["llvm"]) + verify_trace_model(test_fn_scatter_add(0), [in_data, in_index, in_src], ["llvm"]) in_data = torch.zeros(2, 4) in_index = torch.tensor([[2], [3]]) in_src = torch.rand(2, 1) - # TODO: add scatter gpu schedule to enable gpu test. - verify_trace_model(Scatter(1), [in_data, in_index, in_src], ["llvm"]) + # # TODO: add scatter gpu schedule to enable gpu test. 
+ verify_trace_model(test_fn_scatter(1), [in_data, in_index, in_src], ["llvm"]) + verify_trace_model(test_fn_scatter_add(1), [in_data, in_index, in_src], ["llvm"]) def test_numel(): @@ -3350,6 +3351,18 @@ def expected(x_shape, y_shape): assert tvm.ir.structural_equal(expected_mod, mod["main"], map_free_vars=True) +def test_bincount(): + def test_fn(x, weights=None): + return torch.bincount(x, weights=weights) + + inp = torch.randint(0, 8, (5,), dtype=torch.int64) + weights = torch.linspace(0, 1, steps=5) + + verify_trace_model(test_fn, [inp], ["llvm"]) + verify_trace_model(test_fn, [inp, weights], ["llvm"]) + verify_trace_model(test_fn, [inp, weights.to(torch.float64)], ["llvm"]) + + if __name__ == "__main__": # some structural tests test_forward_traced_function() @@ -3476,6 +3489,7 @@ def expected(x_shape, y_shape): test_forward_nonzero() test_forward_scatter() test_numel() + test_bincount() # Model tests test_resnet18() From 729b30a637b3d19f17595b01da881fc231f5799d Mon Sep 17 00:00:00 2001 From: Andrew Liu Date: Sat, 24 Oct 2020 07:09:18 -0700 Subject: [PATCH 047/258] [Docker][CI][BYODT] add universal to Docker image (#6654) --- docker/Dockerfile.ci_cpu | 4 ++++ docker/Dockerfile.ci_gpu | 4 ++++ docker/install/ubuntu_install_universal.sh | 26 ++++++++++++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 docker/install/ubuntu_install_universal.sh diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu index 44eec4d6319c..b29c93b66707 100644 --- a/docker/Dockerfile.ci_cpu +++ b/docker/Dockerfile.ci_cpu @@ -64,6 +64,10 @@ ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh RUN bash /install/ubuntu_install_java.sh +# BYODT deps +COPY install/ubuntu_install_universal.sh /install/ubuntu_install_universal.sh +RUN bash /install/ubuntu_install_universal.sh + # Chisel deps for TSIM COPY install/ubuntu_install_chisel.sh /install/ubuntu_install_chisel.sh RUN bash /install/ubuntu_install_chisel.sh diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index bf2e21394f36..ac76af6b0a1e 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -98,6 +98,10 @@ RUN bash /install/ubuntu_install_redis.sh COPY install/ubuntu_install_nnpack.sh /install/ubuntu_install_nnpack.sh RUN bash /install/ubuntu_install_nnpack.sh +# BYODT deps +COPY install/ubuntu_install_universal.sh /install/ubuntu_install_universal.sh +RUN bash /install/ubuntu_install_universal.sh + # Environment variables ENV PATH=/usr/local/nvidia/bin:${PATH} ENV PATH=/usr/local/cuda/bin:${PATH} diff --git a/docker/install/ubuntu_install_universal.sh b/docker/install/ubuntu_install_universal.sh new file mode 100644 index 000000000000..a054aafdd5f7 --- /dev/null +++ b/docker/install/ubuntu_install_universal.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +git clone https://github.com/stillwater-sc/universal.git /opt/universal + +# Use specific versioning tag. +(cd /opt/universal && git checkout e32899d551b53d758865fabd5fdd69eed35bfb0f) \ No newline at end of file From 97cc1ac48bbe7581c6d5e3e18467e11a45792697 Mon Sep 17 00:00:00 2001 From: ANSHUMAN TRIPATHY Date: Sat, 24 Oct 2020 19:41:33 +0530 Subject: [PATCH 048/258] [Relay][Frontend] Tensorflow version support upgrade from 2.1.0 to 2.3.1 (#6706) --- docker/install/ubuntu_install_tensorflow.sh | 2 +- python/tvm/relay/frontend/tensorflow.py | 4 ++-- tests/python/frontend/tensorflow/test_forward.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh index 25543909d78b..4e766d4d5a5b 100755 --- a/docker/install/ubuntu_install_tensorflow.sh +++ b/docker/install/ubuntu_install_tensorflow.sh @@ -20,4 +20,4 @@ set -e set -u set -o pipefail -pip3 install tensorflow==2.1.0 keras==2.3.1 h5py +pip3 install tensorflow==2.3.1 keras==2.3.1 h5py diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 9671e45a59a3..89b36256152e 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1334,7 +1334,7 @@ def _impl(inputs, attr, params, mod): op_name="batch_norm", transforms={"scale_after_normalization": "scale", "variance_epsilon": "epsilon"}, extras={"axis": axis}, - ignores=["data_format", "U"], + ignores=["data_format", "U", "exponential_avg_factor"], disables=["momentum"], )(inputs, attr) @@ -1364,7 +1364,7 @@ def _impl(inputs, attr, params, mod): op_name="batch_norm", transforms={"scale_after_normalization": "scale", "variance_epsilon": "epsilon"}, extras={"axis": axis}, - ignores=["data_format"], + ignores=["data_format", "exponential_avg_factor"], disables=["momentum"], )(new_inputs, attr) diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 6a24b5752772..143030d080c9 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -3660,7 +3660,7 @@ def _test_math_op(op, dtypes=["int32", "float32"]): _test_math_op(tf.math.reduce_max) _test_math_op(tf.math.reduce_min) _test_math_op(tf.math.reduce_prod) - _test_math_op(tf.math.reduce_variance) + _test_math_op(tf.math.reduce_variance, dtypes=["float32"]) _test_math_op(tf.math.reduce_std, dtypes=["float32"]) _test_math_op(tf.math.reduce_logsumexp, dtypes=["float32"]) if package_version.parse(tf.VERSION) >= package_version.parse("1.15.0"): @@ -3871,11 +3871,11 @@ def test_forward_unravel_index(): _test_forward_unravel_index([x, y]) x = np.array([0, 1, 2, 5]) - y = np.array([2, 2]) + y = np.array([2, 3]) _test_forward_unravel_index([x, y]) x = np.array([0, 1, 2, 5]) - y = np.array([2]) + y = np.array([6]) _test_forward_unravel_index([x, y]) x = np.array([102, 300, 16]) From 9e5cfd7a89e2519a582500736e1e74da9202669a Mon Sep 17 00:00:00 2001 From: Robert Kimball Date: Sat, 24 Oct 2020 15:47:24 -0700 Subject: [PATCH 049/258] Update include and src dir CHECK* to ICHECK* (#6745) --- include/tvm/arith/analyzer.h | 4 +- include/tvm/ir/attrs.h | 8 +- include/tvm/ir/diagnostic.h | 4 +- include/tvm/ir/env_func.h | 4 +- include/tvm/ir/expr.h | 16 +- include/tvm/ir/module.h | 2 +- include/tvm/ir/op.h | 6 +- 
include/tvm/ir/transform.h | 8 +- include/tvm/ir/type_functor.h | 2 +- include/tvm/node/attr_registry_map.h | 6 +- include/tvm/node/container.h | 16 +- include/tvm/node/functor.h | 8 +- include/tvm/node/reflection.h | 4 +- include/tvm/parser/source_map.h | 2 +- include/tvm/relay/base.h | 20 +- include/tvm/relay/dataflow_pattern_functor.h | 2 +- include/tvm/relay/expr_functor.h | 4 +- include/tvm/relay/pattern_functor.h | 2 +- include/tvm/runtime/container.h | 58 +-- include/tvm/runtime/data_type.h | 8 +- include/tvm/runtime/ndarray.h | 38 +- include/tvm/runtime/packed_func.h | 42 +- include/tvm/runtime/vm/bytecode.h | 1 + include/tvm/support/logging.h | 12 +- include/tvm/target/target_kind.h | 4 +- include/tvm/tir/data_layout.h | 4 +- include/tvm/tir/expr_functor.h | 2 +- include/tvm/topi/broadcast.h | 6 +- include/tvm/topi/cuda/dense.h | 8 +- include/tvm/topi/cuda/reduction.h | 4 +- include/tvm/topi/detail/broadcast.h | 14 +- include/tvm/topi/detail/constant_utils.h | 4 +- include/tvm/topi/detail/extern.h | 4 +- include/tvm/topi/detail/ravel_unravel.h | 4 +- include/tvm/topi/elemwise.h | 2 +- include/tvm/topi/nn.h | 28 +- include/tvm/topi/nn/bnn.h | 10 +- include/tvm/topi/nn/dense.h | 6 +- include/tvm/topi/nn/dilate.h | 6 +- include/tvm/topi/nn/local_response_norm.h | 6 +- include/tvm/topi/nn/pooling.h | 44 +- include/tvm/topi/nn/softmax.h | 4 +- include/tvm/topi/reduction.h | 10 +- include/tvm/topi/rocm/dense.h | 8 +- include/tvm/topi/transform.h | 104 ++--- src/arith/analyzer.cc | 6 +- src/arith/canonical_simplify.cc | 32 +- src/arith/const_fold.h | 20 +- src/arith/const_int_bound.cc | 30 +- src/arith/domain_touched.cc | 2 +- src/arith/int_constraints.cc | 16 +- src/arith/int_set.cc | 12 +- src/arith/ir_mutator_with_analyzer.cc | 2 +- src/arith/ir_visitor_with_analyzer.h | 2 +- src/arith/iter_affine_map.cc | 8 +- src/arith/modular_set.cc | 12 +- src/arith/pattern_match.h | 8 +- src/arith/rewrite_simplify.cc | 12 +- src/arith/solve_linear_equation.cc | 4 +- src/arith/solve_linear_inequality.cc | 4 +- src/auto_scheduler/compute_dag.cc | 24 +- src/auto_scheduler/cost_model.cc | 6 +- src/auto_scheduler/feature.cc | 14 +- src/auto_scheduler/loop_state.cc | 6 +- src/auto_scheduler/measure.cc | 2 +- src/auto_scheduler/measure_record.cc | 34 +- .../search_policy/empty_policy.cc | 2 +- .../search_policy/search_policy.cc | 2 +- .../search_policy/sketch_policy.cc | 6 +- .../search_policy/sketch_policy_rules.cc | 41 +- src/auto_scheduler/search_policy/utils.cc | 22 +- src/auto_scheduler/search_policy/utils.h | 26 +- src/auto_scheduler/search_task.cc | 2 +- src/auto_scheduler/transform_step.cc | 129 +++--- src/auto_scheduler/utils.h | 8 +- src/autotvm/feature_visitor.cc | 2 +- src/autotvm/touch_extractor.cc | 6 +- src/contrib/hybrid/codegen_hybrid.cc | 26 +- src/contrib/tf_op/tvm_dso_op_kernels.cc | 2 +- src/driver/driver_api.cc | 14 +- src/ir/diagnostic.cc | 2 +- src/ir/env_func.cc | 4 +- src/ir/error.cc | 6 +- src/ir/expr.cc | 10 +- src/ir/module.cc | 30 +- src/ir/op.cc | 4 +- src/ir/span.cc | 4 +- src/ir/transform.cc | 12 +- src/node/attr_registry.h | 8 +- src/node/container.cc | 40 +- src/node/reflection.cc | 8 +- src/node/serialization.cc | 20 +- src/node/structural_equal.cc | 12 +- src/node/structural_hash.cc | 22 +- src/parser/meta_ref.cc | 4 +- src/parser/parser.cc | 18 +- src/parser/source_map.cc | 2 +- src/parser/tokenizer.h | 28 +- src/printer/doc.cc | 2 +- src/printer/meta_data.h | 2 +- src/printer/relay_text_printer.cc | 4 +- src/printer/tir_text_printer.cc | 2 +- 
src/printer/tvmscript_printer.cc | 6 +- src/relay/analysis/annotated_region_set.cc | 12 +- src/relay/analysis/annotated_region_set.h | 14 +- src/relay/analysis/call_graph.cc | 24 +- src/relay/analysis/call_graph.h | 22 +- src/relay/analysis/context_analysis.cc | 36 +- src/relay/analysis/dependency_graph.cc | 2 +- src/relay/analysis/feature.cc | 6 +- src/relay/analysis/get_calibration_data.cc | 10 +- src/relay/analysis/mac_count.cc | 28 +- src/relay/analysis/match_exhaustion.cc | 6 +- src/relay/analysis/type_solver.cc | 16 +- src/relay/analysis/type_solver.h | 2 +- src/relay/analysis/util.cc | 12 +- src/relay/analysis/well_formed.cc | 14 +- src/relay/backend/build_module.cc | 22 +- src/relay/backend/compile_engine.cc | 86 ++-- src/relay/backend/compile_engine.h | 4 +- .../contrib/arm_compute_lib/codegen.cc | 44 +- .../backend/contrib/codegen_c/codegen.cc | 16 +- .../backend/contrib/codegen_c/codegen_c.h | 6 +- .../contrib/codegen_json/codegen_json.h | 16 +- src/relay/backend/contrib/dnnl/codegen.cc | 34 +- src/relay/backend/contrib/ethosn/codegen.cc | 14 +- src/relay/backend/contrib/tensorrt/codegen.cc | 16 +- src/relay/backend/graph_plan_memory.cc | 30 +- src/relay/backend/graph_runtime_codegen.cc | 24 +- src/relay/backend/interpreter.cc | 46 +- src/relay/backend/param_dict.cc | 12 +- src/relay/backend/utils.h | 24 +- src/relay/backend/vm/compiler.cc | 102 ++--- src/relay/backend/vm/lambda_lift.cc | 7 +- src/relay/ir/dataflow_matcher.cc | 8 +- src/relay/ir/expr.cc | 6 +- src/relay/ir/expr_functor.cc | 14 +- src/relay/ir/function.cc | 4 +- src/relay/ir/indexed_graph.h | 4 +- src/relay/ir/transform.cc | 2 +- src/relay/op/algorithm/argsort.cc | 4 +- src/relay/op/algorithm/topk.cc | 6 +- src/relay/op/dyn/algorithm/topk.cc | 14 +- src/relay/op/dyn/image/resize.cc | 6 +- src/relay/op/dyn/nn/pad.cc | 12 +- src/relay/op/dyn/nn/upsampling.cc | 16 +- src/relay/op/dyn/nn/upsampling.h | 2 +- src/relay/op/dyn/tensor/transform.cc | 49 +-- src/relay/op/image/dilation2d.cc | 10 +- src/relay/op/image/grid_sample.cc | 14 +- src/relay/op/image/resize.cc | 16 +- src/relay/op/memory/memory.cc | 42 +- src/relay/op/nn/bitserial.cc | 20 +- src/relay/op/nn/convolution.h | 214 +++++----- src/relay/op/nn/correlation.cc | 6 +- src/relay/op/nn/nn.cc | 84 ++-- src/relay/op/nn/nn.h | 10 +- src/relay/op/nn/pad.cc | 52 +-- src/relay/op/nn/pooling.cc | 146 +++---- src/relay/op/nn/sparse.cc | 10 +- src/relay/op/nn/upsampling.cc | 12 +- src/relay/op/nn/upsampling.h | 2 +- src/relay/op/op_common.h | 6 +- src/relay/op/tensor/binary.cc | 2 +- src/relay/op/tensor/reduce.cc | 50 +-- src/relay/op/tensor/transform.cc | 400 +++++++++--------- src/relay/op/tensor/transform.h | 6 +- src/relay/op/tensor/unary.cc | 12 +- src/relay/op/type_relations.cc | 12 +- src/relay/op/vision/multibox_op.cc | 24 +- src/relay/op/vision/nms.cc | 10 +- src/relay/op/vision/rcnn_op.cc | 34 +- src/relay/op/vision/yolo.cc | 8 +- src/relay/op/vm/vm.cc | 22 +- src/relay/qnn/op/concatenate.cc | 28 +- src/relay/qnn/op/convolution.cc | 36 +- src/relay/qnn/op/dense.cc | 20 +- src/relay/qnn/op/dequantize.cc | 22 +- src/relay/qnn/op/op_common.h | 22 +- src/relay/qnn/op/quantize.cc | 22 +- src/relay/qnn/op/requantize.cc | 42 +- src/relay/qnn/utils.cc | 4 +- src/relay/qnn/utils.h | 29 +- src/relay/quantize/annotate.cc | 2 +- src/relay/quantize/calibrate.cc | 8 +- src/relay/quantize/quantize.cc | 6 +- src/relay/quantize/realize.cc | 58 +-- src/relay/transforms/alter_op_layout.cc | 2 +- src/relay/transforms/annotate_target.cc | 12 +- 
src/relay/transforms/canonicalize_cast.cc | 6 +- src/relay/transforms/canonicalize_ops.cc | 2 +- .../transforms/combine_parallel_conv2d.cc | 10 +- .../transforms/combine_parallel_dense.cc | 16 +- src/relay/transforms/combine_parallel_op.cc | 4 +- src/relay/transforms/convert_layout.cc | 2 +- src/relay/transforms/convert_sparse_dense.cc | 4 +- src/relay/transforms/de_duplicate.cc | 10 +- src/relay/transforms/dead_code.cc | 2 +- src/relay/transforms/defunctionalization.cc | 38 +- src/relay/transforms/device_annotation.cc | 16 +- src/relay/transforms/dynamic_to_static.cc | 48 +-- .../transforms/eliminate_common_subexpr.cc | 4 +- src/relay/transforms/eta_expand.cc | 2 +- src/relay/transforms/fold_constant.cc | 8 +- src/relay/transforms/fold_scale_axis.cc | 60 +-- src/relay/transforms/forward_rewrite.cc | 4 +- src/relay/transforms/fuse_ops.cc | 44 +- src/relay/transforms/gradient.cc | 40 +- src/relay/transforms/infer_layout_utils.h | 6 +- src/relay/transforms/inline.cc | 10 +- src/relay/transforms/lazy_gradient_init.cc | 4 +- src/relay/transforms/legalize.cc | 2 +- src/relay/transforms/let_list.h | 6 +- .../transforms/merge_compiler_regions.cc | 6 +- src/relay/transforms/merge_composite.cc | 2 +- src/relay/transforms/partial_eval.cc | 70 +-- src/relay/transforms/partition_graph.cc | 16 +- src/relay/transforms/pattern_utils.h | 20 +- src/relay/transforms/simplify_fc_transpose.cc | 4 +- src/relay/transforms/simplify_inference.cc | 16 +- src/relay/transforms/to_a_normal_form.cc | 14 +- .../transforms/to_basic_block_normal_form.cc | 2 +- src/relay/transforms/to_cps.cc | 8 +- src/relay/transforms/transform_layout.h | 14 +- src/relay/transforms/type_infer.cc | 38 +- src/runtime/c_runtime_api.cc | 32 +- src/runtime/container.cc | 2 +- .../contrib/arm_compute_lib/acl_allocator.cc | 2 +- .../contrib/arm_compute_lib/acl_runtime.cc | 12 +- .../contrib/arm_compute_lib/acl_utils.cc | 7 +- src/runtime/contrib/cblas/cblas.cc | 8 +- src/runtime/contrib/cblas/gemm_common.h | 60 +-- src/runtime/contrib/cblas/mkl.cc | 12 +- src/runtime/contrib/cblas/mkldnn.cc | 4 +- src/runtime/contrib/coreml/coreml_runtime.mm | 7 +- src/runtime/contrib/cublas/cublas.cc | 80 ++-- src/runtime/contrib/cublas/cublas_utils.h | 10 +- src/runtime/contrib/cudnn/cudnn_utils.h | 10 +- src/runtime/contrib/cudnn/softmax.cc | 2 +- src/runtime/contrib/dnnl/dnnl_json_runtime.cc | 18 +- src/runtime/contrib/json/json_node.h | 22 +- src/runtime/contrib/json/json_runtime.h | 24 +- src/runtime/contrib/miopen/miopen_utils.h | 10 +- src/runtime/contrib/mps/conv.mm | 18 +- src/runtime/contrib/mps/gemm.mm | 26 +- src/runtime/contrib/mps/mps_utils.h | 2 +- src/runtime/contrib/nnpack/convolution.cc | 90 ++-- src/runtime/contrib/nnpack/fully_connected.cc | 24 +- src/runtime/contrib/nnpack/nnpack_utils.cc | 4 +- src/runtime/contrib/nnpack/nnpack_utils.h | 2 +- src/runtime/contrib/onnx/onnx_module.cc | 4 +- .../contrib/random/mt_random_engine.cc | 14 +- src/runtime/contrib/random/random.cc | 6 +- src/runtime/contrib/rocblas/rocblas.cc | 32 +- src/runtime/contrib/sort/sort.cc | 22 +- .../contrib/tensorrt/tensorrt_builder.cc | 28 +- .../contrib/tensorrt/tensorrt_logger.h | 2 +- src/runtime/contrib/tensorrt/tensorrt_ops.cc | 162 +++---- .../contrib/tensorrt/tensorrt_runtime.cc | 14 +- src/runtime/contrib/tflite/tflite_runtime.cc | 4 +- src/runtime/contrib/tflite/tflite_runtime.h | 2 +- src/runtime/contrib/thrust/thrust.cu | 4 +- src/runtime/cpu_device_api.cc | 2 +- src/runtime/cuda/cuda_common.h | 9 +- src/runtime/cuda/cuda_device_api.cc | 2 +- 
src/runtime/cuda/cuda_module.cc | 10 +- src/runtime/dso_library.cc | 6 +- src/runtime/file_utils.cc | 10 +- .../graph/debug/graph_runtime_debug.cc | 14 +- src/runtime/graph/graph_runtime.cc | 75 ++-- src/runtime/graph/graph_runtime.h | 62 +-- src/runtime/graph/graph_runtime_factory.cc | 30 +- src/runtime/hexagon/hexagon_device_api.cc | 30 +- src/runtime/hexagon/hexagon_module.cc | 18 +- src/runtime/hexagon/hexagon_module.h | 2 +- src/runtime/hexagon/sim/hexagon_device_sim.cc | 68 +-- .../hexagon/target/hexagon_dsprpcapi.cc | 4 +- .../hexagon/target/hexagon_dsprpcapi.h | 4 +- src/runtime/hexagon/target/hexagon_stubapi.cc | 4 +- src/runtime/hexagon/target/hexagon_stubapi.h | 2 +- src/runtime/library_module.cc | 20 +- src/runtime/metadata_module.cc | 24 +- src/runtime/metal/metal_common.h | 10 +- src/runtime/metal/metal_device_api.mm | 16 +- src/runtime/metal/metal_module.mm | 18 +- src/runtime/micro/micro_session.cc | 18 +- src/runtime/minrpc/minrpc_server.h | 2 +- src/runtime/module.cc | 12 +- src/runtime/ndarray.cc | 42 +- src/runtime/object.cc | 26 +- src/runtime/opencl/opencl_common.h | 8 +- src/runtime/opencl/opencl_device_api.cc | 10 +- src/runtime/opencl/opencl_module.cc | 12 +- src/runtime/pack_args.h | 4 +- src/runtime/registry.cc | 4 +- src/runtime/rocm/rocm_common.h | 8 +- src/runtime/rocm/rocm_device_api.cc | 4 +- src/runtime/rocm/rocm_module.cc | 8 +- src/runtime/rpc/rpc_device_api.cc | 6 +- src/runtime/rpc/rpc_endpoint.cc | 48 +-- src/runtime/rpc/rpc_module.cc | 38 +- src/runtime/rpc/rpc_pipe_impl.cc | 4 +- src/runtime/rpc/rpc_server_env.cc | 2 +- src/runtime/rpc/rpc_session.cc | 4 +- src/runtime/rpc/rpc_socket_impl.cc | 14 +- src/runtime/stackvm/stackvm.cc | 18 +- src/runtime/stackvm/stackvm.h | 8 +- src/runtime/stackvm/stackvm_module.cc | 6 +- src/runtime/thread_pool.cc | 10 +- src/runtime/threading_backend.cc | 6 +- src/runtime/vm/bytecode.cc | 1 - src/runtime/vm/executable.cc | 28 +- src/runtime/vm/memory_manager.cc | 12 +- src/runtime/vm/profiler/vm.cc | 16 +- src/runtime/vm/serialize_utils.h | 8 +- src/runtime/vm/vm.cc | 62 +-- src/runtime/vulkan/vulkan.cc | 34 +- src/runtime/vulkan/vulkan_common.h | 10 +- src/runtime/vulkan/vulkan_shader.h | 2 +- src/runtime/vulkan/vulkan_stream.h | 2 +- src/runtime/workspace_pool.cc | 6 +- src/support/base64.h | 16 +- src/support/parallel_for.cc | 12 +- src/support/pipe.h | 12 +- src/support/ring_buffer.h | 4 +- src/support/socket.h | 22 +- src/target/build_common.h | 2 +- src/target/codegen.cc | 4 +- src/target/datatype/registry.cc | 6 +- src/target/generic_func.cc | 8 +- src/target/intrin_rule.cc | 14 +- src/target/intrin_rule.h | 6 +- src/target/llvm/codegen_amdgpu.cc | 28 +- src/target/llvm/codegen_arm.cc | 2 +- src/target/llvm/codegen_cpu.cc | 64 +-- src/target/llvm/codegen_hexagon.cc | 46 +- src/target/llvm/codegen_llvm.cc | 78 ++-- src/target/llvm/codegen_nvptx.cc | 22 +- src/target/llvm/codegen_x86_64.cc | 6 +- src/target/llvm/intrin_rule_llvm.cc | 10 +- src/target/llvm/intrin_rule_llvm.h | 4 +- src/target/llvm/intrin_rule_nvptx.cc | 9 +- src/target/llvm/intrin_rule_rocm.cc | 14 +- src/target/llvm/llvm_common.cc | 4 +- src/target/llvm/llvm_module.cc | 48 +-- src/target/opt/build_cuda_on.cc | 6 +- src/target/source/codegen_aocl.cc | 4 +- src/target/source/codegen_c.cc | 57 +-- src/target/source/codegen_c_host.cc | 16 +- src/target/source/codegen_cuda.cc | 56 +-- src/target/source/codegen_metal.cc | 19 +- src/target/source/codegen_opencl.cc | 10 +- src/target/source/codegen_source_base.cc | 6 +- src/target/source/codegen_vhls.cc | 
8 +- src/target/source/intrin_rule_cuda.cc | 6 +- src/target/source/intrin_rule_opencl.cc | 6 +- src/target/source/source_module.cc | 8 +- src/target/spirv/build_vulkan.cc | 18 +- src/target/spirv/codegen_spirv.cc | 74 ++-- src/target/spirv/codegen_spirv.h | 2 +- src/target/spirv/intrin_rule_spirv.cc | 2 +- src/target/spirv/ir_builder.cc | 62 +-- src/target/spirv/ir_builder.h | 6 +- src/target/stackvm/codegen_stackvm.cc | 48 +-- src/target/tag.cc | 2 +- src/target/target.cc | 10 +- src/target/target_kind.cc | 6 +- src/te/autodiff/ad_simplify.cc | 22 +- src/te/autodiff/jacobian.cc | 11 +- src/te/operation/compute_op.cc | 44 +- src/te/operation/cross_thread_reduction.cc | 9 +- src/te/operation/extern_op.cc | 16 +- src/te/operation/hybrid_op.cc | 38 +- src/te/operation/op_utils.cc | 20 +- src/te/operation/placeholder_op.cc | 4 +- src/te/operation/scan_op.cc | 44 +- src/te/operation/tensor_compute_op.cc | 16 +- src/te/operation/tensorize.cc | 92 ++-- src/te/schedule/bound.cc | 26 +- src/te/schedule/graph.cc | 10 +- src/te/schedule/message_passing.cc | 64 +-- src/te/schedule/operation_inline.cc | 6 +- src/te/schedule/schedule_dataflow_rewrite.cc | 65 +-- src/te/schedule/schedule_lang.cc | 80 ++-- src/te/schedule/schedule_ops.cc | 49 +-- ...hedule_postproc_rewrite_for_tensor_core.cc | 40 +- .../schedule/schedule_postproc_to_primfunc.cc | 4 +- src/te/tensor.cc | 4 +- src/tir/analysis/verify_gpu_code.cc | 4 +- src/tir/analysis/verify_memory.cc | 2 +- src/tir/analysis/verify_ssa.cc | 2 +- src/tir/ir/buffer.cc | 14 +- src/tir/ir/data_layout.cc | 62 +-- src/tir/ir/expr.cc | 152 +++---- src/tir/ir/stmt.cc | 62 +-- src/tir/ir/transform.cc | 2 +- src/tir/op/op.cc | 70 +-- src/tir/transforms/arg_binder.cc | 18 +- src/tir/transforms/bf16_legalize.cc | 10 +- src/tir/transforms/combine_context_call.cc | 4 +- src/tir/transforms/coproc_sync.cc | 24 +- src/tir/transforms/hoist_if_then_else.cc | 2 +- src/tir/transforms/inject_copy_intrin.cc | 14 +- src/tir/transforms/inject_double_buffer.cc | 18 +- src/tir/transforms/inject_prefetch.cc | 2 +- src/tir/transforms/inject_virtual_thread.cc | 18 +- src/tir/transforms/ir_utils.cc | 18 +- src/tir/transforms/ir_utils.h | 4 +- src/tir/transforms/lift_attr_scope.cc | 2 +- src/tir/transforms/loop_partition.cc | 10 +- src/tir/transforms/lower_custom_datatypes.cc | 24 +- .../lower_device_storage_access_info.cc | 12 +- src/tir/transforms/lower_intrin.cc | 8 +- src/tir/transforms/lower_thread_allreduce.cc | 36 +- src/tir/transforms/lower_tvm_builtin.cc | 22 +- src/tir/transforms/lower_warp_memory.cc | 28 +- src/tir/transforms/make_packed_api.cc | 10 +- src/tir/transforms/narrow_datatype.cc | 22 +- src/tir/transforms/remap_thread_axis.cc | 6 +- src/tir/transforms/remove_no_op.cc | 2 +- src/tir/transforms/split_host_device.cc | 20 +- src/tir/transforms/storage_access.cc | 16 +- src/tir/transforms/storage_flatten.cc | 48 +-- src/tir/transforms/storage_rewrite.cc | 56 +-- .../transforms/tensorcore_infer_fragment.cc | 60 +-- src/tir/transforms/thread_storage_sync.cc | 14 +- src/tir/transforms/unroll_loop.cc | 4 +- src/tir/transforms/vectorize_loop.cc | 18 +- src/topi/transform.cc | 2 +- 429 files changed, 4252 insertions(+), 4234 deletions(-) diff --git a/include/tvm/arith/analyzer.h b/include/tvm/arith/analyzer.h index a9a0bed6712a..cd20bdcf4d1a 100644 --- a/include/tvm/arith/analyzer.h +++ b/include/tvm/arith/analyzer.h @@ -320,10 +320,10 @@ class CanonicalSimplifier { * arith::Analyzer analyzer; * { * With scope(&analyzer, x % 3 == 0); - * 
CHECK_EQ(analyzer.modular_set(x)->coeff, 3); + * ICHECK_EQ(analyzer.modular_set(x)->coeff, 3); * } * // constraint no longer in effect. - * CHECK_NE(analyzer.modular_set(x)->coeff, 3); + * ICHECK_NE(analyzer.modular_set(x)->coeff, 3); * * \endcode */ diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h index e92baf12b05f..afb8ef0730e0 100644 --- a/include/tvm/ir/attrs.h +++ b/include/tvm/ir/attrs.h @@ -428,7 +428,7 @@ inline void SetValue(double* ptr, const TVMArgValue& val) { *ptr = val.operator double(); } else { ObjectRef expr = val; - CHECK(expr.defined()); + ICHECK(expr.defined()); if (const IntImmNode* op = expr.as()) { *ptr = static_cast(op->value); } else if (const FloatImmNode* op = expr.as()) { @@ -664,7 +664,7 @@ class AttrsNode : public BaseAttrsNode { } void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final { - CHECK_EQ(args.size() % 2, 0); + ICHECK_EQ(args.size() % 2, 0); const int kLinearSearchBound = 16; int hit_count = 0; // applies two stratgies to lookup @@ -672,7 +672,7 @@ class AttrsNode : public BaseAttrsNode { // linear search. auto ffind = [&args](const char* key, runtime::TVMArgValue* val) { for (int i = 0; i < args.size(); i += 2) { - CHECK_EQ(args.type_codes[i], kTVMStr); + ICHECK_EQ(args.type_codes[i], kTVMStr); if (!std::strcmp(key, args.values[i].v_str)) { *val = args[i + 1]; return true; @@ -687,7 +687,7 @@ class AttrsNode : public BaseAttrsNode { // construct a map then do lookup. std::unordered_map kwargs; for (int i = 0; i < args.size(); i += 2) { - CHECK_EQ(args.type_codes[i], kTVMStr); + ICHECK_EQ(args.type_codes[i], kTVMStr); kwargs[args[i].operator std::string()] = args[i + 1]; } auto ffind = [&kwargs](const char* key, runtime::TVMArgValue* val) { diff --git a/include/tvm/ir/diagnostic.h b/include/tvm/ir/diagnostic.h index 2a2a6cd4e867..2053a295a3b8 100644 --- a/include/tvm/ir/diagnostic.h +++ b/include/tvm/ir/diagnostic.h @@ -149,7 +149,7 @@ class DiagnosticRenderer : public ObjectRef { void Render(const DiagnosticContext& ctx); DiagnosticRendererNode* operator->() { - CHECK(get() != nullptr); + ICHECK(get() != nullptr); return static_cast(get_mutable()); } @@ -203,7 +203,7 @@ class DiagnosticContext : public ObjectRef { void Render(); DiagnosticContextNode* operator->() { - CHECK(get() != nullptr); + ICHECK(get() != nullptr); return static_cast(get_mutable()); } diff --git a/include/tvm/ir/env_func.h b/include/tvm/ir/env_func.h index 65653b75562d..386666a2c50c 100644 --- a/include/tvm/ir/env_func.h +++ b/include/tvm/ir/env_func.h @@ -83,7 +83,7 @@ class EnvFunc : public ObjectRef { template runtime::TVMRetValue operator()(Args&&... args) const { const EnvFuncNode* n = operator->(); - CHECK(n != nullptr); + ICHECK(n != nullptr); return n->func(std::forward(args)...); } /*! @@ -137,7 +137,7 @@ class TypedEnvFunc : public ObjectRef { */ R operator()(Args... 
args) const { const EnvFuncNode* n = operator->(); - CHECK(n != nullptr); + ICHECK(n != nullptr); return runtime::detail::typed_packed_call_dispatcher::run(n->func, std::forward(args)...); } diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h index d6cfc5a64121..c982c5cf850b 100644 --- a/include/tvm/ir/expr.h +++ b/include/tvm/ir/expr.h @@ -386,7 +386,7 @@ class Integer : public IntImm { * \brief convert to int64_t */ operator int64_t() const { - CHECK(data_ != nullptr) << " Trying to reference a null Integer"; + ICHECK(data_ != nullptr) << " Trying to reference a null Integer"; return (*this)->value; } // comparators @@ -461,9 +461,9 @@ class Range : public ObjectRef { // implementataions inline const Type& RelayExprNode::checked_type() const { - CHECK(checked_type_.defined()) << "internal error: the type checker has " - << "not populated the checked_type " - << "field for " << GetRef(this); + ICHECK(checked_type_.defined()) << "internal error: the type checker has " + << "not populated the checked_type " + << "field for " << GetRef(this); return this->checked_type_; } @@ -471,11 +471,11 @@ template inline const TTypeNode* RelayExprNode::type_as() const { static_assert(std::is_base_of::value, "TType must be a special case of type"); - CHECK(checked_type_.defined()) + ICHECK(checked_type_.defined()) << "Type inference for this Expr has not completed. Try to call infer_type pass."; const TTypeNode* node = checked_type_.as(); - CHECK(node != nullptr) << "Expected type to be " << TTypeNode::_type_key << ", but get " - << checked_type_->GetTypeKey(); + ICHECK(node != nullptr) << "Expected type to be " << TTypeNode::_type_key << ", but get " + << checked_type_->GetTypeKey(); return node; } @@ -522,7 +522,7 @@ struct PackedFuncValueConverter { } if (val.type_code() == kTVMArgInt) { int v = val.operator int(); - CHECK(v == 0 || v == 1) << "ValueError: boolean value can only be 0 or 1, but get " << v; + ICHECK(v == 0 || v == 1) << "ValueError: boolean value can only be 0 or 1, but get " << v; return Bool(static_cast(v)); } return val.AsObjectRef(); diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index b3f8438f6ec9..d6fb6a20b58a 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -300,7 +300,7 @@ class IRModule : public ObjectRef { /*! \return mutable pointers to the node. 
*/ IRModuleNode* operator->() const { auto* ptr = get_mutable(); - CHECK(ptr != nullptr); + ICHECK(ptr != nullptr); return static_cast(ptr); } diff --git a/include/tvm/ir/op.h b/include/tvm/ir/op.h index e7b35778d500..c73be3c1e564 100644 --- a/include/tvm/ir/op.h +++ b/include/tvm/ir/op.h @@ -146,7 +146,7 @@ class OpNode : public RelayExprNode { // Internal function to compute if it is primitive op bool IsPrimitiveOp_() const { const auto& fn_ty = this->op_type; - CHECK(fn_ty.get() != nullptr); + ICHECK(fn_ty.get() != nullptr); if (fn_ty->type_constraints.size() != 1) return false; const TypeRelationNode* rel = fn_ty->type_constraints[0].as(); if (rel == nullptr) return false; @@ -462,7 +462,7 @@ inline OpRegEntry& OpRegEntry::set_support_level(int32_t n) { // NOLINT(*) template inline OpRegEntry& OpRegEntry::set_attr( // NOLINT(*) const std::string& attr_name, const ValueType& value, int plevel) { - CHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0"; + ICHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0"; runtime::TVMRetValue rv; rv = value; UpdateAttr(attr_name, rv, plevel); @@ -473,7 +473,7 @@ inline OpRegEntry& OpRegEntry::set_attr( // NOLINT(*) template inline ValueType OpAttrMap::get(const RelayExpr& expr, ValueType def_value) const { - CHECK(expr.defined()); + ICHECK(expr.defined()); if (const OpNode* op = expr.as()) { return this->map_.get(GetRef(op), def_value); } else { diff --git a/include/tvm/ir/transform.h b/include/tvm/ir/transform.h index 2bbf28311b30..d2931123073b 100644 --- a/include/tvm/ir/transform.h +++ b/include/tvm/ir/transform.h @@ -166,7 +166,7 @@ class PassContext : public ObjectRef { * \return const access pointer. */ const PassContextNode* operator->() const { - CHECK(get() != nullptr); + ICHECK(get() != nullptr); return static_cast(get()); } /*! @@ -174,7 +174,7 @@ class PassContext : public ObjectRef { * \return mutable access pointer. */ PassContextNode* operator->() { - CHECK(get() != nullptr); + ICHECK(get() != nullptr); return static_cast(get_mutable()); } @@ -344,7 +344,7 @@ class Pass : public ObjectRef { */ IRModule operator()(IRModule mod) const { const PassNode* node = operator->(); - CHECK(node != nullptr); + ICHECK(node != nullptr); return node->operator()(std::move(mod)); } /*! @@ -357,7 +357,7 @@ class Pass : public ObjectRef { */ IRModule operator()(IRModule mod, const PassContext& pass_ctx) const { const PassNode* node = operator->(); - CHECK(node != nullptr); + ICHECK(node != nullptr); return node->operator()(std::move(mod), pass_ctx); } diff --git a/include/tvm/ir/type_functor.h b/include/tvm/ir/type_functor.h index 2a6314cf7644..11bf7d4740d0 100644 --- a/include/tvm/ir/type_functor.h +++ b/include/tvm/ir/type_functor.h @@ -71,7 +71,7 @@ class TypeFunctor { * \return The result of the call */ virtual R VisitType(const Type& n, Args... args) { - CHECK(n.defined()); + ICHECK(n.defined()); static FType vtable = InitVTable(); return vtable(n, this, std::forward(args)...); } diff --git a/include/tvm/node/attr_registry_map.h b/include/tvm/node/attr_registry_map.h index 9c554af9bc21..552aa7114657 100644 --- a/include/tvm/node/attr_registry_map.h +++ b/include/tvm/node/attr_registry_map.h @@ -56,9 +56,9 @@ class AttrRegistryMapContainerMap { * \return the const reference to the content value. 
*/ const runtime::TVMRetValue& operator[](const KeyType& key) const { - CHECK(key.defined()); + ICHECK(key.defined()); const uint32_t idx = key->AttrRegistryIndex(); - CHECK(idx < data_.size() && data_[idx].second != 0) + ICHECK(idx < data_.size() && data_[idx].second != 0) << "Attribute " << attr_name_ << " has not been registered for " << key->name; return data_[idx].first; } @@ -71,7 +71,7 @@ class AttrRegistryMapContainerMap { */ template ValueType get(const KeyType& key, ValueType def_value) const { - CHECK(key.defined()); + ICHECK(key.defined()); const uint32_t idx = key->AttrRegistryIndex(); if (idx < data_.size() && data_[idx].second != 0) { return data_[idx].first; diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h index 74dabc168924..209bb9e72f33 100644 --- a/include/tvm/node/container.h +++ b/include/tvm/node/container.h @@ -351,7 +351,7 @@ class SmallMapNode : public MapNode, */ const mapped_type& at(const key_type& key) const { iterator itr = find(key); - CHECK(itr.index < size_) << "IndexError: key is not in Map"; + ICHECK(itr.index < size_) << "IndexError: key is not in Map"; return itr->second; } /*! @@ -361,7 +361,7 @@ class SmallMapNode : public MapNode, */ mapped_type& at(const key_type& key) { iterator itr = find(key); - CHECK(itr.index < size_) << "IndexError: key is not in Map"; + ICHECK(itr.index < size_) << "IndexError: key is not in Map"; return itr->second; } /*! \return begin iterator */ @@ -466,7 +466,7 @@ class SmallMapNode : public MapNode, } uint64_t next_size = std::max(map_node->slots_ * 2, uint64_t(kInitSize)); next_size = std::min(next_size, uint64_t(kMaxSize)); - CHECK_GT(next_size, map_node->slots_); + ICHECK_GT(next_size, map_node->slots_); ObjectPtr new_map = CreateFromRange(next_size, map_node->begin(), map_node->end()); InsertMaybeReHash(kv, &new_map); *map = std::move(new_map); @@ -656,7 +656,7 @@ class DenseMapNode : public MapNode { */ mapped_type& At(const key_type& key) const { ListNode iter = Search(key); - CHECK(!iter.IsNone()) << "IndexError: key is not in Map"; + ICHECK(!iter.IsNone()) << "IndexError: key is not in Map"; return iter.Val(); } /*! 
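This patch sweeps CHECK* into ICHECK* across include/ and src/. Judging from the logging.h hunk later in the patch, the ICHECK* macros prefix failures with kTVM_INTERNAL_ERROR_MESSAGE, so the family appears intended for internal invariants whose violation signals a TVM bug rather than bad user input. A minimal sketch of the convention under that assumption (the function Demo and its invariants are illustrative only):

    #include <tvm/support/logging.h>

    void Demo(int n_slots) {
      // Internal invariants of the implementation: a failure here directs the
      // user to file a bug report instead of fixing their own input.
      ICHECK_GT(n_slots, 0) << "slot count must be positive";
      ICHECK_EQ(n_slots % 2, 0) << "slots are allocated in pairs";
    }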
@@ -823,7 +823,7 @@ class DenseMapNode : public MapNode { * \return The object created */ static ObjectPtr Empty(uint32_t fib_shift, uint64_t n_slots) { - CHECK_GT(n_slots, uint64_t(SmallMapNode::kMaxSize)); + ICHECK_GT(n_slots, uint64_t(SmallMapNode::kMaxSize)); ObjectPtr p = make_object(); uint64_t n_blocks = CalcNumBlocks(n_slots - 1); Block* block = p->data_ = new Block[n_blocks]; @@ -855,7 +855,7 @@ class DenseMapNode : public MapNode { for (int j = 0; j < kBlockCap; ++j, ++meta_ptr_from, ++data_ptr_from, ++meta_ptr_to, ++data_ptr_to) { uint8_t& meta = *meta_ptr_to = *meta_ptr_from; - CHECK(meta != kProtectedSlot); + ICHECK(meta != kProtectedSlot); if (meta != uint8_t(kEmptySlot)) { new (data_ptr_to) KVType(*data_ptr_from); } @@ -876,7 +876,7 @@ class DenseMapNode : public MapNode { iter.Val() = kv.second; return; } - CHECK_GT(map_node->slots_, uint64_t(SmallMapNode::kMaxSize)); + ICHECK_GT(map_node->slots_, uint64_t(SmallMapNode::kMaxSize)); // Otherwise, start rehash ObjectPtr p = Empty(map_node->fib_shift_ - 1, map_node->slots_ * 2 + 2); // Insert the given `kv` into the new hash map @@ -963,7 +963,7 @@ class DenseMapNode : public MapNode { shift -= 1; slots <<= 1; } - CHECK_GT(slots, cap); + ICHECK_GT(slots, cap); if (slots < cap * 2) { *fib_shift = shift - 1; *n_slots = slots << 1; diff --git a/include/tvm/node/functor.h b/include/tvm/node/functor.h index 0837f35bd715..9920500ffe98 100644 --- a/include/tvm/node/functor.h +++ b/include/tvm/node/functor.h @@ -92,8 +92,8 @@ class NodeFunctor { * \return The result. */ R operator()(const ObjectRef& n, Args... args) const { - CHECK(can_dispatch(n)) << "NodeFunctor calls un-registered function on type " - << n->GetTypeKey(); + ICHECK(can_dispatch(n)) << "NodeFunctor calls un-registered function on type " + << n->GetTypeKey(); return (*func_[n->type_index()])(n, std::forward(args)...); } /*! @@ -108,7 +108,7 @@ class NodeFunctor { if (func_.size() <= tindex) { func_.resize(tindex + 1, nullptr); } - CHECK(func_[tindex] == nullptr) << "Dispatch for " << TNode::_type_key << " is already set"; + ICHECK(func_[tindex] == nullptr) << "Dispatch for " << TNode::_type_key << " is already set"; func_[tindex] = f; return *this; } @@ -121,7 +121,7 @@ class NodeFunctor { template TSelf& clear_dispatch() { // NOLINT(*) uint32_t tindex = TNode::RuntimeTypeIndex(); - CHECK_LT(tindex, func_.size()) << "clear_dispatch: index out of range"; + ICHECK_LT(tindex, func_.size()) << "clear_dispatch: index out of range"; func_[tindex] = nullptr; return *this; } diff --git a/include/tvm/node/reflection.h b/include/tvm/node/reflection.h index e8ff26be42b3..d842c33cce03 100644 --- a/include/tvm/node/reflection.h +++ b/include/tvm/node/reflection.h @@ -208,7 +208,7 @@ class ReflectionVTable::Registry { * \return rference to self. */ Registry& set_creator(FCreate f) { // NOLINT(*) - CHECK_LT(type_index_, parent_->fcreate_.size()); + ICHECK_LT(type_index_, parent_->fcreate_.size()); parent_->fcreate_[type_index_] = f; return *this; } @@ -218,7 +218,7 @@ class ReflectionVTable::Registry { * \return rference to self. 
*/ Registry& set_repr_bytes(FReprBytes f) { // NOLINT(*) - CHECK_LT(type_index_, parent_->frepr_bytes_.size()); + ICHECK_LT(type_index_, parent_->frepr_bytes_.size()); parent_->frepr_bytes_[type_index_] = f; return *this; } diff --git a/include/tvm/parser/source_map.h b/include/tvm/parser/source_map.h index 1153deb95dc3..424af5c98cc8 100644 --- a/include/tvm/parser/source_map.h +++ b/include/tvm/parser/source_map.h @@ -108,7 +108,7 @@ class SourceMap : public ObjectRef { void Add(const Source& source); SourceMapNode* operator->() { - CHECK(get() != nullptr); + ICHECK(get() != nullptr); return static_cast(get_mutable()); } diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h index 76a6a221d065..e94bd2756e98 100644 --- a/include/tvm/relay/base.h +++ b/include/tvm/relay/base.h @@ -42,18 +42,18 @@ namespace tvm { */ namespace relay { -#define RELAY_DEBUG(...) \ - { \ - auto fdebug = runtime::Registry::Get("relay.debug"); \ - CHECK(fdebug) << "Could not find Relay Python debugger function."; \ - (*fdebug)("RELAY_DEBUG", __FILE__, __LINE__, __VA_ARGS__); \ +#define RELAY_DEBUG(...) \ + { \ + auto fdebug = runtime::Registry::Get("relay.debug"); \ + ICHECK(fdebug) << "Could not find Relay Python debugger function."; \ + (*fdebug)("RELAY_DEBUG", __FILE__, __LINE__, __VA_ARGS__); \ } -#define RELAY_DEBUG_INTERP(...) \ - { \ - auto fdebug = runtime::Registry::Get("relay.debug_interp"); \ - CHECK(fdebug) << "Could not find Relay Python debugger function."; \ - (*fdebug)("RELAY_DEBUG", __FILE__, __LINE__, __VA_ARGS__); \ +#define RELAY_DEBUG_INTERP(...) \ + { \ + auto fdebug = runtime::Registry::Get("relay.debug_interp"); \ + ICHECK(fdebug) << "Could not find Relay Python debugger function."; \ + (*fdebug)("RELAY_DEBUG", __FILE__, __LINE__, __VA_ARGS__); \ } /*! diff --git a/include/tvm/relay/dataflow_pattern_functor.h b/include/tvm/relay/dataflow_pattern_functor.h index 98c81c929409..364daac81cc8 100644 --- a/include/tvm/relay/dataflow_pattern_functor.h +++ b/include/tvm/relay/dataflow_pattern_functor.h @@ -76,7 +76,7 @@ class DFPatternFunctor { * \return The result of the call */ virtual R VisitDFPattern(const DFPattern& n, Args... args) { - CHECK(n.defined()); + ICHECK(n.defined()); static FType vtable = InitVTable(); return vtable(n, this, std::forward(args)...); } diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h index c3d2f724b736..df0940fa7482 100644 --- a/include/tvm/relay/expr_functor.h +++ b/include/tvm/relay/expr_functor.h @@ -87,7 +87,7 @@ class ExprFunctor { * \return The result of the call */ virtual R VisitExpr(const Expr& n, Args... args) { - CHECK(n.defined()); + ICHECK(n.defined()); static FType vtable = InitVTable(); return vtable(n, this, std::forward(args)...); } @@ -345,7 +345,7 @@ class ExprRewriter { * \return The result of the call */ virtual Expr Rewrite(const Expr& pre, const Expr& post) { - CHECK(pre.defined()); + ICHECK(pre.defined()); static FType vtable = InitVTable(); return vtable(pre, this, post); } diff --git a/include/tvm/relay/pattern_functor.h b/include/tvm/relay/pattern_functor.h index de3bafa49074..711d8323f158 100644 --- a/include/tvm/relay/pattern_functor.h +++ b/include/tvm/relay/pattern_functor.h @@ -89,7 +89,7 @@ class PatternFunctor { * \return The result of the call */ virtual R VisitPattern(const Pattern& n, Args... 
args) { - CHECK(n.defined()); + ICHECK(n.defined()); static FType vtable = InitVTable(); return vtable(n, this, std::forward(args)...); } diff --git a/include/tvm/runtime/container.h b/include/tvm/runtime/container.h index 7778c5d8424c..796ab7b113c1 100644 --- a/include/tvm/runtime/container.h +++ b/include/tvm/runtime/container.h @@ -146,7 +146,7 @@ class InplaceArrayBase { */ const ElemType& operator[](size_t idx) const { size_t size = Self()->GetSize(); - CHECK_LT(idx, size) << "Index " << idx << " out of bounds " << size << "\n"; + ICHECK_LT(idx, size) << "Index " << idx << " out of bounds " << size << "\n"; return *(reinterpret_cast(AddressOf(idx))); } @@ -157,7 +157,7 @@ class InplaceArrayBase { */ ElemType& operator[](size_t idx) { size_t size = Self()->GetSize(); - CHECK_LT(idx, size) << "Index " << idx << " out of bounds " << size << "\n"; + ICHECK_LT(idx, size) << "Index " << idx << " out of bounds " << size << "\n"; return *(reinterpret_cast(AddressOf(idx))); } @@ -361,7 +361,7 @@ class ArrayNode : public Object, public InplaceArrayBase { */ static ObjectPtr CopyFrom(int64_t cap, ArrayNode* from) { int64_t size = from->size_; - CHECK_GE(cap, size) << "ValueError: not enough capacity"; + ICHECK_GE(cap, size) << "ValueError: not enough capacity"; ObjectPtr p = ArrayNode::Empty(cap); ObjectRef* write = p->MutableBegin(); ObjectRef* read = from->MutableBegin(); @@ -380,7 +380,7 @@ class ArrayNode : public Object, public InplaceArrayBase { */ static ObjectPtr MoveFrom(int64_t cap, ArrayNode* from) { int64_t size = from->size_; - CHECK_GE(cap, size) << "ValueError: not enough capacity"; + ICHECK_GE(cap, size) << "ValueError: not enough capacity"; ObjectPtr p = ArrayNode::Empty(cap); ObjectRef* write = p->MutableBegin(); ObjectRef* read = from->MutableBegin(); @@ -429,7 +429,7 @@ class ArrayNode : public Object, public InplaceArrayBase { * \return Ref-counted ArrayNode requested */ static ObjectPtr Empty(int64_t n = kInitSize) { - CHECK_GE(n, 0); + ICHECK_GE(n, 0); ObjectPtr p = make_inplace_array_object(n); p->capacity_ = n; p->size_ = 0; @@ -679,9 +679,9 @@ class Array : public ObjectRef { */ const T operator[](int64_t i) const { ArrayNode* p = GetArrayNode(); - CHECK(p != nullptr) << "ValueError: cannot index a null array"; - CHECK(0 <= i && i < p->size_) << "IndexError: indexing " << i << " on an array of size " - << p->size_; + ICHECK(p != nullptr) << "ValueError: cannot index a null array"; + ICHECK(0 <= i && i < p->size_) + << "IndexError: indexing " << i << " on an array of size " << p->size_; return DowncastNoCheck(*(p->begin() + i)); } @@ -703,16 +703,16 @@ class Array : public ObjectRef { /*! \return The first element of the array */ const T front() const { ArrayNode* p = GetArrayNode(); - CHECK(p != nullptr) << "ValueError: cannot index a null array"; - CHECK_GT(p->size_, 0) << "IndexError: cannot index an empty array"; + ICHECK(p != nullptr) << "ValueError: cannot index a null array"; + ICHECK_GT(p->size_, 0) << "IndexError: cannot index an empty array"; return DowncastNoCheck(*(p->begin())); } /*! 
\return The last element of the array */ const T back() const { ArrayNode* p = GetArrayNode(); - CHECK(p != nullptr) << "ValueError: cannot index a null array"; - CHECK_GT(p->size_, 0) << "IndexError: cannot index an empty array"; + ICHECK(p != nullptr) << "ValueError: cannot index a null array"; + ICHECK_GT(p->size_, 0) << "IndexError: cannot index an empty array"; return DowncastNoCheck(*(p->end() - 1)); } @@ -734,7 +734,7 @@ class Array : public ObjectRef { * \param val The element to insert */ void insert(iterator position, const T& val) { - CHECK(data_ != nullptr) << "ValueError: cannot insert a null array"; + ICHECK(data_ != nullptr) << "ValueError: cannot insert a null array"; int64_t idx = std::distance(begin(), position); int64_t size = GetArrayNode()->size_; auto addr = CopyOnWrite(1) // @@ -755,7 +755,7 @@ class Array : public ObjectRef { if (first == last) { return; } - CHECK(data_ != nullptr) << "ValueError: cannot insert a null array"; + ICHECK(data_ != nullptr) << "ValueError: cannot insert a null array"; int64_t idx = std::distance(begin(), position); int64_t size = GetArrayNode()->size_; int64_t numel = std::distance(first, last); @@ -767,9 +767,9 @@ class Array : public ObjectRef { /*! \brief Remove the last item of the list */ void pop_back() { - CHECK(data_ != nullptr) << "ValueError: cannot pop_back because array is null"; + ICHECK(data_ != nullptr) << "ValueError: cannot pop_back because array is null"; int64_t size = GetArrayNode()->size_; - CHECK_GT(size, 0) << "ValueError: cannot pop_back because array is empty"; + ICHECK_GT(size, 0) << "ValueError: cannot pop_back because array is empty"; CopyOnWrite()->ShrinkBy(1); } @@ -778,11 +778,11 @@ class Array : public ObjectRef { * \param position An iterator pointing to the element to be erased */ void erase(iterator position) { - CHECK(data_ != nullptr) << "ValueError: cannot erase a null array"; + ICHECK(data_ != nullptr) << "ValueError: cannot erase a null array"; int64_t st = std::distance(begin(), position); int64_t size = GetArrayNode()->size_; - CHECK(0 <= st && st < size) << "ValueError: cannot erase at index " << st - << ", because Array size is " << size; + ICHECK(0 <= st && st < size) << "ValueError: cannot erase at index " << st + << ", because Array size is " << size; CopyOnWrite() // ->MoveElementsLeft(st, st + 1, size) // ->ShrinkBy(1); @@ -797,12 +797,12 @@ class Array : public ObjectRef { if (first == last) { return; } - CHECK(data_ != nullptr) << "ValueError: cannot erase a null array"; + ICHECK(data_ != nullptr) << "ValueError: cannot erase a null array"; int64_t size = GetArrayNode()->size_; int64_t st = std::distance(begin(), first); int64_t ed = std::distance(begin(), last); - CHECK_LT(st, ed) << "ValueError: cannot erase array in range [" << st << ", " << ed << ")"; - CHECK(0 <= st && st <= size && 0 <= ed && ed <= size) + ICHECK_LT(st, ed) << "ValueError: cannot erase array in range [" << st << ", " << ed << ")"; + ICHECK(0 <= st && st <= size && 0 <= ed && ed <= size) << "ValueError: cannot erase array in range [" << st << ", " << ed << ")" << ", because array size is " << size; CopyOnWrite() // @@ -815,7 +815,7 @@ class Array : public ObjectRef { * \param n The new size. 
*/ void resize(int64_t n) { - CHECK_GE(n, 0) << "ValueError: cannot resize an Array to negative size"; + ICHECK_GE(n, 0) << "ValueError: cannot resize an Array to negative size"; if (data_ == nullptr) { SwitchContainer(n); return; @@ -856,8 +856,8 @@ class Array : public ObjectRef { */ void Set(int64_t i, T value) { ArrayNode* p = this->CopyOnWrite(); - CHECK(0 <= i && i < p->size_) << "IndexError: indexing " << i << " on an array of size " - << p->size_; + ICHECK(0 <= i && i < p->size_) + << "IndexError: indexing " << i << " on an array of size " << p->size_; *(p->MutableBegin() + i) = std::move(value); } @@ -923,7 +923,7 @@ class Array : public ObjectRef { template void Assign(IterType first, IterType last) { int64_t cap = std::distance(first, last); - CHECK_GE(cap, 0) << "ValueError: cannot construct an Array of negative size"; + ICHECK_GE(cap, 0) << "ValueError: cannot construct an Array of negative size"; ArrayNode* p = GetArrayNode(); if (p != nullptr && data_.unique() && p->capacity_ >= cap) { // do not have to make new space @@ -1565,8 +1565,8 @@ struct NullOptType {}; * * Optional opt0 = nullptr; * Optional opt1 = String("xyz"); - * CHECK(opt0 == nullptr); - * CHECK(opt1 == "xyz"); + * ICHECK(opt0 == nullptr); + * ICHECK(opt1 == "xyz"); * * \endcode */ @@ -1613,7 +1613,7 @@ class Optional : public ObjectRef { * \note This function performs not-null checking. */ T value() const { - CHECK(data_ != nullptr); + ICHECK(data_ != nullptr); return T(data_); } /*! diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h index cb817a89ab81..25aadb598b28 100644 --- a/include/tvm/runtime/data_type.h +++ b/include/tvm/runtime/data_type.h @@ -24,8 +24,8 @@ #ifndef TVM_RUNTIME_DATA_TYPE_H_ #define TVM_RUNTIME_DATA_TYPE_H_ -#include #include +#include #include #include @@ -74,7 +74,7 @@ class DataType { data_.bits = static_cast(bits); data_.lanes = static_cast(lanes); if (code == kBFloat) { - CHECK_EQ(bits, 16); + ICHECK_EQ(bits, 16); } } /*! \return The type code. 
*/ @@ -212,7 +212,7 @@ inline int GetVectorBytes(DataType dtype) { dtype == DataType::Int(1)) { return 1; } - CHECK_EQ(data_bits % 8, 0U) << "Need to load/store by multiple of bytes"; + ICHECK_EQ(data_bits % 8, 0U) << "Need to load/store by multiple of bytes"; return data_bits / 8; } @@ -373,7 +373,7 @@ inline DLDataType String2DLDataType(std::string s) { if (*xdelim == 'x') { t.lanes = static_cast(strtoul(xdelim + 1, &endpt, 10)); } - CHECK(endpt == s.c_str() + s.length()) << "unknown type " << s; + ICHECK(endpt == s.c_str() + s.length()) << "unknown type " << s; return t; } diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 92b3857fbec8..0ff171d4821f 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -325,29 +325,29 @@ inline bool NDArray::IsContiguous() const { } inline void NDArray::CopyFrom(const DLTensor* other) { - CHECK(data_ != nullptr); + ICHECK(data_ != nullptr); CopyFromTo(other, &(get_mutable()->dl_tensor)); } inline void NDArray::CopyFrom(const NDArray& other) { - CHECK(data_ != nullptr); - CHECK(other.data_ != nullptr); + ICHECK(data_ != nullptr); + ICHECK(other.data_ != nullptr); CopyFromTo(&(other.get_mutable()->dl_tensor), &(get_mutable()->dl_tensor)); } inline void NDArray::CopyTo(DLTensor* other) const { - CHECK(data_ != nullptr); + ICHECK(data_ != nullptr); CopyFromTo(&(get_mutable()->dl_tensor), other); } inline void NDArray::CopyTo(const NDArray& other) const { - CHECK(data_ != nullptr); - CHECK(other.data_ != nullptr); + ICHECK(data_ != nullptr); + ICHECK(other.data_ != nullptr); CopyFromTo(&(get_mutable()->dl_tensor), &(other.get_mutable()->dl_tensor)); } inline NDArray NDArray::CopyTo(const DLContext& ctx) const { - CHECK(data_ != nullptr); + ICHECK(data_ != nullptr); const DLTensor* dptr = operator->(); NDArray ret = Empty(std::vector(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, ctx); @@ -422,7 +422,7 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { strm->Write(tensor->data, data_byte_size); } else { std::vector bytes(data_byte_size); - CHECK_EQ( + ICHECK_EQ( TVMArrayCopyToBytes(const_cast(tensor), dmlc::BeginPtr(bytes), data_byte_size), 0) << TVMGetLastError(); @@ -438,19 +438,19 @@ inline void NDArray::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operato inline bool NDArray::Load(dmlc::Stream* strm) { uint64_t header, reserved; - CHECK(strm->Read(&header)) << "Invalid DLTensor file format"; - CHECK(strm->Read(&reserved)) << "Invalid DLTensor file format"; - CHECK(header == kTVMNDArrayMagic) << "Invalid DLTensor file format"; + ICHECK(strm->Read(&header)) << "Invalid DLTensor file format"; + ICHECK(strm->Read(&reserved)) << "Invalid DLTensor file format"; + ICHECK(header == kTVMNDArrayMagic) << "Invalid DLTensor file format"; DLContext ctx; int ndim; DLDataType dtype; - CHECK(strm->Read(&ctx)) << "Invalid DLTensor file format"; - CHECK(strm->Read(&ndim)) << "Invalid DLTensor file format"; - CHECK(strm->Read(&dtype)) << "Invalid DLTensor file format"; - CHECK_EQ(ctx.device_type, kDLCPU) << "Invalid DLTensor context: can only save as CPU tensor"; + ICHECK(strm->Read(&ctx)) << "Invalid DLTensor file format"; + ICHECK(strm->Read(&ndim)) << "Invalid DLTensor file format"; + ICHECK(strm->Read(&dtype)) << "Invalid DLTensor file format"; + ICHECK_EQ(ctx.device_type, kDLCPU) << "Invalid DLTensor context: can only save as CPU tensor"; std::vector shape(ndim); if (ndim != 0) { - CHECK(strm->ReadArray(&shape[0], ndim)) << "Invalid DLTensor file format"; + 
ICHECK(strm->ReadArray(&shape[0], ndim)) << "Invalid DLTensor file format"; } NDArray ret = NDArray::Empty(shape, dtype, ctx); int64_t num_elems = 1; @@ -459,12 +459,12 @@ inline bool NDArray::Load(dmlc::Stream* strm) { num_elems *= ret->shape[i]; } int64_t data_byte_size; - CHECK(strm->Read(&data_byte_size)) << "Invalid DLTensor file format"; - CHECK(data_byte_size == num_elems * elem_bytes) << "Invalid DLTensor file format"; + ICHECK(strm->Read(&data_byte_size)) << "Invalid DLTensor file format"; + ICHECK(data_byte_size == num_elems * elem_bytes) << "Invalid DLTensor file format"; auto read_ret = strm->Read(ret->data, data_byte_size); // Only check non-empty data if (ndim > 0 && shape[0] != 0) { - CHECK(read_ret) << "Invalid DLTensor file format"; + ICHECK(read_ret) << "Invalid DLTensor file format"; } if (!DMLC_IO_NO_ENDIAN_SWAP) { dmlc::ByteSwap(ret->data, elem_bytes, num_elems); diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 2305f12e5533..43038998639e 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -195,7 +195,7 @@ class TypedPackedFunc { * // construct from packed function * TypedPackedFunc ftyped(packed); * // call the typed version. - * CHECK_EQ(ftyped(1), 2); + * ICHECK_EQ(ftyped(1), 2); * \endcode * * \param packed The packed function @@ -225,7 +225,7 @@ class TypedPackedFunc { * // construct from packed function * TypedPackedFunc ftyped(typed_lambda); * // call the typed version. - * CHECK_EQ(ftyped(1), 2); + * ICHECK_EQ(ftyped(1), 2); * \endcode * * \param typed_lambda typed lambda function. @@ -246,7 +246,7 @@ class TypedPackedFunc { * TypedPackedFunc ftyped; * ftyped = [](int x) { return x + 1; } * // call the typed version. - * CHECK_EQ(ftyped(1), 2); + * ICHECK_EQ(ftyped(1), 2); * \endcode * * \param typed_lambda typed lambda function. @@ -337,7 +337,7 @@ inline const char* ArgTypeCode2Str(int type_code); // macro to check type code. #define TVM_CHECK_TYPE_CODE(CODE, T) \ - CHECK_EQ(CODE, T) << " expected " << ArgTypeCode2Str(T) << " but get " << ArgTypeCode2Str(CODE) + ICHECK_EQ(CODE, T) << " expected " << ArgTypeCode2Str(T) << " but get " << ArgTypeCode2Str(CODE) /*! * \brief Type traits for runtime type check during FFI conversion. @@ -382,8 +382,8 @@ class TVMPODValue_ { } operator int() const { TVM_CHECK_TYPE_CODE(type_code_, kDLInt); - CHECK_LE(value_.v_int64, std::numeric_limits::max()); - CHECK_GE(value_.v_int64, std::numeric_limits::min()); + ICHECK_LE(value_.v_int64, std::numeric_limits::max()); + ICHECK_GE(value_.v_int64, std::numeric_limits::min()); return static_cast(value_.v_int64); } operator bool() const { @@ -491,7 +491,7 @@ class TVMArgValue : public TVMPODValue_ { } else if (type_code_ == kTVMStr) { return std::string(value_.v_str); } else { - CHECK(IsObjectRef()); + ICHECK(IsObjectRef()); return AsObjectRef().operator std::string(); } } @@ -719,7 +719,7 @@ class TVMRetValue : public TVMPODValue_ { */ void MoveToCHost(TVMValue* ret_value, int* ret_type_code) { // cannot move str; need specially handle. - CHECK(type_code_ != kTVMStr && type_code_ != kTVMBytes); + ICHECK(type_code_ != kTVMStr && type_code_ != kTVMBytes); *ret_value = value_; *ret_type_code = type_code_; type_code_ = kTVMNullptr; @@ -733,7 +733,7 @@ class TVMRetValue : public TVMPODValue_ { */ static TVMRetValue MoveFromCHost(TVMValue value, int type_code) { // Can move POD and everything under the object system. 
- CHECK(type_code <= kTVMPackedFuncHandle || type_code == kTVMNDArrayHandle); + ICHECK(type_code <= kTVMPackedFuncHandle || type_code == kTVMNDArrayHandle); TVMRetValue ret; ret.value_ = value; ret.type_code_ = type_code; @@ -741,8 +741,8 @@ class TVMRetValue : public TVMPODValue_ { } /*! \return The value field, if the data is POD */ const TVMValue& value() const { - CHECK(type_code_ != kTVMObjectHandle && type_code_ != kTVMPackedFuncHandle && - type_code_ != kTVMModuleHandle && type_code_ != kTVMStr) + ICHECK(type_code_ != kTVMObjectHandle && type_code_ != kTVMPackedFuncHandle && + type_code_ != kTVMModuleHandle && type_code_ != kTVMStr) << "TVMRetValue.value can only be used for POD data"; return value_; } @@ -966,8 +966,8 @@ struct PackedFuncValueConverter { } inline TVMArgValue TVMArgs::operator[](int i) const { - CHECK_LT(i, num_args) << "not enough argument passed, " << num_args << " passed" - << " but request arg[" << i << "]."; + ICHECK_LT(i, num_args) << "not enough argument passed, " << num_args << " passed" + << " but request arg[" << i << "]."; return TVMArgValue(values[i], type_codes[i]); } @@ -1090,7 +1090,7 @@ class TVMArgsSetter { } TVM_ALWAYS_INLINE void operator()(size_t i, uint64_t value) const { values_[i].v_int64 = static_cast(value); - CHECK_LE(value, static_cast(std::numeric_limits::max())); + ICHECK_LE(value, static_cast(std::numeric_limits::max())); type_codes_[i] = kDLInt; } TVM_ALWAYS_INLINE void operator()(size_t i, double value) const { @@ -1155,7 +1155,7 @@ class TVMArgsSetter { values_[i].v_str = value.ptr()->c_str(); type_codes_[i] = kTVMStr; } else { - CHECK_NE(value.type_code(), kTVMBytes) << "not handled."; + ICHECK_NE(value.type_code(), kTVMBytes) << "not handled."; values_[i] = value.value_; type_codes_[i] = value.type_code(); } @@ -1234,7 +1234,7 @@ struct unpack_call_dispatcher { template TVM_ALWAYS_INLINE void unpack_call(const F& f, const TVMArgs& args, TVMRetValue* rv) { - CHECK_EQ(nargs, args.size()) << "Expect " << nargs << " arguments but get " << args.size(); + ICHECK_EQ(nargs, args.size()) << "Expect " << nargs << " arguments but get " << args.size(); unpack_call_dispatcher::run(f, args, rv); } @@ -1363,7 +1363,7 @@ inline TObjectRef TVMPODValue_::AsObjectRef() const { using ContainerType = typename TObjectRef::ContainerType; if (type_code_ == kTVMNullptr) { - CHECK(TObjectRef::_type_is_nullable) + ICHECK(TObjectRef::_type_is_nullable) << "Expect a not null value of " << ContainerType::_type_key; return TObjectRef(ObjectPtr(nullptr)); } @@ -1373,7 +1373,7 @@ inline TObjectRef TVMPODValue_::AsObjectRef() const { TVM_CHECK_TYPE_CODE(type_code_, kTVMNDArrayHandle); ObjectPtr data = NDArray::FFIDataFromHandle(static_cast(value_.v_handle)); - CHECK(data->IsInstance()) + ICHECK(data->IsInstance()) << "Expect " << ContainerType::_type_key << " but get " << data->GetTypeKey(); return TObjectRef(data); } @@ -1381,20 +1381,20 @@ inline TObjectRef TVMPODValue_::AsObjectRef() const { // Casting to a sub-class of Module TVM_CHECK_TYPE_CODE(type_code_, kTVMModuleHandle); ObjectPtr data = GetObjectPtr(static_cast(value_.v_handle)); - CHECK(data->IsInstance()) + ICHECK(data->IsInstance()) << "Expect " << ContainerType::_type_key << " but get " << data->GetTypeKey(); return TObjectRef(data); } if (type_code_ == kTVMObjectHandle) { // normal object type check. 
Object* ptr = static_cast(value_.v_handle); - CHECK(ObjectTypeChecker::Check(ptr)) + ICHECK(ObjectTypeChecker::Check(ptr)) << "Expect " << ObjectTypeChecker::TypeName() << " but get " << ptr->GetTypeKey(); return TObjectRef(GetObjectPtr(ptr)); } else if (type_code_ == kTVMObjectRValueRefArg) { Object* ptr = *static_cast(value_.v_handle); - CHECK(ObjectTypeChecker::Check(ptr)) + ICHECK(ObjectTypeChecker::Check(ptr)) << "Expect " << ObjectTypeChecker::TypeName() << " but get " << ptr->GetTypeKey(); return TObjectRef(GetObjectPtr(ptr)); diff --git a/include/tvm/runtime/vm/bytecode.h b/include/tvm/runtime/vm/bytecode.h index edcbd881e074..e858c4458054 100644 --- a/include/tvm/runtime/vm/bytecode.h +++ b/include/tvm/runtime/vm/bytecode.h @@ -25,6 +25,7 @@ #define TVM_RUNTIME_VM_BYTECODE_H_ #include +#include #include #include diff --git a/include/tvm/support/logging.h b/include/tvm/support/logging.h index 4322435c06b0..d98363ea1c1b 100644 --- a/include/tvm/support/logging.h +++ b/include/tvm/support/logging.h @@ -125,13 +125,13 @@ constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE = #define ICHECK_BINARY_OP(name, op, x, y) \ if (dmlc::LogCheckError _check_err = dmlc::LogCheck##name(x, y)) \ dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ + << tvm::kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ << ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *(_check_err.str) << ": " #define ICHECK(x) \ if (!(x)) \ dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT << "Check failed: " #x << " == false: " + << tvm::kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT << "Check failed: " #x << " == false: " #define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y) #define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y) @@ -139,10 +139,10 @@ constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE = #define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y) #define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y) #define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y) -#define ICHECK_NOTNULL(x) \ - ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << kTVM_INTERNAL_ERROR_MESSAGE << __INDENT << "Check not null: " #x \ - << ' ', \ +#define ICHECK_NOTNULL(x) \ + ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ + << tvm::kTVM_INTERNAL_ERROR_MESSAGE << __INDENT << "Check not null: " #x \ + << ' ', \ (x) : (x)) // NOLINT(*) /*! \brief The diagnostic level, controls the printing of the message. 
*/ diff --git a/include/tvm/target/target_kind.h b/include/tvm/target/target_kind.h index dd14602fa6fc..c9ef736f7aee 100644 --- a/include/tvm/target/target_kind.h +++ b/include/tvm/target/target_kind.h @@ -295,7 +295,7 @@ inline TargetKindAttrMap TargetKind::GetAttrMap(const String& attr_na template inline TargetKindRegEntry& TargetKindRegEntry::set_attr(const String& attr_name, const ValueType& value, int plevel) { - CHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0"; + ICHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0"; runtime::TVMRetValue rv; rv = value; UpdateAttr(attr_name, rv, plevel); @@ -321,7 +321,7 @@ inline TargetKindRegEntry& TargetKindRegEntry::set_attrs_preprocessor(FLambda f) template inline TargetKindRegEntry& TargetKindRegEntry::add_attr_option(const String& key) { - CHECK(!kind_->key2vtype_.count(key)) + ICHECK(!kind_->key2vtype_.count(key)) << "AttributeError: add_attr_option failed because '" << key << "' has been set once"; kind_->key2vtype_[key] = detail::ValueTypeInfoMaker()(); return *this; diff --git a/include/tvm/tir/data_layout.h b/include/tvm/tir/data_layout.h index ee93a0675470..73da05c549e2 100644 --- a/include/tvm/tir/data_layout.h +++ b/include/tvm/tir/data_layout.h @@ -255,9 +255,9 @@ class Layout : public ObjectRef { } const LayoutAxis& operator[](int32_t i) const { - CHECK(defined()) << "Try to access axis from an undefined layout."; + ICHECK(defined()) << "Try to access axis from an undefined layout."; int32_t index = i < 0 ? static_cast(ndim() + i) : i; - CHECK(index >= 0 && static_cast(index) < ndim()) << "Invalid index " << i; + ICHECK(index >= 0 && static_cast(index) < ndim()) << "Invalid index " << i; const tir::IterVar axis = operator->()->axes[index]; return LayoutAxis::Get(axis); } diff --git a/include/tvm/tir/expr_functor.h b/include/tvm/tir/expr_functor.h index 3f73d21bb625..b5f1d64a00c4 100644 --- a/include/tvm/tir/expr_functor.h +++ b/include/tvm/tir/expr_functor.h @@ -58,7 +58,7 @@ namespace tir { * }; * MyExprFunctor f; * Var x("x"); - * CHECK_EQ(f(x + 1, 2), 3); + * ICHECK_EQ(f(x + 1, 2), 3); * \endcode * * \note Why do we need this more powerful Functor: diff --git a/include/tvm/topi/broadcast.h b/include/tvm/topi/broadcast.h index d03ddc93b4c0..f4f4f2ccb917 100644 --- a/include/tvm/topi/broadcast.h +++ b/include/tvm/topi/broadcast.h @@ -49,17 +49,17 @@ inline tvm::te::Tensor broadcast_to(const tvm::te::Tensor& t, const tvm::Array& output_shape, std::string name = "T_broadcast_to", std::string tag = kBroadcast) { - CHECK_GE(output_shape.size(), t->shape.size()) + ICHECK_GE(output_shape.size(), t->shape.size()) << "Not a broadcast, output dimensionality smaller than input.\noutput: " << output_shape << "\nvs\ninput: " << t; auto bh = detail::BroadcastShape(output_shape, t->shape); - CHECK_EQ(output_shape.size(), bh.common_shape.size()); + ICHECK_EQ(output_shape.size(), bh.common_shape.size()); Array oshape; for (size_t i = 0; i < output_shape.size(); ++i) { if (output_shape[i].as() == nullptr) { oshape.push_back(output_shape[i]); } else { - CHECK(topi::detail::EqualCheck(output_shape[i], bh.common_shape[i])); + ICHECK(topi::detail::EqualCheck(output_shape[i], bh.common_shape[i])); oshape.push_back(bh.common_shape[i]); } } diff --git a/include/tvm/topi/cuda/dense.h b/include/tvm/topi/cuda/dense.h index 447486d2fe0d..7fd3107b6c32 100644 --- a/include/tvm/topi/cuda/dense.h +++ b/include/tvm/topi/cuda/dense.h @@ -53,10 +53,10 @@ namespace cuda { inline tvm::te::Tensor dense_cuda(const Target& target, const 
tvm::te::Tensor& data, const tvm::te::Tensor& weight, const tvm::te::Tensor& bias, const DataType& out_dtype) { - CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; - CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; + ICHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; + ICHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; if (bias.defined()) { - CHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias"; + ICHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias"; } auto batch = data->shape[0]; @@ -64,7 +64,7 @@ inline tvm::te::Tensor dense_cuda(const Target& target, const tvm::te::Tensor& d auto out_dim = weight->shape[0]; if (target->GetLibs().count("cublas")) { - CHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported."; + ICHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported."; auto mm = topi::contrib::cublas_matmul(data, weight, false, true); if (bias.defined()) { mm = tvm::te::compute( diff --git a/include/tvm/topi/cuda/reduction.h b/include/tvm/topi/cuda/reduction.h index acfcc76b9ade..7160419422a6 100644 --- a/include/tvm/topi/cuda/reduction.h +++ b/include/tvm/topi/cuda/reduction.h @@ -60,7 +60,7 @@ Schedule ScheduleReduce(const Target& target, Operation op, Schedule sch, } auto out_stage = sch[data_out]; - CHECK_GT(out_stage->op.as()->reduce_axis.size(), 0) + ICHECK_GT(out_stage->op.as()->reduce_axis.size(), 0) << "reduce_axis must be greater than zero"; bool all_reduce; @@ -183,7 +183,7 @@ void TraverseAfterReduce(const Target& target, Schedule s, Operation op) { * \return A schedule for the given ops. */ Schedule schedule_reduce(const Target& target, Array outs) { - CHECK_EQ(outs.size(), 1) << "outs must have size 1"; + ICHECK_EQ(outs.size(), 1) << "outs must have size 1"; Array out_ops; for (auto t : outs) { out_ops.push_back(t->op); diff --git a/include/tvm/topi/detail/broadcast.h b/include/tvm/topi/detail/broadcast.h index e719348ecf77..5c701825840c 100644 --- a/include/tvm/topi/detail/broadcast.h +++ b/include/tvm/topi/detail/broadcast.h @@ -59,7 +59,7 @@ inline BroadcastHelper BroadcastShape(const tvm::Array& shape1, bh.vars1.push_front(bh.all_vars[0]); bh.vars2.push_front(bh.all_vars[0]); } else if (topi::detail::EqualCheck(one, shape1[s1_size - i])) { - CHECK(!topi::detail::EqualCheck(one, shape2[s2_size - i])); + ICHECK(!topi::detail::EqualCheck(one, shape2[s2_size - i])); bh.common_shape.push_front(shape2[s2_size - i]); bh.vars2.push_front(bh.all_vars[0]); } else if (topi::detail::EqualCheck(one, shape2[s2_size - i])) { @@ -78,10 +78,10 @@ inline BroadcastHelper BroadcastShape(const tvm::Array& shape1, bh.vars1.push_front(bh.all_vars[0]); bh.vars2.push_front(bh.all_vars[0]); } else { - CHECK(false) << "Incompatible broadcast dims: " << shape1[s1_size - i] << " and " - << shape2[s2_size - i] - << " in: " << tvm::Array(shape1.begin(), shape1.end()) << " and " - << tvm::Array(shape2.begin(), shape2.end()); + ICHECK(false) << "Incompatible broadcast dims: " << shape1[s1_size - i] << " and " + << shape2[s2_size - i] + << " in: " << tvm::Array(shape1.begin(), shape1.end()) << " and " + << tvm::Array(shape2.begin(), shape2.end()); } } // Remaining dimensions whether on shape1 or shape2 can always be completed @@ -100,7 +100,7 @@ inline tvm::Array InputIndexFromBroadcast( const tvm::Array& ovars, const tvm::te::Tensor& T, const std::deque& my_vars, const std::deque& all_vars) { tvm::Array ivars; - CHECK_EQ(ovars.size(), all_vars.size()); + ICHECK_EQ(ovars.size(), all_vars.size()); // N^2, 
could use a map but NBD. size_t expected_dims = T->shape.size(); for (size_t i = 0; i < ovars.size(); ++i) { @@ -118,7 +118,7 @@ inline tvm::Array InputIndexFromBroadcast( ivars.push_back(tvm::tir::make_zero(ovars[i].dtype())); } } - CHECK(expected_dims == ivars.size()); + ICHECK(expected_dims == ivars.size()); return ivars; } diff --git a/include/tvm/topi/detail/constant_utils.h b/include/tvm/topi/detail/constant_utils.h index 201a0da94278..412c79330ca9 100644 --- a/include/tvm/topi/detail/constant_utils.h +++ b/include/tvm/topi/detail/constant_utils.h @@ -76,7 +76,7 @@ inline std::vector GetConstIntValues(Array exprs, const std::stri std::vector result; if (!exprs.defined()) return result; for (auto expr : exprs) { - CHECK(IsConstInt(expr)) << "All elements of " << var_name << " must be constant integers"; + ICHECK(IsConstInt(expr)) << "All elements of " << var_name << " must be constant integers"; result.push_back(GetConstInt(expr)); } return result; @@ -96,7 +96,7 @@ inline std::vector GetConstInt64Values(Array exprs, std::vector result; if (!exprs.defined()) return result; for (auto expr : exprs) { - CHECK(IsConstInt(expr)) << "All elements of " << var_name << " must be constant integers"; + ICHECK(IsConstInt(expr)) << "All elements of " << var_name << " must be constant integers"; result.push_back(GetConstInt(expr)); } return result; diff --git a/include/tvm/topi/detail/extern.h b/include/tvm/topi/detail/extern.h index 48c3e18aa58e..caca1e85e520 100644 --- a/include/tvm/topi/detail/extern.h +++ b/include/tvm/topi/detail/extern.h @@ -79,7 +79,7 @@ inline Array make_extern(const Array >& out_shapes, const std::vector& out_types, const Array& inputs, FExtern fextern, std::string name, std::string tag, ::tvm::Map attrs) { - CHECK_EQ(out_shapes.size(), out_types.size()) + ICHECK_EQ(out_shapes.size(), out_types.size()) << "make_extern: out_shapes and out_types must have equal size"; Array input_placeholders; @@ -112,7 +112,7 @@ inline Array make_extern(const Array >& out_shapes, * \return An expression representing the pack operation */ inline PrimExpr pack_buffer(Buffer buf) { - CHECK_GT(buf->shape.size(), 0) << "buf shape must have at least one element"; + ICHECK_GT(buf->shape.size(), 0) << "buf shape must have at least one element"; auto shape = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_stack_make_shape(), buf->shape); PrimExpr strides; diff --git a/include/tvm/topi/detail/ravel_unravel.h b/include/tvm/topi/detail/ravel_unravel.h index fc775093e632..dd7bcac09a04 100644 --- a/include/tvm/topi/detail/ravel_unravel.h +++ b/include/tvm/topi/detail/ravel_unravel.h @@ -43,8 +43,8 @@ using namespace tvm::te; * \return The index after flattening */ inline PrimExpr RavelIndex(Array indices, Array shape) { - CHECK_EQ(indices.size(), shape.size()) << "indices and shape must have equal size"; - CHECK_GT(indices.size(), 0) << "indices must not be empty"; + ICHECK_EQ(indices.size(), shape.size()) << "indices and shape must have equal size"; + ICHECK_GT(indices.size(), 0) << "indices must not be empty"; PrimExpr idx; for (size_t i = 0; i < indices.size(); ++i) { if (i == 0) { diff --git a/include/tvm/topi/elemwise.h b/include/tvm/topi/elemwise.h index f537c9c865df..cad72cb591f8 100644 --- a/include/tvm/topi/elemwise.h +++ b/include/tvm/topi/elemwise.h @@ -327,7 +327,7 @@ inline Tensor reinterpret(const Tensor& x, DataType type, std::string name = "te */ inline Tensor elemwise_sum(const Array& xs, std::string name = "T_elemwise_sum", std::string tag = kElementWise) { - 
CHECK_GT(xs.size(), 0) << "elemwise sum must have at least one input tensor."; + ICHECK_GT(xs.size(), 0) << "elemwise sum must have at least one input tensor."; return compute( xs[0]->shape, [&](const Array& i) { diff --git a/include/tvm/topi/nn.h b/include/tvm/topi/nn.h index d257d3cbb863..ba1be3424fcc 100644 --- a/include/tvm/topi/nn.h +++ b/include/tvm/topi/nn.h @@ -98,8 +98,8 @@ inline tvm::te::Tensor leaky_relu(const tvm::te::Tensor& t, double alpha = 0.1, inline tvm::te::Tensor prelu(const tvm::te::Tensor& x, const tvm::te::Tensor& slope, const int axis = 1, std::string name = "T_prelu", std::string tag = kBroadcast) { - CHECK((size_t)axis < x->shape.size()) << "Wrong axis (" << axis << ")value. "; - CHECK(topi::detail::GetConstInt(slope->shape[0]) == topi::detail::GetConstInt(x->shape[axis])) + ICHECK((size_t)axis < x->shape.size()) << "Wrong axis (" << axis << ")value. "; + ICHECK(topi::detail::GetConstInt(slope->shape[0]) == topi::detail::GetConstInt(x->shape[axis])) << "Wrong slope shape received."; return tvm::te::compute( @@ -162,8 +162,8 @@ inline tvm::te::Tensor pad(const tvm::te::Tensor& t, const tvm::Array pad_before_int32; tvm::Array pad_after_int32; @@ -262,8 +262,8 @@ inline tvm::te::Tensor conv2d_nchw(const tvm::te::Tensor& I, const tvm::te::Tens int pad_h = 0, int pad_w = 0, int stride_h = 1, int stride_w = 1, std::string name = "T_conv2d_nchw", std::string tag = kConv2dNCHW) { - CHECK_EQ(4, I->shape.size()); - CHECK_EQ(4, W->shape.size()); + ICHECK_EQ(4, I->shape.size()); + ICHECK_EQ(4, W->shape.size()); auto pH = I->shape[2]; auto pW = I->shape[3]; tvm::Array output_shape{ @@ -306,8 +306,8 @@ inline tvm::te::Tensor conv2d_hwcn(const tvm::te::Tensor& I, const tvm::te::Tens int pad_h = 0, int pad_w = 0, int stride_h = 1, int stride_w = 1, std::string name = "T_conv2d_hwcn", std::string tag = kConv2dHWCN) { - CHECK_EQ(4, I->shape.size()); - CHECK_EQ(4, W->shape.size()); + ICHECK_EQ(4, I->shape.size()); + ICHECK_EQ(4, W->shape.size()); auto pH = I->shape[2]; auto pW = I->shape[3]; tvm::Array output_shape{ @@ -351,8 +351,8 @@ inline tvm::te::Tensor depthwise_conv2d_nchw(const tvm::te::Tensor& I, const tvm int stride_w = 1, std::string name = "T_depthwise_conv2d_nchw", std::string tag = kDepthwiseConv2dNCHW) { - CHECK_EQ(4, I->shape.size()); - CHECK_EQ(4, W->shape.size()); + ICHECK_EQ(4, I->shape.size()); + ICHECK_EQ(4, W->shape.size()); auto pH = I->shape[2]; auto pW = I->shape[3]; auto pCM = W->shape[1]; // channel_multiplier @@ -380,8 +380,8 @@ inline tvm::te::Tensor depthwise_conv2d_nhwc(const tvm::te::Tensor& I, const tvm int stride_w = 1, std::string name = "T_depthwise_conv2d_nhwc", std::string tag = kDepthwiseConv2dNHWC) { - CHECK_EQ(4, I->shape.size()); - CHECK_EQ(4, W->shape.size()); + ICHECK_EQ(4, I->shape.size()); + ICHECK_EQ(4, W->shape.size()); auto pH = I->shape[1]; auto pW = I->shape[2]; auto pCM = W->shape[1]; // channel_multiplier @@ -429,8 +429,8 @@ inline tvm::te::Tensor group_conv2d_ngchw(const tvm::te::Tensor& I, const tvm::t int stride_w = 1, std::string name = "T_group_conv2d_ngchw", std::string tag = kGroupConv2d) { - CHECK_EQ(5, I->shape.size()); - CHECK_EQ(5, W->shape.size()); + ICHECK_EQ(5, I->shape.size()); + ICHECK_EQ(5, W->shape.size()); auto pH = I->shape[2]; auto pW = I->shape[3]; tvm::Array output_shape{ diff --git a/include/tvm/topi/nn/bnn.h b/include/tvm/topi/nn/bnn.h index f72950861b8a..815b8a23c998 100644 --- a/include/tvm/topi/nn/bnn.h +++ b/include/tvm/topi/nn/bnn.h @@ -52,7 +52,7 @@ inline tvm::te::Tensor binarize_pack(const 
tvm::te::Tensor& data, int axis, std::string name = "PackedInput", std::string tag = "binarize_pack") { auto ishape = data->shape; - CHECK_EQ(GetConstInt(ishape[axis]) % 32, 0) + ICHECK_EQ(GetConstInt(ishape[axis]) % 32, 0) << "binarize_pack: axis size must be a multiple of 32"; arith::Analyzer analyzer; @@ -99,10 +99,10 @@ inline tvm::te::Tensor binarize_pack(const tvm::te::Tensor& data, int axis, * \return Tensor with shape [batch, out_dim], dtype is float32 */ inline tvm::te::Tensor binary_dense(const tvm::te::Tensor& data, const tvm::te::Tensor& weight) { - CHECK_EQ(data->shape.size(), 2) << "binary_dense requires 2-D data"; - CHECK_EQ(weight->shape.size(), 2) << "binary_dense requires 2-D weight"; - CHECK_EQ(data->dtype, DataType::UInt(32)) << "binary_dense requires uint32 data"; - CHECK_EQ(weight->dtype, DataType::UInt(32)) << "binary_dense requires uint32 weight"; + ICHECK_EQ(data->shape.size(), 2) << "binary_dense requires 2-D data"; + ICHECK_EQ(weight->shape.size(), 2) << "binary_dense requires 2-D weight"; + ICHECK_EQ(data->dtype, DataType::UInt(32)) << "binary_dense requires uint32 data"; + ICHECK_EQ(weight->dtype, DataType::UInt(32)) << "binary_dense requires uint32 weight"; auto batch = data->shape[0]; auto in_dim = data->shape[1]; diff --git a/include/tvm/topi/nn/dense.h b/include/tvm/topi/nn/dense.h index ad18cb063f10..113002dc2d88 100644 --- a/include/tvm/topi/nn/dense.h +++ b/include/tvm/topi/nn/dense.h @@ -47,10 +47,10 @@ using namespace tvm::te; */ inline tvm::te::Tensor dense(const tvm::te::Tensor& data, const tvm::te::Tensor& weight, const tvm::te::Tensor& bias, const DataType& out_dtype) { - CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; - CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; + ICHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; + ICHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; if (bias.defined()) { - CHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias"; + ICHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias"; } auto batch = data->shape[0]; diff --git a/include/tvm/topi/nn/dilate.h b/include/tvm/topi/nn/dilate.h index 9b5a8047740e..3369316e4d7f 100644 --- a/include/tvm/topi/nn/dilate.h +++ b/include/tvm/topi/nn/dilate.h @@ -45,7 +45,7 @@ using namespace tvm::te; * \return The logical conjunction expression */ PrimExpr all(Array args) { - CHECK_GT(args.size(), 0) << "all requires at least one argument"; + ICHECK_GT(args.size(), 0) << "all requires at least one argument"; PrimExpr ret = args[0]; for (size_t i = 1; i < args.size(); ++i) { @@ -70,8 +70,8 @@ PrimExpr all(Array args) { inline Tensor dilate(const Tensor& x, Array strides, double dilation_value, std::string name = "tensor", std::string tag = kInjective) { auto n = x->shape.size(); - CHECK_EQ(n, strides.size()) << "strides size (" << strides.size() - << ") must match dimension of x (" << n << ")"; + ICHECK_EQ(n, strides.size()) << "strides size (" << strides.size() + << ") must match dimension of x (" << n << ")"; Array out_shape; arith::Analyzer analyzer; diff --git a/include/tvm/topi/nn/local_response_norm.h b/include/tvm/topi/nn/local_response_norm.h index 0170c503d9ff..717adb8ff8fa 100644 --- a/include/tvm/topi/nn/local_response_norm.h +++ b/include/tvm/topi/nn/local_response_norm.h @@ -52,9 +52,9 @@ using namespace tvm::te; inline Tensor lrn(const Tensor& data, int size, int axis = 1, float alpha = 0.0001, float beta = 0.75, float bias = 2, std::string name = "tensor", std::string tag = kBroadcast) { - 
CHECK_EQ(data->shape.size(), 4) << "LRN requires 4-D input"; - CHECK_EQ(size % 2, 1) << "size should be odd number"; - CHECK(axis == 1 || axis == 3) << "axis should be 1 or 3 for NCHW and NHWC"; + ICHECK_EQ(data->shape.size(), 4) << "LRN requires 4-D input"; + ICHECK_EQ(size % 2, 1) << "size should be odd number"; + ICHECK(axis == 1 || axis == 3) << "axis should be 1 or 3 for NCHW and NHWC"; auto input_shape = data->shape; Array pad_before{0, 0, 0, 0}; Array pad_after{0, 0, 0, 0}; diff --git a/include/tvm/topi/nn/pooling.h b/include/tvm/topi/nn/pooling.h index 2396fc25c23f..882793877ed6 100644 --- a/include/tvm/topi/nn/pooling.h +++ b/include/tvm/topi/nn/pooling.h @@ -65,10 +65,10 @@ inline Tensor pool_impl(const Tensor& x, const Array& kernel_size, const Array& stride_size, const Array& padding_size, PoolType pool_type, bool ceil_mode, const size_t height_axis, const size_t width_axis, bool count_include_pad) { - CHECK(x->shape.size() >= 2) << "Pooling input must >= 2-D (H, W)"; - CHECK_EQ(kernel_size.size(), 2) << "Pooling kernel_size must have 2 elements"; - CHECK_EQ(stride_size.size(), 2) << "Pooling stride_size must have 2 elements"; - CHECK_EQ(padding_size.size(), 4) << "Pooling padding_size must have 4 elements"; + ICHECK(x->shape.size() >= 2) << "Pooling input must >= 2-D (H, W)"; + ICHECK_EQ(kernel_size.size(), 2) << "Pooling kernel_size must have 2 elements"; + ICHECK_EQ(stride_size.size(), 2) << "Pooling stride_size must have 2 elements"; + ICHECK_EQ(padding_size.size(), 4) << "Pooling padding_size must have 4 elements"; auto kernel_height = cast(DataType::DataType::Int(32), kernel_size[0]); auto kernel_width = cast(DataType::DataType::Int(32), kernel_size[1]); @@ -181,11 +181,11 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x, const Array& padding_size, PoolType pool_type, bool ceil_mode, const size_t height_axis, const size_t width_axis, bool count_include_pad) { - CHECK(out_grad->shape.size() >= 2) << "Pooling grad output must >= 2-D (H, W)"; - CHECK(x->shape.size() >= 2) << "Pooling input must >= 2-D (H, W)"; - CHECK_EQ(kernel_size.size(), 2) << "Pooling kernel_size must have 2 elements"; - CHECK_EQ(stride_size.size(), 2) << "Pooling stride_size must have 2 elements"; - CHECK_EQ(padding_size.size(), 4) << "Pooling padding_size must have 4 elements"; + ICHECK(out_grad->shape.size() >= 2) << "Pooling grad output must >= 2-D (H, W)"; + ICHECK(x->shape.size() >= 2) << "Pooling input must >= 2-D (H, W)"; + ICHECK_EQ(kernel_size.size(), 2) << "Pooling kernel_size must have 2 elements"; + ICHECK_EQ(stride_size.size(), 2) << "Pooling stride_size must have 2 elements"; + ICHECK_EQ(padding_size.size(), 4) << "Pooling padding_size must have 4 elements"; auto kernel_height = cast(DataType::DataType::Int(32), kernel_size[0]); auto kernel_width = cast(DataType::DataType::Int(32), kernel_size[1]); @@ -372,7 +372,7 @@ inline bool find_depth_height_width(const std::string& layout, int* depth_axis, inline bool find_height_width(const std::string& layout, int* height_axis, int* width_axis) { int dummy; - CHECK_EQ(find_depth_height_width(layout, &dummy, height_axis, width_axis), false); + ICHECK_EQ(find_depth_height_width(layout, &dummy, height_axis, width_axis), false); if (*height_axis != -1 && *width_axis != -1) { return true; } @@ -381,7 +381,7 @@ inline bool find_height_width(const std::string& layout, int* height_axis, int* inline bool find_width(const std::string& layout, int* width_axis) { int dummy; - CHECK_EQ(find_depth_height_width(layout, &dummy, &dummy, 
width_axis), false); + ICHECK_EQ(find_depth_height_width(layout, &dummy, &dummy, width_axis), false); if (*width_axis != -1) { return true; } @@ -422,7 +422,7 @@ inline Tensor pool(const Tensor& x, const Array& kernel_size, PoolType pool_type, bool ceil_mode, const std::string& layout = "NCHW", bool count_include_pad = true) { int height_axis = -1, width_axis = -1; - CHECK(find_height_width(layout, &height_axis, &width_axis)) << "Unsupported layout " << layout; + ICHECK(find_height_width(layout, &height_axis, &width_axis)) << "Unsupported layout " << layout; return pool_impl(x, kernel_size, stride_size, padding_size, pool_type, ceil_mode, height_axis, width_axis, count_include_pad); } @@ -462,7 +462,7 @@ inline Tensor pool_grad(const Tensor& out_grad, const Tensor& x, const Array& output_size, PoolType pool_type, const std::vector& axes) { const auto n_dim = output_size.size(); - CHECK_EQ(axes.size(), n_dim) << "The number of axes not equal to the in/out dimension"; + ICHECK_EQ(axes.size(), n_dim) << "The number of axes not equal to the in/out dimension"; Array data_shape = x->shape; for (size_t i = 0; i < data_shape.size(); ++i) { @@ -591,7 +591,7 @@ inline Tensor adaptive_pool_impl(const Tensor& x, const Array& output_ inline Tensor adaptive_pool(const Tensor& x, const Array& output_size, PoolType pool_type, const std::string& layout = "NCHW") { int height_axis = -1, width_axis = -1; - CHECK(find_height_width(layout, &height_axis, &width_axis)) << "Unsupported layout " << layout; + ICHECK(find_height_width(layout, &height_axis, &width_axis)) << "Unsupported layout " << layout; return adaptive_pool_impl(x, output_size, pool_type, {height_axis, width_axis}); } @@ -606,7 +606,7 @@ inline Tensor adaptive_pool(const Tensor& x, const Array& output_size, inline Tensor adaptive_pool3d(const Tensor& x, const Array& output_size, PoolType pool_type, const std::string& layout = "NCDHW") { int depth_axis = -1, height_axis = -1, width_axis = -1; - CHECK(find_depth_height_width(layout, &depth_axis, &height_axis, &width_axis)) + ICHECK(find_depth_height_width(layout, &depth_axis, &height_axis, &width_axis)) << "Unsupported layout " << layout; return adaptive_pool_impl(x, output_size, pool_type, {depth_axis, height_axis, width_axis}); } @@ -661,10 +661,10 @@ inline Tensor pool_impl_nd(const Tensor& x, const Array& kernel_size, bool count_include_pad) { int k_size = kernel_size.size(); int x_size = x->shape.size(); - CHECK_EQ(stride_size.size(), k_size) << "Pooling stride_size must have same elements as kernel"; - CHECK_EQ(padding_size.size(), k_size * 2) << "Pooling padding_size must has double elements of" - " kernel"; - CHECK_EQ(axis.size(), k_size) << "axis must have same elements as kernel"; + ICHECK_EQ(stride_size.size(), k_size) << "Pooling stride_size must have same elements as kernel"; + ICHECK_EQ(padding_size.size(), k_size * 2) << "Pooling padding_size must has double elements of" + " kernel"; + ICHECK_EQ(axis.size(), k_size) << "axis must have same elements as kernel"; Array daxis; std::vector kernel(k_size); @@ -812,7 +812,7 @@ inline Tensor pool1d(const Tensor& x, const Array& kernel_size, PoolType pool_type, bool ceil_mode, const std::string& layout = "NCW", bool count_include_pad = true) { int width_axis = -1; - CHECK(find_width(layout, &width_axis)) << "Unsupported layout " << layout; + ICHECK(find_width(layout, &width_axis)) << "Unsupported layout " << layout; std::vector axis = {width_axis}; return pool_impl_nd(x, kernel_size, stride_size, padding_size, pool_type, ceil_mode, axis, 
count_include_pad); @@ -853,7 +853,7 @@ inline Tensor pool3d(const Tensor& x, const Array& kernel_size, PoolType pool_type, bool ceil_mode, const std::string& layout = "NCDHW", bool count_include_pad = true) { int depth_axis = -1, height_axis = -1, width_axis = -1; - CHECK(find_depth_height_width(layout, &depth_axis, &height_axis, &width_axis)) + ICHECK(find_depth_height_width(layout, &depth_axis, &height_axis, &width_axis)) << "Unsupported layout " << layout; std::vector axis = {depth_axis, height_axis, width_axis}; return pool_impl_nd(x, kernel_size, stride_size, padding_size, pool_type, ceil_mode, axis, diff --git a/include/tvm/topi/nn/softmax.h b/include/tvm/topi/nn/softmax.h index 2e94f9103c68..78a9ec40bf89 100644 --- a/include/tvm/topi/nn/softmax.h +++ b/include/tvm/topi/nn/softmax.h @@ -54,7 +54,7 @@ inline Tensor softmax(const Tensor& x, int axis = -1, std::string name = "tensor if (axis < 0) { axis = ndim + axis; } - CHECK_LT(axis, ndim) << "axis parameter should be less than input dim"; + ICHECK_LT(axis, ndim) << "axis parameter should be less than input dim"; auto k1 = tvm::te::reduce_axis(Range(0, input_shape[axis]), "k1"); auto k2 = tvm::te::reduce_axis(Range(0, input_shape[axis]), "k2"); @@ -124,7 +124,7 @@ inline Tensor softmax(const Tensor& x, int axis = -1, std::string name = "tensor */ inline Tensor log_softmax(const Tensor& x, std::string name = "tensor", std::string tag = "log_softmax_output") { - CHECK_EQ(x->shape.size(), 2) << "Log softmax requires 2-D input"; + ICHECK_EQ(x->shape.size(), 2) << "Log softmax requires 2-D input"; PrimExpr m = x->shape[0]; PrimExpr n = x->shape[1]; diff --git a/include/tvm/topi/reduction.h b/include/tvm/topi/reduction.h index 75c8265a63ce..2a2f2113e9b1 100644 --- a/include/tvm/topi/reduction.h +++ b/include/tvm/topi/reduction.h @@ -75,8 +75,8 @@ inline std::vector GetRealAxis(int ndim, const Array& axis) { if (val < 0) { val += ndim; } - CHECK_LE(val, ndim) << " exceeds the maximum dimension " << ndim; - CHECK_GE(val, 0); + ICHECK_LE(val, ndim) << " exceeds the maximum dimension " << ndim; + ICHECK_GE(val, 0); real_axis.push_back(static_cast(val)); } std::sort(real_axis.begin(), real_axis.end()); @@ -181,7 +181,7 @@ inline Tensor DoCommReduce(const Tensor& data, FReduce func, const Array& axis, FReduce func, bool keepdims, bool atleast1d) { auto ndim = data->shape.size(); - CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; + ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; auto real_axis = GetRealAxis(static_cast(ndim), axis); auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d); return DoCommReduce(data, func, target_shape, real_axis, @@ -204,7 +204,7 @@ inline Tensor CommReduce(const Tensor& data, const Array& axis, FReduce inline Tensor CommReduceIdx(const Tensor& data, const Array& axis, FCommReduce func, bool keepdims, bool atleast1d) { auto ndim = data->shape.size(); - CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; + ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; auto real_axis = GetRealAxis(static_cast(ndim), axis); auto reduce_axes = MakeReduceAxes(real_axis, data); auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d); @@ -325,7 +325,7 @@ inline Tensor sum(const Tensor& data, const Array& axis, bool keepdims } inline Tensor collapse_sum(const Tensor& data, Array target_shape) { - CHECK_GE(data->shape.size(), target_shape.size()); + ICHECK_GE(data->shape.size(), target_shape.size()); auto ishape = detail::GetConstIntValues(data->shape, "ishape"); auto 
oshape = detail::GetConstIntValues(target_shape, "oshape"); diff --git a/include/tvm/topi/rocm/dense.h b/include/tvm/topi/rocm/dense.h index a1e4d14b9719..b861e6c89a67 100644 --- a/include/tvm/topi/rocm/dense.h +++ b/include/tvm/topi/rocm/dense.h @@ -53,10 +53,10 @@ namespace rocm { inline tvm::te::Tensor dense_rocm(const Target& target, const tvm::te::Tensor& data, const tvm::te::Tensor& weight, const tvm::te::Tensor& bias, const DataType& out_dtype) { - CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; - CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; + ICHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data"; + ICHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight"; if (bias.defined()) { - CHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias"; + ICHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias"; } auto batch = data->shape[0]; @@ -64,7 +64,7 @@ inline tvm::te::Tensor dense_rocm(const Target& target, const tvm::te::Tensor& d auto out_dim = weight->shape[0]; if (target->GetLibs().count("rocblas")) { - CHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported."; + ICHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported."; auto mm = topi::contrib::rocblas_matmul(data, weight, false, true); if (bias.defined()) { mm = tvm::te::compute( diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index aa5c6d2a2256..fa27faf18f15 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -60,11 +60,11 @@ using namespace topi::detail; inline Tensor expand_dims(const Tensor& x, int axis, int num_newaxis = 1, std::string name = "T_expand_dims", std::string tag = kBroadcast) { int ndim = static_cast(x->shape.size()); - CHECK(-ndim - 1 <= axis && axis <= ndim) + ICHECK(-ndim - 1 <= axis && axis <= ndim) << "expand_dims only accepts `axis` in [-data.ndim - 1, data.ndim]" << ", but got axis = " << axis << ", and data.ndim = " << ndim; - CHECK(num_newaxis >= 0) << "expand_dims only accepts `num_newaxis >= 0`" - << ", but got num_newaxis = " << num_newaxis; + ICHECK(num_newaxis >= 0) << "expand_dims only accepts `num_newaxis >= 0`" + << ", but got num_newaxis = " << num_newaxis; if (axis < 0) { // Calculate offset from last dimension axis = ndim + axis + 1; @@ -123,13 +123,13 @@ inline Tensor transpose(const Tensor& x, Array axes, std::string name = new_axis = static_cast(x->shape.size()) + axis; axes.Set(i, new_axis); } - CHECK((new_axis >= 0) && (new_axis < static_cast(x->shape.size()))) + ICHECK((new_axis >= 0) && (new_axis < static_cast(x->shape.size()))) << "axis=" << axis << " is invalid for the " << static_cast(x->shape.size()) << "-dimensional input tensor"; for (size_t j = 0; j < axes.size(); ++j) { if (i != j) { - CHECK(new_axis != static_cast(axes[j]->value)) << "repeated axis in transpose"; + ICHECK(new_axis != static_cast(axes[j]->value)) << "repeated axis in transpose"; } } new_shape.push_back(x->shape[new_axis]); @@ -178,14 +178,14 @@ inline Tensor reverse_sequence(const Tensor& x, const Tensor& seq_lengths, int s batch_axis = static_cast(x->shape.size()) + batch_axis; } - CHECK(seq_lengths_dim == 1) << "seq_lengths should be 1D vector"; + ICHECK(seq_lengths_dim == 1) << "seq_lengths should be 1D vector"; - CHECK(GetConstInt(seq_lengths->shape[0]) == GetConstInt(x->shape[batch_axis])) + ICHECK(GetConstInt(seq_lengths->shape[0]) == GetConstInt(x->shape[batch_axis])) << "For reverse_sequnece seq_lengths size should match with dimension of batch axis" << ", but got dimension 
of batch_axis = " << GetConstInt(x->shape[batch_axis]) << ", and seq_length size = " << GetConstInt(seq_lengths->shape[0]); - CHECK((0 <= batch_axis) && (batch_axis < static_cast(x->shape.size()))) + ICHECK((0 <= batch_axis) && (batch_axis < static_cast(x->shape.size()))) << "batch_axis=" << batch_axis_inp << " is invalid for the " << static_cast(x->shape.size()) << "-dimensional input tensor"; } @@ -193,7 +193,7 @@ inline Tensor reverse_sequence(const Tensor& x, const Tensor& seq_lengths, int s if (seq_axis < 0) { seq_axis = static_cast(x->shape.size()) + seq_axis; } - CHECK((0 <= seq_axis) && (seq_axis < static_cast(x->shape.size()))) + ICHECK((0 <= seq_axis) && (seq_axis < static_cast(x->shape.size()))) << "seq_axis=" << seq_axis_inp << " is invalid for the " << static_cast(x->shape.size()) << "-dimensional input tensor"; @@ -332,7 +332,7 @@ inline Tensor squeeze(const Tensor& x, Array axis, bool atleast1d = fal if (val < 0) { val += static_cast(x->shape.size()); } - CHECK_EQ(GetConstInt(x->shape[val]), 1) << "Dimension " << val << " must have size 1"; + ICHECK_EQ(GetConstInt(x->shape[val]), 1) << "Dimension " << val << " must have size 1"; axis_val.push_back(val); } } @@ -380,12 +380,12 @@ inline Tensor squeeze(const Tensor& x, Array axis, bool atleast1d = fal inline Tensor concatenate(const Array& inputs, int axis = 0, std::string name = "T_concat", std::string tag = kInjective) { int ndim = static_cast(inputs[0]->shape.size()); - CHECK(-ndim <= axis && axis < ndim) << "concatenate only accepts `axis` in [-ndim, ndim)" - << ", but got axis = " << axis << ", and ndim = " << ndim; + ICHECK(-ndim <= axis && axis < ndim) << "concatenate only accepts `axis` in [-ndim, ndim)" + << ", but got axis = " << axis << ", and ndim = " << ndim; if (axis < 0) { axis += ndim; } - CHECK_LT(axis, inputs[0]->shape.size()) << "axis out of bounds"; + ICHECK_LT(axis, inputs[0]->shape.size()) << "axis out of bounds"; Array axis_sizes; for (auto t : inputs) { @@ -439,13 +439,13 @@ inline Tensor concatenate(const Array& inputs, int axis = 0, std::string inline Tensor stack(const Array& inputs, int axis = 0, std::string name = "T_stack", std::string tag = kInjective) { int ndim = static_cast(inputs[0]->shape.size()); - CHECK(-ndim - 1 <= axis && axis <= ndim) + ICHECK(-ndim - 1 <= axis && axis <= ndim) << "stack only accepts `axis` in [-ndim, ndim)" << ", but got axis = " << axis << ", and ndim = " << ndim; if (axis < 0) { axis += ndim + 1; } - CHECK_LT(axis, inputs[0]->shape.size() + 1) << "axis out of bounds"; + ICHECK_LT(axis, inputs[0]->shape.size() + 1) << "axis out of bounds"; const int stack_size = static_cast(inputs.size()); Array out_shape; @@ -487,7 +487,7 @@ inline Array split(const Tensor& x, Array split_indices, int a if (axis < 0) { axis += static_cast(x->shape.size()); } - CHECK_LT(axis, x->shape.size()) << "axis out of bounds"; + ICHECK_LT(axis, x->shape.size()) << "axis out of bounds"; auto src_axis_size = x->shape[axis]; std::vector begin_ids; @@ -497,7 +497,7 @@ inline Array split(const Tensor& x, Array split_indices, int a auto idx_node = idx.as(); auto back_node = begin_ids.back().as(); if (idx_node && back_node) { - CHECK_GT(idx_node->value, back_node->value) << "split_indices must be sorted"; + ICHECK_GT(idx_node->value, back_node->value) << "split_indices must be sorted"; } begin_ids.push_back(idx); } @@ -569,7 +569,7 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, const // Consider to refactor in the future. 
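The transform hunks in this region (concatenate, stack, split, take) all enforce the same axis convention: a negative axis counts from the end and must normalize into [0, ndim). A standalone sketch of that normalization, using a hypothetical NormalizeAxis helper that matches the concatenate-style bound:

```cpp
#include <cassert>
#include <stdexcept>

// Normalize a possibly-negative axis into [0, ndim), throwing where the
// ICHECKs above would abort with "axis out of bounds".
int NormalizeAxis(int axis, int ndim) {
  if (axis < -ndim || axis >= ndim) {
    throw std::out_of_range("axis out of bounds");
  }
  return axis < 0 ? axis + ndim : axis;
}

int main() {
  assert(NormalizeAxis(-1, 4) == 3);  // last axis of a 4-D tensor
  return 0;
}
```

(stack, expand_dims, and repeat accept the wider range [-ndim - 1, ndim] because they can create a new trailing axis; the sketch covers only the common case.)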
std::vector stride_vec(src_tensor_dim, 1); for (size_t i = 0; i < strides.size(); ++i) { - CHECK(strides[i].defined()); + ICHECK(strides[i].defined()); stride_vec[i] = strides[i]->value; } @@ -630,7 +630,7 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, const int interval = std::abs(end_i - begin_i); int slice_size = static_cast((interval + std::abs(stride_vec[i]) - 1) / std::abs(stride_vec[i])); - CHECK(stride_vec[i] < 0 ? (end_i <= begin_i) : (begin_i <= end_i)) + ICHECK(stride_vec[i] < 0 ? (end_i <= begin_i) : (begin_i <= end_i)) << ": Input [Begin=" << begin_vec[i] << ", End=" << end_vec[i] << "] is invalid for axis=" << i; @@ -670,14 +670,14 @@ inline Array split_sections(const Tensor& x, int num_sections, int axis, if (axis < 0) { axis += static_cast(x->shape.size()); } - CHECK_LT(axis, x->shape.size()) << "axis out of bounds"; + ICHECK_LT(axis, x->shape.size()) << "axis out of bounds"; auto src_axis_size = x->shape[axis]; - CHECK_GT(num_sections, 0) << "Slice count must be > 0"; + ICHECK_GT(num_sections, 0) << "Slice count must be > 0"; if (auto node = src_axis_size.as()) { - CHECK_EQ(node->value % num_sections, 0) + ICHECK_EQ(node->value % num_sections, 0) << "num_sections must be an integer factor of the size of axis " << axis << " (" << node->value << ")"; } @@ -756,8 +756,8 @@ inline Tensor take(const Tensor& a, const Tensor& indices, std::string mode = "c inline Tensor sequence_mask(const Tensor& data, const Tensor& valid_length, double mask_value, int axis, std::string name = "T_sequence_mask", std::string tag = kInjective) { - CHECK(axis == 0 || axis == 1) << "axis must be either 0 or 1"; - CHECK_EQ(valid_length->shape.size(), 1) << "valid_length must have ndim=1, i.e., (batch_size,)."; + ICHECK(axis == 0 || axis == 1) << "axis must be either 0 or 1"; + ICHECK_EQ(valid_length->shape.size(), 1) << "valid_length must have ndim=1, i.e., (batch_size,)."; auto length_dim = data->shape[axis]; auto batch_dim = data->shape[1 - axis]; Array out_shape = data->shape; @@ -795,8 +795,8 @@ inline Tensor take(const Tensor& a, const Tensor& indices, int axis, std::string if (axis < 0) { axis += static_cast(a->shape.size()); } - CHECK_GE(axis, 0) << "axis out of bounds"; - CHECK_LT(axis, a->shape.size()) << "axis out of bounds"; + ICHECK_GE(axis, 0) << "axis out of bounds"; + ICHECK_LT(axis, a->shape.size()) << "axis out of bounds"; auto axis_dim = a->shape[axis]; int indices_len = static_cast(indices->shape.size()); @@ -887,11 +887,11 @@ inline Tensor take(const Tensor& a, const Tensor& indices, int axis, std::string */ inline Tensor where(const Tensor& condition, const Tensor& x, const Tensor& y, std::string name = "T_where", std::string tag = kBroadcast) { - CHECK_EQ(x->shape.size(), y->shape.size()) + ICHECK_EQ(x->shape.size(), y->shape.size()) << "x and y must have the same shape.Got different number of dimension: " << x->shape.size() << " vs " << y->shape.size(); - CHECK_EQ(x->dtype, y->dtype) << "x and y must have the same dtype: " << x->dtype << " vs " - << y->dtype; + ICHECK_EQ(x->dtype, y->dtype) << "x and y must have the same dtype: " << x->dtype << " vs " + << y->dtype; if (x->shape.size() == 0) { return compute( @@ -908,7 +908,7 @@ inline Tensor where(const Tensor& condition, const Tensor& x, const Tensor& y, }, name, tag); } else if (condition->shape.size() != 1) { - CHECK_EQ(condition->shape.size(), x->shape.size()) + ICHECK_EQ(condition->shape.size(), x->shape.size()) << "condition array must be either have the same shape as x or to be a " "1-D array.Got 
different number of dimension: " << condition->shape.size() << " vs " << x->shape.size(); @@ -922,7 +922,7 @@ inline Tensor where(const Tensor& condition, const Tensor& x, const Tensor& y, int64_t cond_first_dim = topi::GetConstInt(condition->shape[0]); int64_t x_first_dim = topi::GetConstInt(x->shape[0]); if (cond_first_dim > 0 && x_first_dim > 0) { - CHECK_EQ(cond_first_dim, x_first_dim) + ICHECK_EQ(cond_first_dim, x_first_dim) << "If condition is 1-D, the first dimension must be the same as x: " << cond_first_dim << " vs " << x_first_dim; } @@ -951,11 +951,11 @@ inline Tensor where(const Tensor& condition, const Tensor& x, const Tensor& y, inline Tensor repeat(const Tensor& x, int repeats, int axis, std::string name = "T_repeat", std::string tag = kBroadcast) { int ndim = static_cast(x->shape.size()); - CHECK(-ndim - 1 <= axis && axis <= ndim) + ICHECK(-ndim - 1 <= axis && axis <= ndim) << "repeat only accepts `axis` in [-data.ndim - 1, data.ndim]" << ", but got axis = " << axis << ", and data.ndim = " << ndim; - CHECK(repeats >= 1) << "repeat only accepts `repeats >= 1`" - << ", but got repeats = " << repeats; + ICHECK(repeats >= 1) << "repeat only accepts `repeats >= 1`" + << ", but got repeats = " << repeats; if (axis < 0) { // Calculate offset from last dimension axis += ndim; @@ -1091,13 +1091,13 @@ inline Tensor gather(const Tensor& data, int axis, const Tensor& indices, std::string name = "T_gather", std::string tag = kInjective) { size_t ndim_d = data->shape.size(); size_t ndim_i = indices->shape.size(); - CHECK_GE(ndim_d, 1) << "Cannot gather from a scalar."; - CHECK_EQ(ndim_d, ndim_i); - CHECK_GE(axis, 0); - CHECK_LT(axis, ndim_d); + ICHECK_GE(ndim_d, 1) << "Cannot gather from a scalar."; + ICHECK_EQ(ndim_d, ndim_i); + ICHECK_GE(axis, 0); + ICHECK_LT(axis, ndim_d); size_t indices_dim_i = static_cast(GetConstInt(indices->shape[axis])); - CHECK_GE(indices_dim_i, 1); - CHECK(indices->dtype.is_int()); + ICHECK_GE(indices_dim_i, 1); + ICHECK(indices->dtype.is_int()); Array out_shape; for (size_t i = 0; i < ndim_i; ++i) { @@ -1138,10 +1138,10 @@ inline Tensor gather_nd(const Tensor& data, const Tensor& indices, std::string n std::string tag = kInjective) { size_t ndim_d = data->shape.size(); size_t ndim_i = indices->shape.size(); - CHECK_GE(ndim_i, 1) << "indices tensor must have at least 1 dimensions"; + ICHECK_GE(ndim_i, 1) << "indices tensor must have at least 1 dimensions"; size_t indices_dim0 = static_cast(GetConstInt(indices->shape[0])); - CHECK_LE(indices_dim0, ndim_d) << "dim 0 of indices tensor must be no more " - << "than dimensions of data tensor"; + ICHECK_LE(indices_dim0, ndim_d) << "dim 0 of indices tensor must be no more " + << "than dimensions of data tensor"; Array out_shape; for (size_t i = 1; i < ndim_i; ++i) { out_shape.push_back(indices->shape[i]); @@ -1216,8 +1216,8 @@ inline tvm::te::Tensor matmul(const tvm::te::Tensor& A, const tvm::te::Tensor& B */ inline Tensor tensordot(const Tensor& A, const tvm::te::Tensor& B, int axes = 2, std::string name = "T_tensordot", std::string tag = kMatMul) { - CHECK_GE(A->shape.size(), axes); - CHECK_GE(B->shape.size(), axes); + ICHECK_GE(A->shape.size(), axes); + ICHECK_GE(B->shape.size(), axes); Array output_shape(A->shape.begin(), A->shape.end() + (-axes)); for (auto it = B->shape.begin() + axes; it != B->shape.end(); ++it) output_shape.push_back(*it); @@ -1262,7 +1262,7 @@ inline Tensor tensordot(const Tensor& A, const tvm::te::Tensor& B, int axes = 2, inline Tensor tensordot(const Tensor& A, const tvm::te::Tensor& B, 
Array A_axes, Array B_axes, std::string name = "T_tensordot", std::string tag = kMatMul) { - CHECK_EQ(A_axes.size(), B_axes.size()); + ICHECK_EQ(A_axes.size(), B_axes.size()); auto A_axes_val = GetConstIntValues(A_axes, "A_axes"); auto B_axes_val = GetConstIntValues(B_axes, "B_axes"); @@ -1366,11 +1366,12 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, return src; } - CHECK(src_layout_struct.defined() && dst_layout_struct.defined()) + ICHECK(src_layout_struct.defined() && dst_layout_struct.defined()) << "cannot convert from/to undefined layout"; auto layout_converter = tir::BijectiveLayout(src_layout_struct, dst_layout_struct); - CHECK(layout_converter.defined()) << "cannot convert from " << src_layout << " to " << dst_layout; + ICHECK(layout_converter.defined()) + << "cannot convert from " << src_layout << " to " << dst_layout; Array dst_shape = layout_converter.ForwardShape(src->shape); @@ -1499,9 +1500,10 @@ inline Tensor sparse_to_dense(const Tensor& sparse_indices, const Array const Tensor& sparse_values, const PrimExpr& default_value, const std::string name = "T_sparse_to_dense", const std::string tag = kInjective) { - CHECK(sparse_indices->dtype.is_int()) << "sparse_indices only accepts integer values"; - CHECK_LE(sparse_indices->shape.size(), 3) << "sparse_indices tensor should be 0D, 1D, or 2D only"; - CHECK_LE(sparse_values->shape.size(), 2) << "sparse_values tensor should be 0D or 1D only"; + ICHECK(sparse_indices->dtype.is_int()) << "sparse_indices only accepts integer values"; + ICHECK_LE(sparse_indices->shape.size(), 3) + << "sparse_indices tensor should be 0D, 1D, or 2D only"; + ICHECK_LE(sparse_values->shape.size(), 2) << "sparse_values tensor should be 0D or 1D only"; const auto rank_sparse_indices = static_cast(sparse_indices->shape.size()); Array oshape; diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc index daf61441b466..9737b53703fd 100644 --- a/src/arith/analyzer.cc +++ b/src/arith/analyzer.cc @@ -47,7 +47,7 @@ void Analyzer::Bind(const Var& var, const PrimExpr& expr, bool allow_override) { } void Analyzer::Bind(const Var& var, const Range& range, bool allow_override) { - CHECK(range.defined()); + ICHECK(range.defined()); if (tir::is_one(range->extent)) { this->Bind(var, range->min, allow_override); } else { @@ -64,7 +64,7 @@ void Analyzer::Bind(const Map& variables, bool allow_override) { } void ConstraintContext::EnterWithScope() { - CHECK(exit_ == nullptr); + ICHECK(exit_ == nullptr); // entering the scope. 
auto f0 = analyzer_->const_int_bound.EnterConstraint(constraint_); auto f1 = analyzer_->modular_set.EnterConstraint(constraint_); @@ -78,7 +78,7 @@ void ConstraintContext::EnterWithScope() { } void ConstraintContext::ExitWithScope() { - CHECK(exit_ != nullptr); + ICHECK(exit_ != nullptr); exit_(); } diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc index a88849b42e9f..d0a0702a0fb0 100644 --- a/src/arith/canonical_simplify.cc +++ b/src/arith/canonical_simplify.cc @@ -63,7 +63,7 @@ inline PrimExpr ModImpl(PrimExpr a, PrimExpr b, DivMode mode) { if (mode == kTruncDiv) { return truncmod(a, b); } else { - CHECK_EQ(mode, kFloorDiv); + ICHECK_EQ(mode, kFloorDiv); return floormod(a, b); } } @@ -72,7 +72,7 @@ inline PrimExpr DivImpl(PrimExpr a, PrimExpr b, DivMode mode) { if (mode == kTruncDiv) { return truncdiv(a, b); } else { - CHECK_EQ(mode, kFloorDiv); + ICHECK_EQ(mode, kFloorDiv); return floordiv(a, b); } } @@ -102,7 +102,7 @@ class SplitExprNode : public CanonicalExprNode { DivMode div_mode{kTruncDiv}; /*! \brief verify that this is a valid entry. */ - void Verify() const { CHECK(upper_factor == kPosInf || upper_factor % lower_factor == 0); } + void Verify() const { ICHECK(upper_factor == kPosInf || upper_factor % lower_factor == 0); } PrimExpr NormalizeWithScale(int64_t sscale) const { PrimExpr res = this->index; @@ -118,7 +118,7 @@ class SplitExprNode : public CanonicalExprNode { } sscale *= this->scale; if (sscale != 1) { - CHECK(!dtype.is_uint() || sscale > 0); + ICHECK(!dtype.is_uint() || sscale > 0); res = res * make_const(dtype, sscale); } return res; @@ -209,10 +209,10 @@ class SumExprNode : public CanonicalExprNode { * \param scale The scale to be applied. */ void DivideBy(int64_t scale) { - CHECK_EQ(this->base % scale, 0); + ICHECK_EQ(this->base % scale, 0); this->base /= scale; for (size_t i = 0; i < this->args.size(); ++i) { - CHECK_EQ(args[i]->scale % scale, 0); + ICHECK_EQ(args[i]->scale % scale, 0); args[i].CopyOnWrite()->scale /= scale; } } @@ -508,7 +508,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl { return expr; } expr = ToSplitExpr(Normalize(expr)); - CHECK(expr->DivModeCompatibleTo(div_mode)); + ICHECK(expr->DivModeCompatibleTo(div_mode)); expr.CopyOnWrite()->div_mode = div_mode; return expr; } @@ -648,7 +648,7 @@ void CanonicalSimplifier::Impl::SeparateDivisibleParts(const SumExprNode* psum, } SplitExpr CanonicalSimplifier::Impl::SplitDivConst(SplitExpr lhs, int64_t cval, DivMode div_mode) { - CHECK_GT(cval, 0); + ICHECK_GT(cval, 0); lhs = ConvertDivMode(lhs, div_mode); // the following rule works for both floordiv and truncdiv @@ -682,8 +682,8 @@ SplitExpr CanonicalSimplifier::Impl::SplitDivConst(SplitExpr lhs, int64_t cval, } // directly return the split with cval == 1 lhs = ToSplitExpr(Normalize(lhs)); - CHECK(lhs->DivModeCompatibleTo(div_mode)); - CHECK_EQ(lhs->scale, 1); + ICHECK(lhs->DivModeCompatibleTo(div_mode)); + ICHECK_EQ(lhs->scale, 1); lhs.CopyOnWrite()->lower_factor *= cval; lhs.CopyOnWrite()->div_mode = div_mode; return lhs; @@ -803,7 +803,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { } SplitExpr CanonicalSimplifier::Impl::SplitModConst(SplitExpr lhs, int64_t cval, DivMode div_mode) { - CHECK_GT(cval, 0); + ICHECK_GT(cval, 0); lhs = ConvertDivMode(lhs, div_mode); if (lhs->scale % cval == 0) { @@ -842,9 +842,9 @@ SplitExpr CanonicalSimplifier::Impl::SplitModConst(SplitExpr lhs, int64_t cval, } // Normalize the value. 
lhs = ToSplitExpr(Normalize(lhs)); - CHECK(lhs->DivModeCompatibleTo(div_mode)); - CHECK_EQ(lhs->scale, 1); - CHECK_EQ(lhs->lower_factor, 1); + ICHECK(lhs->DivModeCompatibleTo(div_mode)); + ICHECK_EQ(lhs->scale, 1); + ICHECK_EQ(lhs->lower_factor, 1); lhs.CopyOnWrite()->div_mode = div_mode; lhs.CopyOnWrite()->upper_factor = cval; return lhs; @@ -886,7 +886,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) { // contonue to use logic below. a = extra; psum = a.as(); - CHECK(psum != nullptr); + ICHECK(psum != nullptr); } } } @@ -948,7 +948,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorModNode* op) { // contonue to use logic below. a = extra; psum = a.as(); - CHECK(psum != nullptr); + ICHECK(psum != nullptr); } } // Simplify the offset constant if necessary. diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h index 876d336454d8..7bc04a184633 100644 --- a/src/arith/const_fold.h +++ b/src/arith/const_fold.h @@ -150,7 +150,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { if (pa && pb) { // due to division and mod can have different modes // NOTE: this will assumes truc div. - CHECK_NE(pb->value, 0) << "Divide by zero"; + ICHECK_NE(pb->value, 0) << "Divide by zero"; return IntImm(rtype, pa->value / pb->value); } if (pa) { @@ -158,7 +158,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { } if (pb) { if (pb->value == 1) return a; - CHECK_NE(pb->value, 0) << "Divide by zero"; + ICHECK_NE(pb->value, 0) << "Divide by zero"; } if (fa && fb && fb->value != 0) { return FloatImm(rtype, fa->value / fb->value); @@ -166,7 +166,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { if (fa && fa->value == 0) return a; if (fb) { if (fb->value == 1) return a; - CHECK_NE(fb->value, 0) << "Divide by zero"; + ICHECK_NE(fb->value, 0) << "Divide by zero"; } }); return PrimExpr(); @@ -177,7 +177,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) { - CHECK_NE(pb->value, 0) << "Divide by zero"; + ICHECK_NE(pb->value, 0) << "Divide by zero"; return IntImm(rtype, pa->value % pb->value); } if (pa) { @@ -185,7 +185,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { } if (pb) { if (pb->value == 1) return tir::make_zero(rtype); - CHECK_NE(pb->value, 0) << "Divide by zero"; + ICHECK_NE(pb->value, 0) << "Divide by zero"; } }); return PrimExpr(); @@ -196,7 +196,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) { - CHECK_NE(pb->value, 0) << "Divide by zero"; + ICHECK_NE(pb->value, 0) << "Divide by zero"; return IntImm(rtype, arith::floordiv(pa->value, pb->value)); } if (pa) { @@ -204,7 +204,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { } if (pb) { if (pb->value == 1) return a; - CHECK_NE(pb->value, 0) << "Divide by zero"; + ICHECK_NE(pb->value, 0) << "Divide by zero"; } if (fa && fb && fb->value != 0) { return FloatImm(rtype, std::floor(fa->value / fb->value)); @@ -212,7 +212,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { if (fa && fa->value == 0) return a; if (fb) { if (fb->value == 1) return a; - CHECK_NE(fb->value, 0) << "Divide by zero"; + ICHECK_NE(fb->value, 0) << "Divide by zero"; } }); return PrimExpr(); @@ -223,7 +223,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) { - CHECK_NE(pb->value, 0) << "Divide by zero"; + ICHECK_NE(pb->value, 0) << "Divide by 
zero"; return IntImm(rtype, floormod(pa->value, pb->value)); } if (pa) { @@ -231,7 +231,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { } if (pb) { if (pb->value == 1) return tir::make_zero(rtype); - CHECK_NE(pb->value, 0) << "Divide by zero"; + ICHECK_NE(pb->value, 0) << "Divide by zero"; } }); return PrimExpr(); diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc index 876b7db188c6..f39ce4b05643 100644 --- a/src/arith/const_int_bound.cc +++ b/src/arith/const_int_bound.cc @@ -109,11 +109,11 @@ class ConstIntBoundAnalyzer::Impl if (!allow_override) { auto it = var_map_.find(var); if (it != var_map_.end()) { - CHECK(it->second == info) << "Trying to update var \'" << var << "\'" - << " with a different const bound: " - << "original=" - << ConstIntBound(it->second.min_value, it->second.max_value) - << ", new=" << ConstIntBound(info.min_value, info.max_value); + ICHECK(it->second == info) + << "Trying to update var \'" << var << "\'" + << " with a different const bound: " + << "original=" << ConstIntBound(it->second.min_value, it->second.max_value) + << ", new=" << ConstIntBound(info.min_value, info.max_value); } } var_map_[var] = info; @@ -155,7 +155,7 @@ class ConstIntBoundAnalyzer::Impl auto val = bound_->find(expr); if (val != bound_->end()) { auto everything = Everything(expr->dtype); - CHECK( + ICHECK( (val->second->min_value == res.min_value && val->second->max_value == res.max_value) || (val->second->min_value == everything.min_value && val->second->max_value == everything.max_value)) @@ -211,7 +211,7 @@ class ConstIntBoundAnalyzer::Impl Entry VisitExpr_(const DivNode* op) final { Entry a = VisitExpr(op->a); Entry b = VisitExpr(op->b); - CHECK(!b.is_const(0)) << "divide by zero"; + ICHECK(!b.is_const(0)) << "divide by zero"; return HandleDivision(a, b, op->dtype, InfAwareDiv); } @@ -230,7 +230,7 @@ class ConstIntBoundAnalyzer::Impl std::min(std::max(a.max_value, (int64_t)0), b_max_cap)); } } else { - CHECK(!b.is_const(0)) << "mod by zero"; + ICHECK(!b.is_const(0)) << "mod by zero"; // mod by negative value is rare, // and we just use the simpliest rule. return Everything(op->dtype); @@ -240,7 +240,7 @@ class ConstIntBoundAnalyzer::Impl Entry VisitExpr_(const FloorDivNode* op) final { Entry a = VisitExpr(op->a); Entry b = VisitExpr(op->b); - CHECK(!b.is_const(0)) << "floordiv by zero"; + ICHECK(!b.is_const(0)) << "floordiv by zero"; return HandleDivision(a, b, op->dtype, InfAwareFloorDiv); } @@ -258,7 +258,7 @@ class ConstIntBoundAnalyzer::Impl return MakeBound(0, b_max_cap); } } else { - CHECK(!b.is_const(0)) << "floormod by zero"; + ICHECK(!b.is_const(0)) << "floormod by zero"; // mod by negative value is rare, // and we just use the simpliest rule. return Everything(op->dtype); @@ -352,7 +352,7 @@ class ConstIntBoundAnalyzer::Impl additional_info_.insert(additional_info_.end(), info.begin(), info.end()); size_t new_size = old_size + info.size(); auto frecover = [old_size, new_size, this]() { - CHECK_EQ(additional_info_.size(), new_size); + ICHECK_EQ(additional_info_.size(), new_size); additional_info_.resize(old_size); }; return frecover; @@ -432,11 +432,11 @@ class ConstIntBoundAnalyzer::Impl */ static int64_t InfAwareAdd(int64_t x, int64_t y) { if (x == kPosInf) { - CHECK(y != kNegInf); + ICHECK(y != kNegInf); return kPosInf; } if (x == kNegInf) { - CHECK(y != kPosInf); + ICHECK(y != kPosInf); return kNegInf; } if (y == kPosInf || y == kNegInf) return y; @@ -464,7 +464,7 @@ class ConstIntBoundAnalyzer::Impl * \return the result. 
   */
  static int64_t InfAwareDiv(int64_t x, int64_t y) {
-    CHECK_NE(y, 0);
+    ICHECK_NE(y, 0);
    if (x == kPosInf || x == kNegInf) {
      if (y > 0) return x;
      return -x;
@@ -478,7 +478,7 @@ class ConstIntBoundAnalyzer::Impl
   * \return the result.
   */
  static int64_t InfAwareFloorDiv(int64_t x, int64_t y) {
-    CHECK_NE(y, 0);
+    ICHECK_NE(y, 0);
    if (x == kPosInf || x == kNegInf) {
      if (y > 0) return x;
      return -x;
diff --git a/src/arith/domain_touched.cc b/src/arith/domain_touched.cc
index d59486cfcd79..3c3da5f4b99b 100644
--- a/src/arith/domain_touched.cc
+++ b/src/arith/domain_touched.cc
@@ -67,7 +67,7 @@ class BufferTouchedDomain final : public StmtExprVisitor {
   void VisitStmt_(const AttrStmtNode* op) final {
     if (op->attr_key == tir::attr::thread_extent) {
       const IterVarNode* thread_axis = op->node.as<IterVarNode>();
-      CHECK(thread_axis);
+      ICHECK(thread_axis);
       const VarNode* var = thread_axis->var.get();
       dom_map_[var] = IntSet::FromRange(Range(make_zero(op->value.dtype()), op->value));
       StmtExprVisitor::VisitStmt_(op);
diff --git a/src/arith/int_constraints.cc b/src/arith/int_constraints.cc
index 56c95d0ab713..3a668c2331e7 100644
--- a/src/arith/int_constraints.cc
+++ b/src/arith/int_constraints.cc
@@ -43,9 +43,9 @@ Array<PrimExpr> AsConditions(const Array<Var>& variables, const Map
   Array<PrimExpr> res;
   // use variables to keep the order of iteration
   // so as to get rid of any non-determinism.
-  CHECK_EQ(variables.size(), bounds.size());
+  ICHECK_EQ(variables.size(), bounds.size());
   for (const auto v : variables) {
-    CHECK(bounds.count(v));
+    ICHECK(bounds.count(v));
     const auto& bnds = bounds[v];
     PrimExpr lhs = bnds->coef * v;
     for (const PrimExpr& rhs : bnds->equal) {
@@ -66,7 +66,7 @@ Array<PrimExpr> AsConditions(const Array<Var>& variables, const Map
 IntGroupBounds::IntGroupBounds(PrimExpr coef, Array<PrimExpr> lower, Array<PrimExpr> equal,
                                Array<PrimExpr> upper) {
-  CHECK(coef.dtype().is_int() || coef.dtype().is_uint())
+  ICHECK(coef.dtype().is_int() || coef.dtype().is_uint())
       << "Coefficient in IntGroupBounds must be integers";
   ObjectPtr<IntGroupBoundsNode> node = make_object<IntGroupBoundsNode>();
   node->coef = std::move(coef);
@@ -178,7 +178,7 @@ Range IntGroupBounds::FindBestRange(const Map<Var, Range>& vranges_addl) const {
   }
 
   if (!best_lower.defined()) {
-    CHECK(!best_diff_over.defined());
+    ICHECK(!best_diff_over.defined());
     return Range();
   }
   return Range::FromMinExtent(best_lower, analyzer.Simplify(best_diff_over + 1));
@@ -196,7 +196,7 @@ TVM_REGISTER_GLOBAL("arith.IntGroupBounds_from_range").set_body_typed(IntGroupBo
 TVM_REGISTER_GLOBAL("arith.IntGroupBounds_FindBestRange")
     .set_body([](TVMArgs args, TVMRetValue* ret) {
-      CHECK(args.size() == 1 || args.size() == 2);
+      ICHECK(args.size() == 1 || args.size() == 2);
       IntGroupBounds bounds = args[0];
       if (args.size() == 1) {
         *ret = bounds.FindBestRange();
@@ -221,9 +221,9 @@ IntConstraints::IntConstraints(Array<Var> variables, Map<Var, Range> ranges,
   if (!ranges.defined()) {
     ranges = Map<Var, Range>();
   }
-  CHECK(relations.defined());
+  ICHECK(relations.defined());
   for (const auto& var : variables) {
-    CHECK(var.dtype().is_int() || var.dtype().is_uint())
+    ICHECK(var.dtype().is_int() || var.dtype().is_uint())
         << "Variables in IntConstraints must be integers";
   }
   node->variables = std::move(variables);
@@ -259,7 +259,7 @@ IntConstraintsTransform::IntConstraintsTransform(IntConstraints src, IntConstrai
 IntConstraintsTransform IntConstraintsTransform::operator+(
     const IntConstraintsTransform& other) const {
-  CHECK(other->src.same_as(operator->()->dst));
+  ICHECK(other->src.same_as(operator->()->dst));
   Map<Var, PrimExpr> dst_to_src;
   Map<Var, PrimExpr> src_to_dst;
diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc
index 9940d1f60b39..6490f67e1b1a 100644
--- a/src/arith/int_set.cc
+++ b/src/arith/int_set.cc
@@ -412,7 +412,7 @@ class IntervalSetEvaluator : public ExprFunctor {
   IntervalSet VisitExpr_(const OrNode* op) final { return VisitBinaryExpr_(op); }
 
   IntervalSet VisitExpr_(const RampNode* op) final {
-    CHECK(eval_vec_);
+    ICHECK(eval_vec_);
     IntervalSet base = Eval(op->base);
     PVar<IntImm> stride;
     if (stride.Match(op->stride)) {
@@ -431,7 +431,7 @@ class IntervalSetEvaluator : public ExprFunctor {
   }
 
   IntervalSet VisitExpr_(const BroadcastNode* op) final {
-    CHECK(eval_vec_);
+    ICHECK(eval_vec_);
     return VisitExpr(op->value);
   }
@@ -506,7 +506,7 @@ Range IntSet::CoverRange(Range max_range) const {
   IntervalSet temp;
   Analyzer analyzer;
   const IntervalSetNode* s_int = (*this).as<IntervalSetNode>();
-  CHECK(s_int != nullptr);
+  ICHECK(s_int != nullptr);
   if (s_int->HasUpperBound() && s_int->HasLowerBound()) {
     return Range::FromMinExtent(s_int->min_value,
                                 analyzer.Simplify(s_int->max_value + 1 - s_int->min_value));
@@ -516,13 +516,13 @@ Range IntSet::CoverRange(Range max_range) const {
 
 PrimExpr IntSet::min() const {
   const IntervalSetNode* s_int = (*this).as<IntervalSetNode>();
-  CHECK(s_int);
+  ICHECK(s_int);
   return s_int->min_value;
 }
 
 PrimExpr IntSet::max() const {
   const IntervalSetNode* s_int = (*this).as<IntervalSetNode>();
-  CHECK(s_int);
+  ICHECK(s_int);
   return s_int->max_value;
 }
@@ -584,7 +584,7 @@ SignType IntSet::GetSignType() const {
 }
 PrimExpr IntSet::PointValue() const {
   const IntervalSetNode* s_int = (*this).as<IntervalSetNode>();
-  CHECK(s_int && s_int->IsSinglePoint());
+  ICHECK(s_int && s_int->IsSinglePoint());
   return s_int->min_value;
 }
diff --git a/src/arith/ir_mutator_with_analyzer.cc b/src/arith/ir_mutator_with_analyzer.cc
index 8fb69b31857a..7bc0d946ade7 100644
--- a/src/arith/ir_mutator_with_analyzer.cc
+++ b/src/arith/ir_mutator_with_analyzer.cc
@@ -96,7 +96,7 @@ Stmt IRMutatorWithAnalyzer::VisitStmt_(const IfThenElseNode* op) {
 Stmt IRMutatorWithAnalyzer::VisitStmt_(const AttrStmtNode* op) {
   if (op->attr_key == tir::attr::thread_extent || op->attr_key == tir::attr::virtual_thread) {
     IterVar iv = Downcast<IterVar>(op->node);
-    CHECK_NE(iv->thread_tag.length(), 0U);
+    ICHECK_NE(iv->thread_tag.length(), 0U);
     analyzer_->Bind(iv->var, Range::FromMinExtent(0, op->value));
     Stmt stmt = StmtExprMutator::VisitStmt_(op);
     return stmt;
diff --git a/src/arith/ir_visitor_with_analyzer.h b/src/arith/ir_visitor_with_analyzer.h
index 388720ad29c0..058abc8c7d20 100644
--- a/src/arith/ir_visitor_with_analyzer.h
+++ b/src/arith/ir_visitor_with_analyzer.h
@@ -44,7 +44,7 @@ class IRVisitorWithAnalyzer final : public StmtExprVisitor {
   void VisitStmt_(const AttrStmtNode* op) {
     if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) {
       IterVar iv = Downcast<IterVar>(op->node);
-      CHECK_NE(iv->thread_tag.length(), 0U);
+      ICHECK_NE(iv->thread_tag.length(), 0U);
       analyzer_.Bind(iv->var, Range::FromMinExtent(0, op->value));
       StmtExprVisitor::VisitStmt_(op);
     } else {
diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index e56ef2a75ee1..283ffa646567 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -336,7 +336,7 @@ class IterMapRewriter : public ExprMutator {
     } else if (const auto* op = expr.as<IterSplitExprNode>()) {
       return IterSumExpr({GetRef<IterSplitExpr>(op)}, make_zero(expr->dtype));
     } else {
-      CHECK(!expr->IsInstance<IterMapExprNode>());
+      ICHECK(!expr->IsInstance<IterMapExprNode>());
       return IterSumExpr({}, expr);
     }
   }
@@ -566,7 +566,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) {
     MulToLhs(ret.CopyOnWrite(), b);
     return std::move(ret);
   } else {
-    CHECK(a->IsInstance<IterSplitExprNode>());
+    ICHECK(a->IsInstance<IterSplitExprNode>());
     IterSplitExpr ret = Downcast<IterSplitExpr>(std::move(a));
     ret.CopyOnWrite()->scale *= b;
     return std::move(ret);
@@ -639,7 +639,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) {
       return FloorDiv(a, b);
     }
   } else {
-    CHECK(a->IsInstance<IterSplitExprNode>());
+    ICHECK(a->IsInstance<IterSplitExprNode>());
    IterSplitExpr ret = Downcast<IterSplitExpr>(std::move(a));
    return SplitFloorDivConst(ret, b);
  }
@@ -707,7 +707,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) {
      return FloorMod(a, b);
    }
  } else {
-    CHECK(a->IsInstance<IterSplitExprNode>());
+    ICHECK(a->IsInstance<IterSplitExprNode>());
    IterSplitExpr ret = Downcast<IterSplitExpr>(std::move(a));
    return SplitFloorModConst(ret, b);
  }
diff --git a/src/arith/modular_set.cc b/src/arith/modular_set.cc
index 9826769a5c65..ac176b2623a3 100644
--- a/src/arith/modular_set.cc
+++ b/src/arith/modular_set.cc
@@ -67,7 +67,7 @@ struct ModularSetAnalyzer::Entry {
   Entry() = default;
 
   Entry(int64_t coeff, int64_t base) {
-    CHECK_GE(coeff, 0);
+    ICHECK_GE(coeff, 0);
     this->coeff = coeff;
     if (coeff != 0) {
       base = base % coeff;
@@ -93,10 +93,10 @@ class ModularSetAnalyzer::Impl : public ExprFunctor
     if (it != var_map_.end()) {
-      CHECK(it->second == info) << "Trying to update var \'" << var << "\'"
-                                << " with a different const bound: "
-                                << "original=" << ModularSet(it->second.coeff, it->second.base)
-                                << ", new=" << info;
+      ICHECK(it->second == info)
+          << "Trying to update var \'" << var << "\'"
+          << " with a different const bound: "
+          << "original=" << ModularSet(it->second.coeff, it->second.base) << ", new=" << info;
     }
   }
   var_map_[var] = Entry(info->coeff, info->base);
@@ -165,7 +165,7 @@ class ModularSetAnalyzer::Impl : public ExprFunctor a x
diff --git a/src/arith/pattern_match.h b/src/arith/pattern_match.h
index 78ae446d0321..01baaa8d13a2 100644
--- a/src/arith/pattern_match.h
+++ b/src/arith/pattern_match.h
@@ -49,10 +49,10 @@
  *  arith::PVar<Var> v;
  *  // We can match integer and Var, both of which are
  *  // special case container of Expr
- *  CHECK((v * c).Match(tx * 3));
- *  CHECK_EQ(c.Eval()->value, 3);
+ *  ICHECK((v * c).Match(tx * 3));
+ *  ICHECK_EQ(c.Eval()->value, 3);
  *  // cannot match c to ty
- *  CHECK(!(v * c).Match(tx * ty));
+ *  ICHECK(!(v * c).Match(tx * ty));
  *
  * \endcode
 *
@@ -199,7 +199,7 @@ class PVar : public Pattern<PVar<T>> {
   }
 
   T Eval() const {
-    CHECK(filled_);
+    ICHECK(filled_);
     return value_;
   }
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index cb8ef01e7369..a58e4433dadd 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -109,9 +109,9 @@ void RewriteSimplifier::Impl::Update(const Var& var, const PrimExpr& info, bool
   if (!can_override) {
     auto it = var_map_.find(var);
     if (it != var_map_.end()) {
-      CHECK(ExprDeepEqual()(it->second, info)) << "Trying to update var \'" << var << "\'"
-                                               << " with a different value: "
-                                               << "original=" << it->second << ", new=" << info;
+      ICHECK(ExprDeepEqual()(it->second, info)) << "Trying to update var \'" << var << "\'"
+                                                << " with a different value: "
+                                                << "original=" << it->second << ", new=" << info;
     }
   }
   var_map_[var] = info;
@@ -222,7 +222,7 @@ std::function<void()> RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c
   literal_constraints_.push_back(operator()(constraint));
   size_t new_literal_size = literal_constraints_.size();
   auto frecover = [old_literal_size, new_literal_size, this]() {
-    CHECK_EQ(literal_constraints_.size(), new_literal_size);
+    ICHECK_EQ(literal_constraints_.size(), new_literal_size);
     literal_constraints_.resize(old_literal_size);
   };
   return frecover;
@@ -461,8 +461,8 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) {
 
   // x / 2.0 = x * 0.5
   if (const FloatImmNode* ptr = op->b.as<FloatImmNode>()) {
-    CHECK(op->dtype.is_float() ||
-          datatype::Registry::Global()->GetTypeRegistered(op->dtype.code()));
+    ICHECK(op->dtype.is_float() ||
+           datatype::Registry::Global()->GetTypeRegistered(op->dtype.code()));
     return op->a * make_const(op->b.dtype(), 1.0 / ptr->value);
   }
diff --git a/src/arith/solve_linear_equation.cc b/src/arith/solve_linear_equation.cc
index cda1ec230cbc..22bf7360563d 100644
--- a/src/arith/solve_linear_equation.cc
+++ b/src/arith/solve_linear_equation.cc
@@ -42,8 +42,8 @@ void SmithNormalFormDiag(std::vector<std::vector<int64_t>>* S, std::vector
   if (S->empty() || V->empty()) return;
   size_t m = S->size();
   size_t n = (*S)[0].size();  // n is # of variables
-  CHECK_EQ(V->size(), n);
-  CHECK_EQ((*V)[0].size(), n);
+  ICHECK_EQ(V->size(), n);
+  ICHECK_EQ((*V)[0].size(), n);
 
   for (size_t index = 0; index < std::min(m, n); ++index) {
     // Here A is partially diagonalized, that is A[i, j] is zero for all i, j
diff --git a/src/arith/solve_linear_inequality.cc b/src/arith/solve_linear_inequality.cc
index eec916ac6c22..f4de9ffb197b 100644
--- a/src/arith/solve_linear_inequality.cc
+++ b/src/arith/solve_linear_inequality.cc
@@ -268,7 +268,7 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
   Map<Var, IntGroupBounds> res_bounds;
   for (const Var& v : system_to_solve->variables) {
-    CHECK(!res_bounds.count(v))
+    ICHECK(!res_bounds.count(v))
         << "Variable " << v
         << " appears more than one time in the `variables` which might be a bug";
@@ -436,7 +436,7 @@ IntConstraints SolveInequalitiesToRange(const IntConstraints& inequalities) {
   analyzer.Bind(vranges);
 
   const Var& var = *it;
-  CHECK(solved_bounds.count(var));
+  ICHECK(solved_bounds.count(var));
   auto bnd = solved_bounds[var];
   if (is_one(bnd->coef) && !bnd->equal.empty()) {
     // There is an equation of the form `v == expr`, so this variable can be completely removed.
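
Every hunk in this patch is the same mechanical rename: dmlc-style `CHECK*` assertions become `ICHECK*`, marking a failure as an internal invariant violation rather than a user-input error. As a point of reference only — this is an illustrative sketch, not TVM's actual definition — a stream-capturing macro family of this kind can be built like so (all names here are hypothetical stand-ins):

    #include <cstdlib>
    #include <iostream>
    #include <sstream>

    // Accumulates a message and aborts when it goes out of scope.
    class FatalStream {
     public:
      FatalStream(const char* file, int line) { os_ << file << ":" << line << ": "; }
      std::ostringstream& stream() { return os_; }
      ~FatalStream() {
        std::cerr << os_.str() << std::endl;
        std::abort();
      }

     private:
      std::ostringstream os_;
    };

    // On failure, label the error as *internal*: the invariant is the
    // library's responsibility, so the message asks for a bug report rather
    // than an input fix. (The dangling-else hazard of this one-line `if`
    // form is ignored in this sketch.)
    #define ICHECK(cond)                                  \
      if (!(cond))                                        \
      FatalStream(__FILE__, __LINE__).stream()            \
          << "InternalError: Check failed: (" #cond "): "

    #define ICHECK_EQ(a, b) ICHECK((a) == (b))
    #define ICHECK_NE(a, b) ICHECK((a) != (b))
    #define ICHECK_LT(a, b) ICHECK((a) < (b))

    int main() {
      int divisor = 2;
      ICHECK_NE(divisor, 0) << "Divide by zero";  // passes silently
      return 0;
    }

The trailing `<<` chain works because the macro expands to a statement whose last expression is the `std::ostringstream&`, so call sites such as `ICHECK_NE(pb->value, 0) << "Divide by zero";` in the hunks above read naturally.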
diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
index 75fd27ef9fa8..c6cf094ee202 100755
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -553,7 +553,7 @@ class FlopEstimator : public ExprFunctor {
       if (pop->attrs.count("FLOP")) {
         // Use user-provided FLOP
         auto pint = pop->attrs["FLOP"].as<IntImmNode>();
-        CHECK(pint != nullptr);
+        ICHECK(pint != nullptr);
         ret += pint->value;
       } else {
         // Estimate by parsing the compute body
@@ -719,11 +719,11 @@ class IndexRewriter : public StmtExprMutator {
     for (const auto& arg : op->indices) {
       std::string axis_name;
       if (const auto* int_imm = arg.as<IntImmNode>()) {
-        CHECK_EQ(int_imm->value, 0);
+        ICHECK_EQ(int_imm->value, 0);
         axis_name = "IntImm";
       } else {
         axis_name = AxisBaseName(CleanName(Downcast<Var>(arg)->name_hint));
-        CHECK_EQ(name_to_arg.count(axis_name), 0);
+        ICHECK_EQ(name_to_arg.count(axis_name), 0);
         name_to_arg[axis_name] = arg;
       }
     }
@@ -733,7 +733,7 @@ class IndexRewriter : public StmtExprMutator {
     for (int i = new_names_.size() - 1; i >= 0; --i) {
       auto ori_iter_name = new_names_[i];
       auto name_it = name_to_arg.find(ori_iter_name);
-      CHECK(name_it != name_to_arg.end());
+      ICHECK(name_it != name_to_arg.end());
       PrimExpr ori_arg = name_it->second;
 
       PrimExpr mod_factor = new_shape_[i];
@@ -772,12 +772,12 @@ std::string GetOrigLayout(std::set<std::string>* placeholder_axis_names, const t
   std::ostringstream os;
   uint32_t i = 0;
   const auto& placeholder_op = placeholder->op;
-  CHECK_GT(extractor.read_access.count(placeholder_op), 0);
+  ICHECK_GT(extractor.read_access.count(placeholder_op), 0);
   for (const auto& ev : extractor.read_access[placeholder_op]) {
     for (const auto& e : ev) {
       std::string axis_name;
       if (const auto* int_imm = e.as<IntImmNode>()) {
-        CHECK_EQ(int_imm->value, 0);
+        ICHECK_EQ(int_imm->value, 0);
         axis_name = "IntImm";
       } else {
         axis_name = AxisBaseName(CleanName(Downcast<Var>(e)->name_hint));
@@ -788,7 +788,7 @@ std::string GetOrigLayout(std::set<std::string>* placeholder_axis_names, const t
     }
   }
 
-  CHECK_EQ(placeholder_axis_names->size(), placeholder->shape.size());
+  ICHECK_EQ(placeholder_axis_names->size(), placeholder->shape.size());
   std::string orig_layout = os.str();
   os.str("");
   // TODO(minmin): uncomment this line for relay integration
@@ -837,7 +837,7 @@ std::string GetNewLayout(Array<PrimExpr>* new_shape, const State& state, const i
     ExtractOriginalIterators(iter->name, &ori_iter_names);
     // fused iters have been replaced with iter->orig_iters.
     // So there should be only one ori iter name extracted from iter->name.
-    CHECK_EQ(ori_iter_names.size(), 1);
+    ICHECK_EQ(ori_iter_names.size(), 1);
     auto ori_iter_name = AxisBaseName(*ori_iter_names.begin());
     new_axis_names.push_back(ori_iter_name);
   }
@@ -937,7 +937,7 @@ void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {
         new_body.push_back(index_rewriter.Rewrite(body));
       }
       old_compute_op = op;
-      CHECK(!new_compute_op.defined());
+      ICHECK(!new_compute_op.defined());
       new_compute_op = te::ComputeOp(pop->name, pop->tag, pop->attrs, pop->axis, new_body);
     }
   }
@@ -1109,7 +1109,7 @@ String ComputeDAG::PrintStepsAsPython(const Array<Step>& transform_steps) const
 }
 
 State ComputeDAG::InferBound(const State& state) const {
-  CHECK(state->concrete) << "Only concrete state can be processed to get bound info.";
+  ICHECK(state->concrete) << "Only concrete state can be processed to get bound info.";
   State ret_state;
   StateNode* pstate;
@@ -1267,7 +1267,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
           ss << ".v" << k;
         }
         if (auto preduce = pop->body[k].as<ReduceNode>()) {
-          CHECK_LT(k, preduce->combiner->result.size());
+          ICHECK_LT(k, preduce->combiner->result.size());
           PrimExpr combiner = preduce->combiner->result[k];
           if (combiner->IsInstance<AddNode>()) {
             ss << " += " << preduce->source[0] << "\n";
@@ -1300,7 +1300,7 @@ TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAG")
       if (tensors) {
        return ComputeDAG(tensors.value());
      }
-      CHECK(sch) << "Both tensors and schedule are null";
+      ICHECK(sch) << "Both tensors and schedule are null";
      return ComputeDAG(sch.value());
    });
diff --git a/src/auto_scheduler/cost_model.cc b/src/auto_scheduler/cost_model.cc
index 3d540c7b6610..4ed5ca2bfbe8 100755
--- a/src/auto_scheduler/cost_model.cc
+++ b/src/auto_scheduler/cost_model.cc
@@ -34,7 +34,7 @@ TVM_REGISTER_OBJECT_TYPE(PythonBasedModelNode);
 RandomModel::RandomModel() {
   ObjectPtr<RandomModelNode> node = make_object<RandomModelNode>();
   const auto* f = runtime::Registry::Get("auto_scheduler.cost_model.random_fill_float");
-  CHECK(f != nullptr);
+  ICHECK(f != nullptr);
   node->random_number_func = reinterpret_cast*>(f);
   data_ = std::move(node);
 }
@@ -109,7 +109,7 @@ void PythonBasedModelNode::PredictStages(const SearchTask& task, const Array(flatten_scores[idx++]);
@@ -134,7 +134,7 @@ void PythonBasedModelNode::PredictStages(const SearchTask& task, const Arraypush_back(std::move(scores));
       }
       idx += s_length;
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 15066a98e2bc..8d17c4bba10f 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -298,7 +298,7 @@ class MathOpCounter : public StmtExprVisitor {
 
   void VisitExpr_(const CallNode* op) final {
     auto* pop = op->op.as<OpNode>();
-    CHECK(pop != nullptr);
+    ICHECK(pop != nullptr);
     auto effect_kind = op_call_effect_[GetRef<Op>(pop)];
     bool is_pure =
         effect_kind == CallEffectKind::kPure || effect_kind == CallEffectKind::kExprAnnotation;
@@ -937,7 +937,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
     while (compute_ops_list[pt] < cur_compute_ops - 1e-4) {
       pt++;
     }
-    CHECK_LT(pt, compute_ops_list.size());
+    ICHECK_LT(pt, compute_ops_list.size());
 
     float value;
     if (pt == 0) {
@@ -1323,7 +1323,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i
       tir::transform::Sequential(Array{tir::transform::Simplify()});
   mod = optimize(std::move(mod));
   const auto& it = mod->functions.find(global_var);
-  CHECK(it != mod->functions.end());
+  ICHECK(it != mod->functions.end());
   const auto& prim_func = (*it).second.as<tir::PrimFuncNode>();
   GetPerStoreFeature(prim_func->body, task->hardware_params->cache_line_bytes, max_n_bufs,
                      feature);
@@ -1389,7 +1389,7 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int
   const auto* workload_key_to_tensors =
       tvm::runtime::Registry::Get("auto_scheduler.workload_key_to_tensors");
-  CHECK(workload_key_to_tensors != nullptr);
+  ICHECK(workload_key_to_tensors != nullptr);
 
   // read from file
   RecordReader reader(filename);
@@ -1454,7 +1454,7 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array<MeasureInput>& inputs,
   const auto* workload_key_to_tensors =
       tvm::runtime::Registry::Get("auto_scheduler.workload_key_to_tensors");
-  CHECK(workload_key_to_tensors != nullptr);
+  ICHECK(workload_key_to_tensors != nullptr);
 
   tasks.reserve(inputs.size());
   normalized_throughputs->reserve(inputs.size());
@@ -1548,7 +1548,7 @@ TVMByteArray SerializeFeatures(std::vector<std::vector<float>>&& features,
   size_vector.push_back(static_cast<int>(task_ids.size()));
   total_bytes += sizeof(int) * task_ids.size();
 
-  CHECK_EQ(size_vector.size(), size_vector_size);
+  ICHECK_EQ(size_vector.size(), size_vector_size);
 
   // allocate memory
   out_data->reserve(total_bytes);
@@ -1574,7 +1574,7 @@ TVMByteArray SerializeFeatures(std::vector<std::vector<float>>&& features,
   memmove(ptr, reinterpret_cast<char*>(task_ids.data()), task_ids.size() * sizeof(int));
   ptr += task_ids.size() * sizeof(int);
 
-  CHECK_EQ(ptr - out_data->data(), total_bytes);
+  ICHECK_EQ(ptr - out_data->data(), total_bytes);
   return TVMByteArray{out_data->data(), total_bytes};
 }
diff --git a/src/auto_scheduler/loop_state.cc b/src/auto_scheduler/loop_state.cc
index c3c764fc8e2b..23d6eb64da6c 100755
--- a/src/auto_scheduler/loop_state.cc
+++ b/src/auto_scheduler/loop_state.cc
@@ -114,7 +114,7 @@ void AttachMap::DeleteStage(int stage_id) {
 void AttachMap::UpdateIters(const std::vector<IterKey>& original_iters,
                             const std::vector<IterKey>& new_iters) {
-  CHECK_EQ(original_iters.size(), new_iters.size());
+  ICHECK_EQ(original_iters.size(), new_iters.size());
   AttachMapNode* pnode = CopyOnWrite();
 
   std::unordered_map<IterKey, std::vector<StageKey>> new_iter_to_attached_stages;
   for (size_t i = 0; i < original_iters.size(); ++i) {
@@ -265,8 +265,8 @@ void State::pragma(int stage_id, const Iterator& it, const String& pragma_type)
 void State::reorder(int stage_id, const Array<Iterator>& order) {
   const Stage& stage = operator->()->stages[stage_id];
-  CHECK_EQ(order.size(), stage->iters.size()) << "The order of all iterators "
-                                              << "should be specified";
+  ICHECK_EQ(order.size(), stage->iters.size()) << "The order of all iterators "
+                                               << "should be specified";
   Array<Integer> after_ids;
   GetIndices(stage->iters, order, &after_ids);
   ReorderStep step = ReorderStep(stage_id, after_ids);
diff --git a/src/auto_scheduler/measure.cc b/src/auto_scheduler/measure.cc
index c3ee6a1495e3..6c5c10e5aaee 100755
--- a/src/auto_scheduler/measure.cc
+++ b/src/auto_scheduler/measure.cc
@@ -303,7 +303,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       auto old_config = p->stream.precision(4);
       for (size_t i = 0; i < node->costs.size(); ++i) {
         auto pf = node->costs[i].as<FloatImmNode>();
-        CHECK(pf != nullptr);
+        ICHECK(pf != nullptr);
         p->stream << pf->value;
         if (i != node->costs.size() - 1) {
           p->stream << ",";
diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc
index 99c01b17e78e..66f521e17e80 100755
--- a/src/auto_scheduler/measure_record.cc
+++ b/src/auto_scheduler/measure_record.cc
@@ -53,7 +53,7 @@ struct Handler<::tvm::Array<::tvm::auto_scheduler::Stage>> {
     bool s;
     reader->BeginArray();
     s = reader->NextArrayItem();
-    CHECK(!s);
+    ICHECK(!s);
   }
 };
@@ -80,7 +80,7 @@ struct Handler<::tvm::Array<::tvm::auto_scheduler::Step>> {
       reader->BeginArray();
       data->push_back(::tvm::auto_scheduler::StepReadFromRecord(reader));
       s = reader->NextArrayItem();
-      CHECK(!s);
+      ICHECK(!s);
     }
   }
 };
@@ -97,13 +97,13 @@ struct Handler<::tvm::auto_scheduler::StateNode> {
     bool s;
     reader->BeginArray();
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(&data->stages);
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(&data->transform_steps);
     s = reader->NextArrayItem();
-    CHECK(!s);
+    ICHECK(!s);
   }
 };
@@ -121,15 +121,15 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> {
     std::string str_value;
     reader->BeginArray();
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(&str_value);
     data->workload_key = std::move(str_value);
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(&str_value);
     data->target = ::tvm::Target(str_value);
     s = reader->NextArrayItem();
-    CHECK(!s);
+    ICHECK(!s);
   }
 };
@@ -150,13 +150,13 @@ struct Handler<::tvm::auto_scheduler::MeasureInputNode> {
     bool s;
     reader->BeginArray();
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(task_node.get());
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(state_node.get());
     s = reader->NextArrayItem();
-    CHECK(!s);
+    ICHECK(!s);
 
     data->task = ::tvm::auto_scheduler::SearchTask(task_node);
     data->state = ::tvm::auto_scheduler::State(state_node);
@@ -172,7 +172,7 @@ struct Handler<::tvm::auto_scheduler::MeasureResultNode> {
     writer->BeginArray(false);
     for (const auto& x : data.costs) {
       auto pf = x.as<::tvm::tir::FloatImmNode>();
-      CHECK(pf != nullptr) << "Cost can only contain float values";
+      ICHECK(pf != nullptr) << "Cost can only contain float values";
       writer->WriteArrayItem(pf->value);
     }
     writer->EndArray();
@@ -187,23 +187,23 @@ struct Handler<::tvm::auto_scheduler::MeasureResultNode> {
     bool s;
     reader->BeginArray();
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(&double_list);
     data->costs.clear();
     for (const auto& i : double_list) {
       data->costs.push_back(::tvm::FloatImm(::tvm::DataType::Float(64), i));
     }
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(&data->error_no);
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(&data->all_cost);
     s = reader->NextArrayItem();
-    CHECK(s);
+    ICHECK(s);
     reader->Read(&data->timestamp);
     s = reader->NextArrayItem();
-    CHECK(!s);
+    ICHECK(!s);
   }
 };
diff --git a/src/auto_scheduler/search_policy/empty_policy.cc b/src/auto_scheduler/search_policy/empty_policy.cc
index fba1ac2f42f8..79f98793d848 100644
--- a/src/auto_scheduler/search_policy/empty_policy.cc
+++ b/src/auto_scheduler/search_policy/empty_policy.cc
@@ -57,7 +57,7 @@ State EmptyPolicyNode::Search(int num_measure_trials, int early_stopping,
   // Measure is disabled if num_measure_trials <= 1
   if (num_measure_trials <= 1) {
     const auto& res = SearchOneRound();
-    CHECK_GT(res.size(), 0);
+    ICHECK_GT(res.size(), 0);
 
     return res[0];
   } else {
diff --git a/src/auto_scheduler/search_policy/search_policy.cc b/src/auto_scheduler/search_policy/search_policy.cc
index 8b6d22bb2725..702eec087668 100644
--- a/src/auto_scheduler/search_policy/search_policy.cc
+++ b/src/auto_scheduler/search_policy/search_policy.cc
@@ -39,7 +39,7 @@ void SearchPolicyNode::PreloadMeasuredStates(const String& log_file) {
   RecordReader reader = RecordReader(log_file);
   const auto& res = reader->ReadLines(-1);
   size_t log_size = res.first.size();
-  CHECK_EQ(log_size, res.second.size());
+  ICHECK_EQ(log_size, res.second.size());
 
   if (log_size) {
     Array<State> measured_states;
     std::vector<float> measured_throughputs;
diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
index 60178b342e62..5d6d1d28be1c 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -147,7 +147,7 @@ State SketchPolicyNode::Search(int n_trials, int early_stopping, int num_measure
   if (n_trials <= 1) {
     // No measurement is allowed
     const Array<State>& best_states = SearchOneRound(0);
-    CHECK_GT(best_states.size(), 0);
+    ICHECK_GT(best_states.size(), 0);
     return best_states[0];
   } else {
     int num_random =
@@ -348,10 +348,10 @@ Array<State> SketchPolicyNode::GenerateSketches() {
     auto pstate = state.CopyOnWrite();
     for (size_t step_id = 0; step_id < pstate->transform_steps.size(); ++step_id) {
       if (pstate->transform_steps[step_id]->IsInstance<RfactorStepNode>()) {
-        CHECK_GE(step_id, 1);
+        ICHECK_GE(step_id, 1);
         int split_step_id = static_cast<int>(step_id - 1);
         auto step = pstate->transform_steps[split_step_id].as<SplitStepNode>();
-        CHECK(step != nullptr);
+        ICHECK(step != nullptr);
         pstate->transform_steps.Set(
             split_step_id,
             SplitStep(step->stage_id, step->iter_id, step->extent, {NullOpt}, step->inner_to_outer));
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index 1b965c9886a1..1b6cc06a4c45 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -115,7 +115,8 @@ SketchGenerationRule::ConditionKind RuleMultiLevelTilingWithFusion::MeetConditio
 std::vector<std::pair<State, int>> RuleMultiLevelTilingWithFusion::Apply(
     const SketchPolicyNode& policy, const State& state, int stage_id) const {
   int target_stage_id;
-  CHECK(HasSingleElementwiseMatchedConsumer(policy.search_task, state, stage_id, &target_stage_id));
+  ICHECK(
+      HasSingleElementwiseMatchedConsumer(policy.search_task, state, stage_id, &target_stage_id));
   const std::string& multi_level_tiling_structure =
       IsGPUTask(policy.search_task)
           ? GetStringParam(policy.params, SketchParamKey::MultiLevelTiling::gpu_structure)
@@ -296,7 +297,7 @@ std::vector<std::pair<State, int>> RuleSimplifyComputeWithConstTensor::Apply(
       unrolled_inner_iters.push_back(tmp_s.unroll(stage_id, iter));
     } else {
       // tile other space indices
-      CHECK(iter->iter_kind == IteratorKind::kSpatial);
+      ICHECK(iter->iter_kind == IteratorKind::kSpatial);
       tiled_outer_iters.push_back(
           tmp_s.split(stage_id, iter, Array<Optional<Integer>>(tile_level - 1, NullOpt)));
     }
@@ -319,7 +320,7 @@ std::vector<std::pair<State, int>> RuleSimplifyComputeWithConstTensor::Apply(
 
 SketchGenerationRule::ConditionKind RuleCrossThreadReduction::MeetCondition(
     const SketchPolicyNode& policy, const State& state, int stage_id) const {
-  CHECK(IsGPUTask(policy.search_task));
+  ICHECK(IsGPUTask(policy.search_task));
 
   // If it is an intermediate state created by RuleAddCacheWrite,
   // we just skip it.
@@ -386,14 +387,14 @@ std::vector<std::pair<State, int>> RuleCrossThreadReduction::Apply(const SketchP
       // If the target stage does not have split step,
       // it must be a simple stage without reduce iters.
       // We then should do a split for it.
-      CHECK(!HasReduceIter(target_stage));
+      ICHECK(!HasReduceIter(target_stage));
       const auto& split_res = tmp_s.split(target_stage_id, target_stage->iters.back(),
                                           {Integer(task->hardware_params->warp_size)});
       tmp_s.bind(target_stage_id, split_res[1], IteratorAnnotation::kThreadX);
       split_step_ids.push_back(tmp_s->transform_steps.size() - 2);
     }
 
-    CHECK_EQ(split_step_ids.size(), 1);
+    ICHECK_EQ(split_step_ids.size(), 1);
 
     const Iterator& target_iter = tmp_s->stages[target_stage_id]->iters[num_common_outer - 1];
     const auto& split_res = tmp_s.follow_split(stage_id, fused_reduce_iter, split_step_ids[0], 1);
@@ -429,13 +430,13 @@ std::vector<std::pair<State, int>> RuleSpecialComputeLocationGPU::Apply(
     const SketchPolicyNode& policy, const State& state, int stage_id) const {
   State tmp_s = state;
   const std::set<int>& consumers = GetConsumers(policy.search_task, state, stage_id);
-  CHECK_EQ(consumers.size(), 1);
+  ICHECK_EQ(consumers.size(), 1);
 
   // Get the last outer space iterator that is not unrolled.
   const Stage& target_stage = state->stages[*consumers.begin()];
   for (size_t i = 0; i < target_stage->iters.size(); ++i) {
     if (target_stage->iters[i]->annotation == IteratorAnnotation::kUnroll) {
-      CHECK_GT(i, 0);
+      ICHECK_GT(i, 0);
       tmp_s.compute_at(stage_id, *consumers.begin(), target_stage->iters[i - 1]);
       break;
@@ -467,7 +468,7 @@ PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* p
       continue;
     }
 
-    CHECK(ps->extent);
+    ICHECK(ps->extent);
     int extent = GetIntImm(ps->extent.value());
     const auto& candidate_lens = policy->split_memo.GetFactorizationSchemes(
         extent, ps->lengths.size(), max_innermost_split_factor);
@@ -720,10 +721,10 @@ PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* pol
     } else if (stage->compute_at != ComputeAtKind::kIter) {
       // This stage is not multi-level tiled,
       // so it must be produced by RuleCrossThreadReduction.
-      CHECK(HasCrossThreadReduction(*state, stage_id));
+      ICHECK(HasCrossThreadReduction(*state, stage_id));
     } else {
       const auto res = (*state)->attach_map->stage_to_attach_iter.find(stage_id);
-      CHECK(res != (*state)->attach_map->stage_to_attach_iter.end());
+      ICHECK(res != (*state)->attach_map->stage_to_attach_iter.end());
       multi_level_tiling_root_set.insert(res->second.first);
     }
   }
@@ -782,9 +783,9 @@ PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* pol
       std::vector<Iterator> to_fuse;
       int total_space_extent = 1;
       for (const auto& i : pop->root_iter_vars()) {
-        CHECK(i->dom.defined());
+        ICHECK(i->dom.defined());
         const auto& pint = i->dom->extent.as<IntImmNode>();
-        CHECK(pint);
+        ICHECK(pint);
         total_space_extent *= pint->value;
       }
@@ -847,7 +848,7 @@ PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* pol
         // Do cooperative fetching for the cache read stage.
         // Get spatial_split_step_ids from the root stage
         const auto& it = (*state)->attach_map->stage_to_attach_iter.find(stage_id);
-        CHECK(it != (*state)->attach_map->stage_to_attach_iter.end());
+        ICHECK(it != (*state)->attach_map->stage_to_attach_iter.end());
         Array<Integer> spatial_split_step_ids = GetSpatialSplitStepIds(*state, it->second.first);
 
         // Fuse all iterators to do cooperative fetching
@@ -897,7 +898,7 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol
   do {
     step_id = split_step_ids[(*rand_gen)() % split_step_ids.size()];
     ps = (*state)->transform_steps[step_id].as<SplitStepNode>();
-    CHECK(ps != nullptr);
+    ICHECK(ps != nullptr);
     extent = GetIntImm(ps->extent.value());
     retry_ct += 1;
   } while (retry_ct < static_cast<int>(split_step_ids.size()) << 2 && (extent == 1 || extent == 0));
@@ -929,7 +930,7 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol
   // Divide one factor from lengths[src_idx] and multiply it to lengths[dst_idx]
   size_t dst_idx = random_perm[(i + 1) % random_perm.size()];
   const std::vector<int>& factors = policy->split_memo.GetFactors(length);
-  CHECK_GE(factors.size(), 1);
+  ICHECK_GE(factors.size(), 1);
 
   int divide_factor;
   if (dst_idx == lengths.size() - 1) {
@@ -961,7 +962,7 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol
     }
   }
 
-  CHECK_LE(GetIntImm(new_lengths.back()), max_innermost_split_factor);
+  ICHECK_LE(GetIntImm(new_lengths.back()), max_innermost_split_factor);
 
   StateNode* pstate = state->CopyOnWrite();
   pstate->transform_steps.Set(
@@ -994,7 +995,7 @@ PopulationGenerationRule::ResultKind MutateAutoUnroll::Apply(SketchPolicyNode* p
   // Randomly pick up an auto unroll pragma step
   auto step_id = pragma_steps[(*rand_gen)() % pragma_steps.size()];
   auto ps = (*state)->transform_steps[step_id].as<PragmaStepNode>();
-  CHECK(ps);
+  ICHECK(ps);
 
   // Mutate its value to a random candidate
   auto val = std::to_string(auto_unroll_configs[(*rand_gen)() % auto_unroll_configs.size()]);
@@ -1035,7 +1036,7 @@ PopulationGenerationRule::ResultKind MutateComputeLocation::Apply(SketchPolicyNo
   size_t step_id = compute_at_steps[(*rand_gen)() % compute_at_steps.size()];
   auto ps = (*state)->transform_steps[step_id].as<ComputeAtStepNode>();
   int stage_inc = GetTargetStageIDInState(*state, step_id) - ps->stage_id;
-  CHECK(ps != nullptr);
+  ICHECK(ps != nullptr);
 
   // Randomly pick a new computation location
   std::vector<std::pair<int, int>> candidates =
@@ -1156,14 +1157,14 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol
       if (ps->iter_id == 0) {
         step = AnnotationStep(ps->stage_id, 0, ps->annotation);
       } else {
-        CHECK_LE(ps->iter_id + iter_offset, tmp_s->stages[stage_id]->iters.size());
+        ICHECK_LE(ps->iter_id + iter_offset, tmp_s->stages[stage_id]->iters.size());
         step = AnnotationStep(ps->stage_id, ps->iter_id + iter_offset, ps->annotation);
       }
     } else if (auto ps = step.as<PragmaStepNode>()) {
       if (ps->iter_id == 0) {
         step = PragmaStep(ps->stage_id, 0, ps->pragma_type);
       } else {
-        CHECK_LE(ps->iter_id + iter_offset, tmp_s->stages[stage_id]->iters.size());
+        ICHECK_LE(ps->iter_id + iter_offset, tmp_s->stages[stage_id]->iters.size());
         step = PragmaStep(ps->stage_id, ps->iter_id + iter_offset, ps->pragma_type);
       }
     } else {
diff --git a/src/auto_scheduler/search_policy/utils.cc b/src/auto_scheduler/search_policy/utils.cc
index 9e72eeb3f0c2..3e2f7aaed44f 100644
--- a/src/auto_scheduler/search_policy/utils.cc
+++ b/src/auto_scheduler/search_policy/utils.cc
@@ -32,7 +32,7 @@ namespace auto_scheduler {
 Array<Integer> GetSpatialSplitStepIds(const State& s, int stage_id) {
   const auto& stage = s->stages[stage_id];
   const auto& pop = s->stages[stage_id]->op.as<te::ComputeOpNode>();
-  CHECK(pop != nullptr);
+  ICHECK(pop != nullptr);
   const std::set<std::string>& no_split_at_inner_name_set =
       stage->op->attrs.count(SearchPolicyKey::no_split_at_inner)
           ? GetIterNameSetParam(stage->op->attrs, SearchPolicyKey::no_split_at_inner)
@@ -182,7 +182,7 @@ State DoMultiLevelTiling(const State& state, int stage_id, const std::string& fo
   for (const auto& iter : state->stages[stage_id]->iters) {
     if (!no_split_at_inner_name_set.count(iter->name)) {
       if (iter->iter_kind == IteratorKind::kSpatial) {
-        CHECK_GE(n_space, 1);
+        ICHECK_GE(n_space, 1);
 
         if (n_space == 1) {
           space_levels[0].push_back(iter);
@@ -194,7 +194,7 @@ State DoMultiLevelTiling(const State& state, int stage_id, const std::string& fo
           spatial_split_step_ids->push_back(tmp_s->transform_steps.size() - 1);
         }
       } else if (iter->iter_kind == IteratorKind::kReduction) {
-        CHECK_GE(n_reduce, 1);
+        ICHECK_GE(n_reduce, 1);
 
         if (n_reduce == 1) {
           reduce_levels[0].push_back(iter);
@@ -219,26 +219,26 @@ State DoMultiLevelTiling(const State& state, int stage_id, const std::string& fo
   }
 
   if (!space_outer.empty()) {
-    CHECK(!space_levels.empty());
+    ICHECK(!space_levels.empty());
     space_levels.front().insert(space_levels.front().begin(),
                                 std::make_move_iterator(space_outer.begin()),
                                 std::make_move_iterator(space_outer.end()));
   }
   if (!space_inner.empty()) {
-    CHECK(!space_levels.empty());
+    ICHECK(!space_levels.empty());
     space_levels.back().insert(space_levels.back().begin(),
                                std::make_move_iterator(space_inner.begin()),
                                std::make_move_iterator(space_inner.end()));
   }
 
   if (!reduce_outer.empty()) {
-    CHECK(!reduce_levels.empty());
+    ICHECK(!reduce_levels.empty());
     reduce_levels.front().insert(reduce_levels.front().begin(),
                                  std::make_move_iterator(reduce_outer.begin()),
                                  std::make_move_iterator(reduce_outer.end()));
   }
   if (!reduce_inner.empty()) {
-    CHECK(!reduce_levels.empty());
+    ICHECK(!reduce_levels.empty());
     reduce_levels.back().insert(reduce_levels.back().begin(),
                                 std::make_move_iterator(reduce_inner.begin()),
                                 std::make_move_iterator(reduce_inner.end()));
@@ -274,7 +274,7 @@ State FollowTiling(const State& state, int stage_id, const std::vector<int>& spl
   Array<Iterator> split_res;
   auto pop = state->stages[stage_id]->op.as<te::ComputeOpNode>();
-  CHECK(pop != nullptr);
+  ICHECK(pop != nullptr);
   const Stage& stage = state->stages[stage_id];
   const std::set<std::string>& no_split_at_inner_name_set =
       stage->op->attrs.count(SearchPolicyKey::no_split_at_inner)
@@ -285,8 +285,8 @@ State FollowTiling(const State& state, int stage_id, const std::vector<int>& spl
     no_split_at_inner_name_in_stage_cnt += no_split_at_inner_name_set.count(iter->name);
   }
 
-  CHECK_EQ(state->stages[stage_id]->iters.size() - no_split_at_inner_name_in_stage_cnt,
-           split_step_ids.size());
+  ICHECK_EQ(state->stages[stage_id]->iters.size() - no_split_at_inner_name_in_stage_cnt,
+            split_step_ids.size());
 
   State tmp_s = state;
   int ct = 0;
@@ -328,7 +328,7 @@ State FollowTiling(const State& state, int stage_id, const std::vector<int>& spl
       } else if (n_split == 2) {
         space_2.push_back(iter);
       } else {
-        CHECK_EQ(n_split, 3);
+        ICHECK_EQ(n_split, 3);
         space_3.push_back(iter);
       }
     }
diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h
index 5c015ca46a9b..f0c4cbca9ca0 100644
--- a/src/auto_scheduler/search_policy/utils.h
+++ b/src/auto_scheduler/search_policy/utils.h
@@ -99,29 +99,29 @@ inline int OperationToStage(const te::Operation& op, const State& state) {
 
 /*! \brief Get an integer from a tvm str Map. */
 inline int GetIntParam(const Map<String, ObjectRef>& attr_dict, const std::string& key) {
-  CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
+  ICHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
   auto pint = attr_dict[key].as<IntImmNode>();
-  CHECK(pint != nullptr);
+  ICHECK(pint != nullptr);
   return pint->value;
 }
 
 /*! \brief Get a double from a tvm str Map. */
 inline double GetDoubleParam(const Map<String, ObjectRef>& attr_dict, const std::string& key) {
-  CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
+  ICHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
   auto pdouble = attr_dict[key].as<FloatImmNode>();
-  CHECK(pdouble != nullptr);
+  ICHECK(pdouble != nullptr);
   return pdouble->value;
 }
 
 /*! \brief Get a string from a tvm str Map. */
 inline std::string GetStringParam(const Map<String, ObjectRef>& attr_dict, const std::string& key) {
-  CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
+  ICHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
   const auto& target = attr_dict[key];
   if (auto pstr = target.as<StringImmNode>()) {
     return pstr->value;
   }
   auto pstr = target.as<StringObj>();
-  CHECK(pstr != nullptr);
+  ICHECK(pstr != nullptr);
   return pstr->data;
 }
 
@@ -129,9 +129,9 @@ inline std::string GetStringParam(const Map<String, ObjectRef>& attr_dict, const
 inline std::set<std::string> GetIterNameSetParam(const Map<String, ObjectRef>& attr_dict,
                                                  const std::string& key) {
   std::set<std::string> ret;
-  CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
+  ICHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
   auto names = attr_dict[key].as<ArrayNode>();
-  CHECK(names != nullptr);
+  ICHECK(names != nullptr);
   for (const auto& name : *names) {
     ret.insert(name.as<StringObj>()->data);
   }
@@ -477,7 +477,7 @@ inline bool HasCrossThreadReduction(const State& state, int stage_id) {
 
 /*! \brief Return whether the stage has been tiled already. */
 inline bool IsTiled(const Stage& stage) {
   auto op = stage->op.as<te::ComputeOpNode>();
-  CHECK(op != nullptr);
+  ICHECK(op != nullptr);
   return stage->iters.size() != op->axis.size() + op->reduce_axis.size();
 }
 
@@ -502,7 +502,7 @@ inline void ExtractOriginalIterators(const std::string& name, std::set
   auto pop = s->stages[stage_id]->op.as<te::ComputeOpNode>();
-  CHECK(pop != nullptr);
+  ICHECK(pop != nullptr);
   std::set<std::string> original_names;
 
   const std::set<std::string>& no_split_at_inner_name_set =
@@ -583,7 +583,7 @@ inline State FuseAllReductionIterators(const State& state, int stage_id, Iterato
     }
   }
 
-  CHECK(!reduce_iters->empty());
+  ICHECK(!reduce_iters->empty());
   State tmp_s = state;
   if (reduce_iters->size() > 1) {
     *fused_iter = tmp_s.fuse(stage_id, *reduce_iters);
@@ -609,7 +609,7 @@ inline State FuseAllOuterSpaceIterators(const State& state, int stage_id, Iterat
       to_fuse.push_back(it);
   }
 
-  CHECK(!to_fuse.empty());
+  ICHECK(!to_fuse.empty());
   State tmp_s = state;
   if (to_fuse.size() > 1) {
     *fused_iter = tmp_s.fuse(stage_id, to_fuse);
@@ -649,7 +649,7 @@ inline int RandomChoose(const std::vector<double>& prefix_sum_probs, std::mt19937* random_gen) {
   std::uniform_real_distribution<> dis(0.0, 1.0);
   double x = dis(*random_gen);
 
-  CHECK(!prefix_sum_probs.empty());
+  ICHECK(!prefix_sum_probs.empty());
 
   return std::lower_bound(prefix_sum_probs.begin(), prefix_sum_probs.end(), x) -
          prefix_sum_probs.begin();
diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index e3f35e9f0c19..0b85a03f0671 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -53,7 +53,7 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
     auto ctx = TVMContext{kDLGPU, 0};
     auto func = tvm::runtime::Registry::Get("device_api.gpu");
-    CHECK(func != nullptr) << "Cannot find GPU device_api in registry";
+    ICHECK(func != nullptr) << "Cannot find GPU device_api in registry";
     auto device_api = static_cast<runtime::DeviceAPI*>(((*func)()).operator void*());
 
     tvm::runtime::TVMRetValue ret;
diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc
index 73f673421378..852f1e1f17d8 100755
--- a/src/auto_scheduler/transform_step.cc
+++ b/src/auto_scheduler/transform_step.cc
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -43,7 +44,7 @@ struct Handler<::tvm::Array<::tvm::Integer>> {
   inline static void Write(dmlc::JSONWriter* writer, const ::tvm::Array<::tvm::Integer>& array) {
     writer->BeginArray(false);
     for (const auto& i : array) {
-      CHECK(i.defined());
+      ICHECK(i.defined());
       writer->WriteArrayItem(i->value);
     }
     writer->EndArray();
@@ -65,7 +66,7 @@ struct Handler<::tvm::Array<::tvm::Optional<::tvm::Integer>>> {
                            const ::tvm::Array<::tvm::Optional<::tvm::Integer>>& array) {
     writer->BeginArray(false);
     for (const auto& i : array) {
-      CHECK(i);
+      ICHECK(i);
       writer->WriteArrayItem(i.value()->value);
     }
     writer->EndArray();
@@ -125,7 +126,7 @@ Step StepReadFromRecord(dmlc::JSONReader* reader) {
   std::string name;
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&name);
   if (name == AnnotationStepNode::record_prefix_str) {
     return AnnotationStep(reader);
@@ -283,13 +284,13 @@ AnnotationStep::AnnotationStep(dmlc::JSONReader* reader) {
   auto node = make_object<AnnotationStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->iter_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   int int_val;
   reader->Read(&int_val);
   node->annotation = IteratorAnnotation(int_val);
@@ -308,7 +309,7 @@ Iterator AnnotationStepNode::ApplyToState(State* state) const {
   const Stage& stage = (*state)->stages[stage_id];
   Iterator it = stage->iters[iter_id];
 
-  CHECK(it->annotation == IteratorAnnotation::kNone);
+  ICHECK(it->annotation == IteratorAnnotation::kNone);
   Iterator new_it = Iterator(it->name, it->range, it->iter_kind, annotation, &it->orig_iters);
   Stage new_stage = stage;
   new_stage.CopyOnWrite()->iters.Set(iter_id, new_it);
@@ -410,7 +411,7 @@ FuseStep::FuseStep(int stage_id, const Array<Integer>& fused_ids) {
   auto node = make_object<FuseStepNode>();
   node->stage_id = stage_id;
   for (const auto& x : fused_ids) {
-    CHECK(x->IsInstance<IntImmNode>());
+    ICHECK(x->IsInstance<IntImmNode>());
   }
   node->fused_ids = fused_ids;
   data_ = std::move(node);
@@ -420,10 +421,10 @@ FuseStep::FuseStep(dmlc::JSONReader* reader) {
   auto node = make_object<FuseStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->fused_ids);
   data_ = std::move(node);
 }
@@ -446,7 +447,7 @@ Iterator FuseStepNode::ApplyToState(State* state) const {
 
   for (size_t i = 0; i < fused_ids.size(); ++i) {
     if (i > 0) {
-      CHECK_EQ(fused_ids[i]->value, fused_ids[i - 1]->value + 1);
+      ICHECK_EQ(fused_ids[i]->value, fused_ids[i - 1]->value + 1);
     }
 
     if (i != fused_ids.size() - 1) {
@@ -574,13 +575,13 @@ PragmaStep::PragmaStep(dmlc::JSONReader* reader) {
   auto node = make_object<PragmaStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->iter_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   std::string string_value;
   reader->Read(&string_value);
   node->pragma_type = std::move(string_value);
@@ -609,7 +610,7 @@ void PragmaStepNode::ApplyToState(State* state) const {
         break;
       }
     }
-    CHECK_LT(pos, pragma_type.size()) << "max step value not found.";
+    ICHECK_LT(pos, pragma_type.size()) << "max step value not found.";
     stage.CopyOnWrite()->attrs.auto_unroll_max_step = atoi(pragma_type.c_str() + pos + 1);
     pstate->stages.Set(stage_id, std::move(stage));
   } else {
@@ -628,7 +629,7 @@ void PragmaStepNode::ApplyToSchedule(Array<te::Stage>* stages,
         break;
       }
    }
-    CHECK_LT(pos, pragma_type.size()) << "max step value not found.";
+    ICHECK_LT(pos, pragma_type.size()) << "max step value not found.";
     int value = atoi(pragma_type.c_str() + pos + 1);
     stage.pragma(axes[iter_id], "auto_unroll_max_step", value);
     stage.pragma(axes[iter_id], "unroll_explicit", true);
@@ -651,7 +652,7 @@ String PragmaStepNode::PrintAsPythonAPI(Array<te::Stage>* stages,
        break;
      }
    }
-    CHECK_LT(pos, pragma_type.size()) << "max step value not found.";
+    ICHECK_LT(pos, pragma_type.size()) << "max step value not found.";
     int value = atoi(pragma_type.c_str() + pos + 1);
     ss << "s[" << op_name << "].pragma("
        << CleanName((*stage_to_axes)[stage][iter_id]->var->name_hint, op_name)
@@ -674,7 +675,7 @@ ReorderStep::ReorderStep(int stage_id, const Array<Integer>& after_ids) {
   auto node = make_object<ReorderStepNode>();
   node->stage_id = stage_id;
   for (const auto& x : after_ids) {
-    CHECK(x->IsInstance<IntImmNode>());
+    ICHECK(x->IsInstance<IntImmNode>());
   }
   node->after_ids = after_ids;
   data_ = std::move(node);
@@ -684,10 +685,10 @@ ReorderStep::ReorderStep(dmlc::JSONReader* reader) {
   auto node = make_object<ReorderStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->after_ids);
   data_ = std::move(node);
 }
@@ -713,7 +714,7 @@ void ReorderStepNode::ApplyToSchedule(Array<te::Stage>* stages,
                                       StageToAxesMap* stage_to_axes) const {
   auto stage = (*stages)[stage_id];
   const Array<IterVar>& axes = stage_to_axes->at(stage);
-  CHECK_EQ(after_ids.size(), axes.size());
+  ICHECK_EQ(after_ids.size(), axes.size());
 
   Array<IterVar> new_axes;
   new_axes.reserve(axes.size());
@@ -879,7 +880,7 @@ String PrintSplitAsPythonAPI(Array<te::Stage>* stages, StageToAxesMap* stage_to_
   const auto& func_name = CleanName(stage->op->name);
   const auto& outs =
       ApplySplitToSchedule(stages, stage_to_axes, stage_id, iter_id, lengths, inner_to_outer);
-  CHECK_EQ(outs.size(), lengths.size() + 1);
+  ICHECK_EQ(outs.size(), lengths.size() + 1);
 
   std::stringstream ss;
   int size = static_cast<int>(lengths.size());
@@ -921,23 +922,23 @@ SplitStep::SplitStep(dmlc::JSONReader* reader) {
   auto node = make_object<SplitStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->iter_id);
   int int_val;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&int_val);
   if (int_val) {
     node->extent = Integer(int_val);
   }
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->lengths);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->inner_to_outer);
   data_ = std::move(node);
 }
@@ -988,14 +989,14 @@ void FollowSplitStepNode::WriteToRecord(dmlc::JSONWriter* writer) const {
 Array<Optional<Integer>> FollowSplitStepNode::ExtractSplitLengths(
     const Array<Step>& transform_steps) const {
   // Make sure src_step_id is within the range of transform_steps.
-  CHECK_LT(src_step_id, transform_steps.size());
+  ICHECK_LT(src_step_id, transform_steps.size());
   auto ps = transform_steps[src_step_id].as<SplitStepNode>();
-  CHECK(ps != nullptr);
+  ICHECK(ps != nullptr);
 
   // Make sure the size of ps->lengths is not smaller than n_split-1.
   // Note that the number of actual splitting factors of src_step is ps->lengths.size()+1.
-  CHECK_LE(n_split, ps->lengths.size() + 1);
-  CHECK(ps != nullptr);
+  ICHECK_LE(n_split, ps->lengths.size() + 1);
+  ICHECK(ps != nullptr);
 
   Array<Optional<Integer>> lengths;
   lengths.reserve(n_split);
@@ -1029,16 +1030,16 @@ FollowSplitStep::FollowSplitStep(dmlc::JSONReader* reader) {
   auto node = make_object<FollowSplitStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->iter_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->src_step_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->n_split);
   data_ = std::move(node);
 }
@@ -1079,19 +1080,19 @@ FollowFusedSplitStep::FollowFusedSplitStep(dmlc::JSONReader* reader) {
   auto node = make_object<FollowFusedSplitStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->iter_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->src_step_ids);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->level);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->factor_or_nparts);
   data_ = std::move(node);
 }
@@ -1112,9 +1113,9 @@ Optional<Integer> FollowFusedSplitStepNode::ExtractSplitLength(
   for (int src_step_id : src_step_ids) {
     // Make sure the src_step_id is within the range of transform_steps.
-    CHECK_LT(src_step_id, transform_steps.size());
+    ICHECK_LT(src_step_id, transform_steps.size());
     auto ps = transform_steps[src_step_id].as<SplitStepNode>();
-    CHECK(ps != nullptr);
+    ICHECK(ps != nullptr);
     // Multiply the splitting factor on the corresponding splitting level of src_steps.
     if (ps->lengths[level] && ret.defined()) {
       ret *= ps->lengths[level].value();
@@ -1158,16 +1159,16 @@ StorageAlignStep::StorageAlignStep(dmlc::JSONReader* reader) {
   auto node = make_object<StorageAlignStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->iter_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->factor);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->offset);
   data_ = std::move(node);
 }
@@ -1224,13 +1225,13 @@ ComputeAtStep::ComputeAtStep(dmlc::JSONReader* reader) {
   auto node = make_object<ComputeAtStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->target_stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->target_iter_id);
   data_ = std::move(node);
 }
@@ -1295,7 +1296,7 @@ ComputeInlineStep::ComputeInlineStep(dmlc::JSONReader* reader) {
   auto node = make_object<ComputeInlineStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   data_ = std::move(node);
 }
@@ -1311,7 +1312,7 @@ void ComputeInlineStepNode::ApplyToState(State* state) const {
 
   // Check the validity of compute_inline
   for (size_t i = 0; i < stage->iters.size(); ++i) {
-    CHECK_EQ((*state)->attach_map->iter_to_attached_stages.count(std::make_pair(stage_id, i)), 0)
+    ICHECK_EQ((*state)->attach_map->iter_to_attached_stages.count(std::make_pair(stage_id, i)), 0)
         << "Invalid compute_inline: There are some other stages that are attached to the "
        << "target stage";
   }
@@ -1351,7 +1352,7 @@ ComputeRootStep::ComputeRootStep(dmlc::JSONReader* reader) {
   auto node = make_object<ComputeRootStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   data_ = std::move(node);
 }
@@ -1418,10 +1419,10 @@ Array<Step> GetFormerStageModifiableSteps(Step current_step, const Array<Step>&
       }
     }
     // add SplitStepNode required by rfactor
-    CHECK_GE(i, 1);
-    CHECK(transform_steps[i - 1]->IsInstance<SplitStepNode>());
+    ICHECK_GE(i, 1);
+    ICHECK(transform_steps[i - 1]->IsInstance<SplitStepNode>());
     const Step& split_step = transform_steps[i - 1];
-    CHECK_EQ(split_step->stage_id, step->stage_id);
+    ICHECK_EQ(split_step->stage_id, step->stage_id);
     ret_steps.push_back(split_step);
     // add RfactorStepNode
     ret_steps.push_back(step);
@@ -1449,15 +1450,15 @@ CacheReadStep::CacheReadStep(dmlc::JSONReader* reader) {
   auto node = make_object<CacheReadStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   std::string string_value;
   reader->Read(&string_value);
   node->scope_name = std::move(string_value);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->reader_stage_ids);
   data_ = std::move(node);
 }
@@ -1560,10 +1561,10 @@ CacheWriteStep::CacheWriteStep(dmlc::JSONReader* reader) {
   auto node = make_object<CacheWriteStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   std::string string_value;
   reader->Read(&string_value);
   node->scope_name = std::move(string_value);
@@ -1587,7 +1588,7 @@ int CacheWriteStepNode::ApplyToState(State* state, const ComputeDAG& dag) const
       GetFormerStageModifiableSteps(GetRef<Step>(this), (*state)->transform_steps));
   int added_ops = current_compute_dag->ops.size() - last_dag_op_size;
   // TODO(jcf94): Update this check to equal after fixing the cache write bug in TVM
-  CHECK_GE(added_ops, 1);
+  ICHECK_GE(added_ops, 1);
 
   // target_stage -> cache_write_stage + target_stage
   // Assume no step has been applied to the target stage before cache write.
@@ -1691,13 +1692,13 @@ RfactorStep::RfactorStep(dmlc::JSONReader* reader) {
   auto node = make_object<RfactorStepNode>();
   bool s;
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->stage_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->iter_id);
   s = reader->NextArrayItem();
-  CHECK(s);
+  ICHECK(s);
   reader->Read(&node->factor_iter_id);
   data_ = std::move(node);
 }
diff --git a/src/auto_scheduler/utils.h b/src/auto_scheduler/utils.h
index 610fec96617a..88c649c6f919 100755
--- a/src/auto_scheduler/utils.h
+++ b/src/auto_scheduler/utils.h
@@ -150,8 +150,8 @@ inline bool IntArrayEqual(const Array<PrimExpr>& arr1, const Array<PrimExpr>& ar
   for (size_t i = 0; i < arr1.size(); ++i) {
     auto int1 = arr1[i].as<IntImmNode>();
     auto int2 = arr2[i].as<IntImmNode>();
-    CHECK(int1 != nullptr);
-    CHECK(int2 != nullptr);
+    ICHECK(int1 != nullptr);
+    ICHECK(int2 != nullptr);
     if (int1->value != int2->value) {
       return false;
     }
@@ -169,7 +169,7 @@ inline double FloatArrayMean(const Array<PrimExpr>& float_array) {
 
   for (const auto& x : float_array) {
     auto floatimm = x.as<FloatImmNode>();
-    CHECK(floatimm != nullptr);
+    ICHECK(floatimm != nullptr);
     sum += floatimm->value;
   }
   return sum / float_array.size();
@@ -191,7 +191,7 @@ inline bool StrEndsWith(const String& a, const String& b) {
 /*! \brief Get an int value from an Expr */
 inline int64_t GetIntImm(const PrimExpr& expr) {
   auto pint = expr.as<IntImmNode>();
-  CHECK(pint != nullptr);
+  ICHECK(pint != nullptr);
   return pint->value;
 }
diff --git a/src/autotvm/feature_visitor.cc b/src/autotvm/feature_visitor.cc
index 54fc2522db66..15e09755cee2 100644
--- a/src/autotvm/feature_visitor.cc
+++ b/src/autotvm/feature_visitor.cc
@@ -60,7 +60,7 @@ void FeatureVisitor::VisitStmt_(const AttrStmtNode* op) {
   if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) {
     Var var = op->node.as<IterVarNode>()->var;
     const auto* extent = op->value.as<IntImmNode>();
-    CHECK(extent);
+    ICHECK(extent);
     std::string name = var.get()->name_hint;
     AnnotationType ann = kParallel;
diff --git a/src/autotvm/touch_extractor.cc b/src/autotvm/touch_extractor.cc
index 91e2ee135b16..10ead718bae2 100644
--- a/src/autotvm/touch_extractor.cc
+++ b/src/autotvm/touch_extractor.cc
@@ -120,13 +120,13 @@ void TouchExtractor::ExitItervar_() {
     if (kv.second.stride != 0) {  // multiply count
       for (auto stack_var : itervar_stack_) {
         auto touch_pattern = itervar_map[stack_var].touch_feature.find(kv.first);
-        CHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
+        ICHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
         touch_pattern->second.count *= itervar_map[var].length;
       }
     } else {  // multiply reuse ratio
       for (auto stack_var : itervar_stack_) {
         auto touch_pattern = itervar_map[stack_var].touch_feature.find(kv.first);
-        CHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
+        ICHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
         touch_pattern->second.reuse *= itervar_map[var].length;
       }
     }
@@ -151,7 +151,7 @@ void TouchExtractor::ExitItervar_() {
     for (auto stack_var : itervar_stack_) {
      if (ParallelLevel(itervar_map[stack_var].ann) == para_level + 1) {
        auto touch_pattern = itervar_map[stack_var].touch_feature.find(kv.first);
-        CHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
+        ICHECK(touch_pattern != itervar_map[stack_var].touch_feature.end());
        touch_pattern->second.thread_reuse = -kv.second.reuse;
        touch_pattern->second.thread_count = -kv.second.count;
        // NOTE: use minus as a flag to denote it is a base,
diff --git a/src/contrib/hybrid/codegen_hybrid.cc b/src/contrib/hybrid/codegen_hybrid.cc
index 67765f039714..7522f20523c8 100644
--- a/src/contrib/hybrid/codegen_hybrid.cc
+++ b/src/contrib/hybrid/codegen_hybrid.cc
@@ -65,14 +65,14 @@ std::string CodeGenHybrid::Finish() { return stream.str(); }
 void CodeGenHybrid::PrintType(DataType t, std::ostream& os) {
   if (t.is_float()) {
     os << "float";
-    CHECK(t.bits() == 16 || t.bits() == 32 || t.bits() == 64);
+    ICHECK(t.bits() == 16 || t.bits() == 32 || t.bits() == 64);
   } else if (t.is_int()) {
     os << "int";
-    CHECK(t.bits() == 8 || t.bits() == 16 || t.bits() == 32 || t.bits() == 64);
+    ICHECK(t.bits() == 8 || t.bits() == 16 || t.bits() == 32 || t.bits() == 64);
   } else {
-    CHECK(t.is_uint()) << "Unsupported type " << t;
+    ICHECK(t.is_uint()) << "Unsupported type " << t;
     os << "uint";
-    CHECK(t.bits() == 8 || t.bits() == 16 || t.bits() == 32 || t.bits() == 64);
+    ICHECK(t.bits() == 8 || t.bits() == 16 || t.bits() == 32 || t.bits() == 64);
   }
   os << t.bits();
 }
@@ -93,7 +93,7 @@ template <typename T>
 inline void PrintBinaryExpr(const T* op, const char* opstr,
                             std::ostream& os,  // NOLINT(*)
                             CodeGenHybrid* p) {
-  CHECK(op->dtype.lanes() == 1) << "vec bin op not implemented";
+  ICHECK(op->dtype.lanes() == 1) << "vec bin op not implemented";
   if (isalpha(opstr[0])) {
     os << opstr << '(';
     p->PrintExpr(op->a, os);
@@ -114,8 +114,8 @@ inline void PrintBinaryExpr(const T* op, const char* opstr,
 inline void PrintBinaryIntrinsitc(const CallNode* op, const char* opstr,
                                   std::ostream& os,  // NOLINT(*)
                                   CodeGenHybrid* p) {
-  CHECK(op->dtype.lanes() == 1) << "vec bin intrin not implemented";
-  CHECK_EQ(op->args.size(), 2U);
+  ICHECK(op->dtype.lanes() == 1) << "vec bin intrin not implemented";
+  ICHECK_EQ(op->args.size(), 2U);
   os << '(';
   p->PrintExpr(op->args[0], os);
   os << opstr;
@@ -228,7 +228,7 @@ void CodeGenHybrid::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLIN
   } else if (op->op.same_as(builtin::shift_right())) {
     PrintBinaryIntrinsitc(op, ">>", os, this);
   } else if (op->op.same_as(builtin::bitwise_not())) {
-    CHECK_EQ(op->args.size(), 1U);
+    ICHECK_EQ(op->args.size(), 1U);
     os << "(~";
     PrintExpr(op->args[0], os);
     os << ')';
@@ -251,9 +251,9 @@ void CodeGenHybrid::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLIN
     os << ")";
   } else {
     auto* ptr_op = op->op.as<OpNode>();
-    CHECK(ptr_op != nullptr);
+    ICHECK(ptr_op != nullptr);
     std::string name = ptr_op->name;
-    CHECK_EQ(name.compare(0, 4, "tir."), 0);
+    ICHECK_EQ(name.compare(0, 4, "tir."), 0);
     os << name.substr(4) << "(";
     for (size_t i = 0; i < op->args.size(); i++) {
       PrintExpr(op->args[i], os);
@@ -305,7 +305,7 @@ void CodeGenHybrid::VisitStmt_(const LetStmtNode* op) {
 void CodeGenHybrid::VisitStmt_(const AttrStmtNode* op) {
   if (op->attr_key == tir::attr::thread_extent) {
     auto iter_var = op->node.as<IterVarNode>();
-    CHECK(iter_var);
+    ICHECK(iter_var);
     binds_[iter_var->var.get()] = dot_to_underscore(iter_var->var->name_hint);
     PrintIndent();
     stream << "for " << binds_[iter_var->var.get()] << " in bind('" << iter_var->var->name_hint
@@ -327,7 +327,7 @@ void CodeGenHybrid::VisitStmt_(const AttrStmtNode* op) {
 void CodeGenHybrid::VisitStmt_(const
ProducerRealizeNode* op) { auto tensor = Downcast(op->producer); - CHECK(alloc_storage_scope_.count(tensor->op)); + ICHECK(alloc_storage_scope_.count(tensor->op)); if (!alloc_storage_scope_[tensor->op].empty()) { PrintIndent(); stream << GetTensorID(tensor) << " = allocate(("; @@ -493,7 +493,7 @@ void CodeGenHybrid::DumpStmt(const Stmt& stmt, const Array& inputs, stream << GetTensorID(GetRef(tensor)); } else { auto var = inputs[i].as(); - CHECK(var) << "Input should either be a tensor or a variable!"; + ICHECK(var) << "Input should either be a tensor or a variable!"; stream << GetVarID(var); } } diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index 705a3347b68c..5c119b64b93d 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -241,7 +241,7 @@ class TVMDSOOp : public OpKernel { // Load TVM function from dynamic library tvm::runtime::Module mod_dylib = tvm::runtime::Module::LoadFromFile(lib_path); tvm_func = mod_dylib.GetFunction(func_name); - CHECK(tvm_func != nullptr); + ICHECK(tvm_func != nullptr); } void Compute(tensorflow::OpKernelContext* context) override { diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 2e41f0bee921..f88b6215f927 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -215,7 +215,7 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target tir::transform::CombineContextCall(), }; auto opt_host = transform::Sequential(host_pass_list); - CHECK(mod_mixed.defined()) << "This module must be defined"; + ICHECK(mod_mixed.defined()) << "This module must be defined"; auto mhost = opt_host(mod_mixed); // device pipeline @@ -243,9 +243,9 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target } if (target->kind->device_type == kDLCPU && target_host == target) { - CHECK(mdevice->functions.empty()) << "No device code should be generated when target " - << "and host_target are both llvm target." - << "\n"; + ICHECK(mdevice->functions.empty()) << "No device code should be generated when target " + << "and host_target are both llvm target." + << "\n"; } return {mhost, mdevice}; @@ -272,7 +272,7 @@ runtime::Module build(const Map& inputs, const Target& target_ IRModule mhost_all = IRModule(Map()); - CHECK(mhost_all.defined()) << "The host module must be defined"; + ICHECK(mhost_all.defined()) << "The host module must be defined"; for (const auto& it : inputs) { if (it.second.defined()) { @@ -280,9 +280,9 @@ runtime::Module build(const Map& inputs, const Target& target_ auto& mhost = pair.first; auto& mdevice = pair.second; - CHECK(mhost.defined()) << "The split host module must be defined"; + ICHECK(mhost.defined()) << "The split host module must be defined"; - CHECK(mhost_all.defined()) << "The host module must be defined"; + ICHECK(mhost_all.defined()) << "The host module must be defined"; mhost_all->Update(mhost); diff --git a/src/ir/diagnostic.cc b/src/ir/diagnostic.cc index 148831dc3ab6..f9299e3e27e8 100644 --- a/src/ir/diagnostic.cc +++ b/src/ir/diagnostic.cc @@ -225,7 +225,7 @@ void ReportAt(const DiagnosticContext& context, std::ostream& out, const Span& s return; } - CHECK(context->module->source_map.defined()); + ICHECK(context->module->source_map.defined()); auto it = context->module->source_map->source_map.find(span->source_name); // If the source name is not in the current source map, sources were not annotated. 
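
The hunks in this patch mechanically rename the CHECK* assertion family to ICHECK*, TVM's internal-invariant variant, so that a violated invariant is reported as an internal compiler error rather than as bad user input. As a hedged illustration of how such a streaming assertion macro can be built, the sketch below uses invented names (MY_ICHECK, FatalStream); TVM's real ICHECK lives in its logging headers and carries more context:

#include <cstdlib>
#include <iostream>
#include <sstream>

// Accumulates a message and aborts when destroyed, mimicking a fatal log line.
class FatalStream {
 public:
  FatalStream(const char* file, int line) { os_ << file << ":" << line << ": "; }
  ~FatalStream() {
    std::cerr << os_.str() << std::endl;
    std::abort();
  }
  std::ostringstream& stream() { return os_; }

 private:
  std::ostringstream os_;
};

// On failure, constructs a temporary whose destructor aborts; the else-branch
// form keeps the macro usable as a statement that accepts streamed context.
#define MY_ICHECK(cond)                                   \
  if (cond) {                                             \
  } else                                                  \
    FatalStream(__FILE__, __LINE__).stream()              \
        << "InternalError: Check failed: (" #cond ") "

int main() {
  int stage_id = 3;
  MY_ICHECK(stage_id >= 0) << "stage_id must be non-negative";  // passes silently
  MY_ICHECK(stage_id % 2 == 0) << "stage_id = " << stage_id;    // aborts with message
}

The streamed-message design is what lets every call site in these hunks append its own diagnostic after the macro, exactly as the ICHECK(...) << "..." lines above do.
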
diff --git a/src/ir/env_func.cc b/src/ir/env_func.cc index 7b0d6e6f09c2..6e1f847d3fdd 100644 --- a/src/ir/env_func.cc +++ b/src/ir/env_func.cc @@ -38,7 +38,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) ObjectPtr CreateEnvNode(const std::string& name) { auto* f = runtime::Registry::Get(name); - CHECK(f != nullptr) << "Cannot find global function \'" << name << '\''; + ICHECK(f != nullptr) << "Cannot find global function \'" << name << '\''; ObjectPtr n = make_object(); n->func = *f; n->name = name; @@ -51,7 +51,7 @@ TVM_REGISTER_GLOBAL("ir.EnvFuncGet").set_body_typed(EnvFunc::Get); TVM_REGISTER_GLOBAL("ir.EnvFuncCall").set_body([](TVMArgs args, TVMRetValue* rv) { EnvFunc env = args[0]; - CHECK_GE(args.size(), 1); + ICHECK_GE(args.size(), 1); env->func.CallPacked(TVMArgs(args.values + 1, args.type_codes + 1, args.size() - 1), rv); }); diff --git a/src/ir/error.cc b/src/ir/error.cc index 5cd7a247d025..5d3978dda4ff 100644 --- a/src/ir/error.cc +++ b/src/ir/error.cc @@ -46,7 +46,7 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) { // First we pick an error reporting strategy for each error. // TODO(@jroesch): Spanned errors are currently not supported. for (auto err : this->errors_) { - CHECK(!err.span.defined()) << "attempting to use spanned errors, currently not supported"; + ICHECK(!err.span.defined()) << "attempting to use spanned errors, currently not supported"; } NodeMap> error_maps; @@ -62,7 +62,7 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) { auto has_errs = this->node_to_error_.find(node); - CHECK(has_errs != this->node_to_error_.end()); + ICHECK(has_errs != this->node_to_error_.end()); const auto& error_indicies = has_errs->second; @@ -113,7 +113,7 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) { annotated_prog << AsText(func, false, [&err_map](const ObjectRef& expr) { auto it = err_map.find(expr); if (it != err_map.end()) { - CHECK_NE(it->second.size(), 0); + ICHECK_NE(it->second.size(), 0); return it->second; } else { return std::string(""); diff --git a/src/ir/expr.cc b/src/ir/expr.cc index 05d41cf204d6..67e5cea93011 100644 --- a/src/ir/expr.cc +++ b/src/ir/expr.cc @@ -49,17 +49,17 @@ PrimExpr PrimExpr::FromObject_(ObjectRef ref) { if (auto* ptr = ref.as()) { return tir::StringImm(GetRef(ptr)); } - CHECK(ObjectTypeChecker::Check(ref.get())) + ICHECK(ObjectTypeChecker::Check(ref.get())) << "Expect type " << ObjectTypeChecker::TypeName() << " but get " << ref->GetTypeKey(); return Downcast(ref); } IntImm::IntImm(DataType dtype, int64_t value) { - CHECK(dtype.is_scalar()) << "ValueError: IntImm can only take scalar."; - CHECK(dtype.is_int() || dtype.is_uint()) << "ValueError: IntImm supports only int or uint type."; + ICHECK(dtype.is_scalar()) << "ValueError: IntImm can only take scalar."; + ICHECK(dtype.is_int() || dtype.is_uint()) << "ValueError: IntImm supports only int or uint type."; if (dtype.is_uint()) { - CHECK_GE(value, 0U); + ICHECK_GE(value, 0U); } ObjectPtr node = make_object(); node->dtype = dtype; @@ -84,7 +84,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); FloatImm::FloatImm(DataType dtype, double value) { - CHECK_EQ(dtype.lanes(), 1) << "ValueError: FloatImm can only take scalar."; + ICHECK_EQ(dtype.lanes(), 1) << "ValueError: FloatImm can only take scalar."; ObjectPtr node = make_object(); node->dtype = dtype; node->value = value; diff --git a/src/ir/module.cc b/src/ir/module.cc index 231ae68dd4e0..b011f2d2f664 100644 --- a/src/ir/module.cc +++ b/src/ir/module.cc @@ -55,14 
+55,14 @@ IRModule::IRModule(tvm::Map functions, for (const auto& kv : n->functions) { // set global var map - CHECK(n->global_var_map_.count(kv.first->name_hint) == 0) + ICHECK(n->global_var_map_.count(kv.first->name_hint) == 0) << "Duplicate global function name " << kv.first->name_hint; n->global_var_map_.Set(kv.first->name_hint, kv.first); } for (const auto& kv : n->type_definitions) { // set global typevar map - CHECK(n->global_type_var_map_.count(kv.first->name_hint) == 0) + ICHECK(n->global_type_var_map_.count(kv.first->name_hint) == 0) << "Duplicate global type definition name " << kv.first->name_hint; n->global_type_var_map_.Set(kv.first->name_hint, kv.first); n->RegisterConstructors(kv.first, kv.second); @@ -150,9 +150,9 @@ tvm::Array IRModuleNode::GetGlobalVars() const { } GlobalTypeVar IRModuleNode::GetGlobalTypeVar(const String& name) const { - CHECK(global_type_var_map_.defined()); + ICHECK(global_type_var_map_.defined()); auto it = global_type_var_map_.find(name); - CHECK(it != global_type_var_map_.end()) + ICHECK(it != global_type_var_map_.end()) << "Cannot find global type var " << name << " in the Module"; return (*it).second; } @@ -183,9 +183,9 @@ void WarnIfMalformed(const IRModule& mod, relay::Function func) { auto fv = relay::FreeVars(func); auto ftv = relay::FreeTypeVars(func, mod); // TODO(@jroesch): refactor to use diagnostic context - CHECK_EQ(fv.size(), 0) << "There are free variables: " << fv << std::endl; - CHECK_EQ(ftv.size(), 0) << "There are free type variables: " << fv - << " in function: " << AsText(func, false); + ICHECK_EQ(fv.size(), 0) << "There are free variables: " << fv << std::endl; + ICHECK_EQ(ftv.size(), 0) << "There are free type variables: " << fv + << " in function: " << AsText(func, false); } void IRModuleNode::Add(const GlobalVar& var, const BaseFunc& f, bool update) { @@ -202,9 +202,9 @@ void IRModuleNode::AddUnchecked(const GlobalVar& var, const BaseFunc& func) { auto it = global_var_map_.find(var->name_hint); if (it != global_var_map_.end()) { - CHECK_EQ((*it).second, var); + ICHECK_EQ((*it).second, var); } else { - CHECK(global_var_map_.count(var->name_hint) == 0) + ICHECK(global_var_map_.count(var->name_hint) == 0) << "Duplicate global function name " << var->name_hint; } @@ -234,7 +234,7 @@ void IRModuleNode::AddTypeDefUnchecked(const GlobalTypeVar& var, const TypeData& this->type_definitions.Set(var, type); if (!update) { // set global type var map - CHECK(global_type_var_map_.count(var->name_hint) == 0) + ICHECK(global_type_var_map_.count(var->name_hint) == 0) << "Duplicate global type definition name " << var->name_hint; } global_type_var_map_.Set(var->name_hint, var); @@ -258,7 +258,7 @@ void IRModuleNode::Remove(const GlobalVar& var) { BaseFunc IRModuleNode::Lookup(const GlobalVar& var) const { auto it = functions.find(var); - CHECK(it != functions.end()) << "There is no definition of " << var->name_hint; + ICHECK(it != functions.end()) << "There is no definition of " << var->name_hint; return (*it).second; } @@ -269,7 +269,7 @@ BaseFunc IRModuleNode::Lookup(const String& name) const { TypeData IRModuleNode::LookupTypeDef(const GlobalTypeVar& var) const { auto it = type_definitions.find(var); - CHECK(it != type_definitions.end()) << "There is no definition of " << var->name_hint; + ICHECK(it != type_definitions.end()) << "There is no definition of " << var->name_hint; return (*it).second; } @@ -280,7 +280,7 @@ TypeData IRModuleNode::LookupTypeDef(const String& name) const { Constructor IRModuleNode::LookupTag(const int32_t tag) { 
auto it = constructor_tag_map_.find(tag); - CHECK(it != constructor_tag_map_.end()) << "There is no constructor with the tag " << tag; + ICHECK(it != constructor_tag_map_.end()) << "There is no constructor with the tag " << tag; return (*it).second; } @@ -382,7 +382,7 @@ void IRModuleNode::Import(const String& path) { void IRModuleNode::ImportFromStd(const String& path) { auto* f = tvm::runtime::Registry::Get("tvm.relay.std_path"); - CHECK(f != nullptr) << "The Relay std_path is not set, please register tvm.relay.std_path."; + ICHECK(f != nullptr) << "The Relay std_path is not set, please register tvm.relay.std_path."; std::string std_path = (*f)(); this->Import(std_path + "/" + path); } @@ -406,7 +406,7 @@ TVM_REGISTER_GLOBAL("ir.Module_Add").set_body([](TVMArgs args, TVMRetValue* ret) GlobalVar var = args[1]; ObjectRef val = args[2]; bool update = args[3]; - CHECK(val->IsInstance()); + ICHECK(val->IsInstance()); if (val->IsInstance()) { mod->Add(var, Downcast(val), update); diff --git a/src/ir/op.cc b/src/ir/op.cc index 45c31963695c..5d2dc704f5b7 100644 --- a/src/ir/op.cc +++ b/src/ir/op.cc @@ -42,7 +42,7 @@ using OpRegistry = AttrRegistry; // find operator by name const Op& Op::Get(const String& name) { const OpRegEntry* reg = OpRegistry::Global()->Get(name); - CHECK(reg != nullptr) << "AttributeError: Operator " << name << " is not registered"; + ICHECK(reg != nullptr) << "AttributeError: Operator " << name << " is not registered"; return reg->op(); } @@ -130,7 +130,7 @@ struct Op2ObjectPtr : public ObjectRef { ObjectPtr CreateOp(const std::string& name) { // Hack use TVMRetValue as exchange auto op = Op::Get(name); - CHECK(op.defined()) << "Cannot find op \'" << name << '\''; + ICHECK(op.defined()) << "Cannot find op \'" << name << '\''; return Op2ObjectPtr::Get(op); } diff --git a/src/ir/span.cc b/src/ir/span.cc index 667c14e4a7ae..4a26f3a6eb11 100644 --- a/src/ir/span.cc +++ b/src/ir/span.cc @@ -74,9 +74,9 @@ Span::Span(SourceName source_name, int line, int end_line, int column, int end_c } Span Span::Merge(const Span& other) const { - CHECK(this->defined() && other.defined()) << "Span::Merge: both spans must be defined"; + ICHECK(this->defined() && other.defined()) << "Span::Merge: both spans must be defined"; - CHECK((*this)->source_name == other->source_name); + ICHECK((*this)->source_name == other->source_name); return Span((*this)->source_name, std::min((*this)->line, other->line), std::max((*this)->end_line, other->end_line), std::min((*this)->column, other->column), diff --git a/src/ir/transform.cc b/src/ir/transform.cc index ec88482ee3bf..3b774462565e 100644 --- a/src/ir/transform.cc +++ b/src/ir/transform.cc @@ -60,8 +60,8 @@ void PassContext::EnterWithScope() { void PassContext::ExitWithScope() { PassContextThreadLocalEntry* entry = RelayPassContextThreadLocalStore::Get(); - CHECK(!entry->context_stack.empty()); - CHECK(entry->context_stack.top().same_as(*this)); + ICHECK(!entry->context_stack.empty()); + ICHECK(entry->context_stack.top().same_as(*this)); entry->context_stack.pop(); } @@ -77,7 +77,7 @@ PassContext PassContext::Current() { class PassConfigManager { public: void Register(std::string key, uint32_t value_type_index) { - CHECK_EQ(key2vtype_.count(key), 0U); + ICHECK_EQ(key2vtype_.count(key), 0U); ValueTypeInfo info; info.type_index = value_type_index; info.type_key = runtime::Object::TypeIndex2Key(value_type_index); @@ -103,7 +103,7 @@ class PassConfigManager { LOG(FATAL) << os.str(); } const auto& info = it->second; - CHECK(kv.second.defined()) << 
"AttributeError: " << kv.first << " is None"; + ICHECK(kv.second.defined()) << "AttributeError: " << kv.first << " is None"; if (kv.second->IsInstance::ContainerType>()) { ObjectRef converted = reflection->CreateObject(info.type_key, Downcast>(kv.second)); @@ -376,7 +376,7 @@ Pass GetPass(const String& pass_name) { // pass } else if ((f = Registry::Get("relay._transform." + pass_name))) { } - CHECK(f != nullptr) << "Cannot use " << pass_name << "to create the pass"; + ICHECK(f != nullptr) << "Cannot use " << pass_name << "to create the pass"; return (*f)(); } @@ -385,7 +385,7 @@ Pass GetPass(const String& pass_name) { // ordering problem needs to be handled in the future. IRModule SequentialNode::operator()(IRModule mod, const PassContext& pass_ctx) const { for (const Pass& pass : passes) { - CHECK(pass.defined()) << "Found undefined pass for optimization."; + ICHECK(pass.defined()) << "Found undefined pass for optimization."; const PassInfo& pass_info = pass->Info(); if (!PassEnabled(pass_info)) continue; // resolve dependencies diff --git a/src/node/attr_registry.h b/src/node/attr_registry.h index 01d2b68c471b..f84be1467453 100644 --- a/src/node/attr_registry.h +++ b/src/node/attr_registry.h @@ -109,10 +109,10 @@ class AttrRegistry { op_map->data_.resize(index + 1, std::make_pair(TVMRetValue(), 0)); } std::pair& p = op_map->data_[index]; - CHECK(p.second != plevel) << "Attribute " << attr_name << " of " << key->AttrRegistryName() - << " is already registered with same plevel=" << plevel; - CHECK(value.type_code() != kTVMNullptr) << "Registered packed_func is Null for " << attr_name - << " of operator " << key->AttrRegistryName(); + ICHECK(p.second != plevel) << "Attribute " << attr_name << " of " << key->AttrRegistryName() + << " is already registered with same plevel=" << plevel; + ICHECK(value.type_code() != kTVMNullptr) << "Registered packed_func is Null for " << attr_name + << " of operator " << key->AttrRegistryName(); if (p.second < plevel && value.type_code() != kTVMNullptr) { op_map->data_[index] = std::make_pair(value, plevel); } diff --git a/src/node/container.cc b/src/node/container.cc index 60b5f40b98f1..b72d5a4cd736 100644 --- a/src/node/container.cc +++ b/src/node/container.cc @@ -96,8 +96,8 @@ struct NDArrayContainerTrait { static constexpr const std::nullptr_t VisitAttrs = nullptr; static void SHashReduce(const runtime::NDArray::Container* key, SHashReducer hash_reduce) { - CHECK_EQ(key->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; - CHECK(runtime::IsContiguous(key->dl_tensor)) << "Can only hash contiguous tensor"; + ICHECK_EQ(key->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK(runtime::IsContiguous(key->dl_tensor)) << "Can only hash contiguous tensor"; hash_reduce(runtime::DataType(key->dl_tensor.dtype)); hash_reduce(key->dl_tensor.ndim); for (int i = 0; i < key->dl_tensor.ndim; ++i) { @@ -113,10 +113,10 @@ struct NDArrayContainerTrait { auto ldt = lhs->dl_tensor.dtype; auto rdt = rhs->dl_tensor.dtype; - CHECK_EQ(lhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; - CHECK_EQ(rhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; - CHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor"; - CHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor"; + ICHECK_EQ(lhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK_EQ(rhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + 
ICHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor"; + ICHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor"; if (lhs->dl_tensor.ndim != rhs->dl_tensor.ndim) return false; for (int i = 0; i < lhs->dl_tensor.ndim; ++i) { @@ -172,18 +172,18 @@ TVM_REGISTER_GLOBAL("node.Array").set_body([](TVMArgs args, TVMRetValue* ret) { TVM_REGISTER_GLOBAL("node.ArrayGetItem").set_body([](TVMArgs args, TVMRetValue* ret) { int64_t i = args[1]; - CHECK_EQ(args[0].type_code(), kTVMObjectHandle); + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); Object* ptr = static_cast(args[0].value().v_handle); - CHECK(ptr->IsInstance()); + ICHECK(ptr->IsInstance()); auto* n = static_cast(ptr); - CHECK_LT(static_cast(i), n->size()) << "out of bound of array"; + ICHECK_LT(static_cast(i), n->size()) << "out of bound of array"; *ret = n->at(i); }); TVM_REGISTER_GLOBAL("node.ArraySize").set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args[0].type_code(), kTVMObjectHandle); + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); Object* ptr = static_cast(args[0].value().v_handle); - CHECK(ptr->IsInstance()); + ICHECK(ptr->IsInstance()); *ret = static_cast(static_cast(ptr)->size()); }); @@ -300,7 +300,7 @@ TVM_REGISTER_REFLECTION_VTABLE(MapNode, MapNodeTrait) .set_creator([](const std::string&) -> ObjectPtr { return MapNode::Empty(); }); TVM_REGISTER_GLOBAL("node.Map").set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args.size() % 2, 0); + ICHECK_EQ(args.size() % 2, 0); std::unordered_map data; for (int i = 0; i < args.num_args; i += 2) { ObjectRef k = @@ -312,29 +312,29 @@ TVM_REGISTER_GLOBAL("node.Map").set_body([](TVMArgs args, TVMRetValue* ret) { }); TVM_REGISTER_GLOBAL("node.MapSize").set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args[0].type_code(), kTVMObjectHandle); + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); Object* ptr = static_cast(args[0].value().v_handle); - CHECK(ptr->IsInstance()); + ICHECK(ptr->IsInstance()); auto* n = static_cast(ptr); *ret = static_cast(n->size()); }); TVM_REGISTER_GLOBAL("node.MapGetItem").set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args[0].type_code(), kTVMObjectHandle); + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); Object* ptr = static_cast(args[0].value().v_handle); - CHECK(ptr->IsInstance()); + ICHECK(ptr->IsInstance()); auto* n = static_cast(ptr); auto it = n->find(String::CanConvertFrom(args[1]) ? args[1].operator String() : args[1].operator ObjectRef()); - CHECK(it != n->end()) << "cannot find the corresponding key in the Map"; + ICHECK(it != n->end()) << "cannot find the corresponding key in the Map"; *ret = (*it).second; }); TVM_REGISTER_GLOBAL("node.MapCount").set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args[0].type_code(), kTVMObjectHandle); + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); Object* ptr = static_cast(args[0].value().v_handle); - CHECK(ptr->IsInstance()); + ICHECK(ptr->IsInstance()); const MapNode* n = static_cast(ptr); int64_t cnt = n->count(String::CanConvertFrom(args[1]) ? 
args[1].operator String() : args[1].operator ObjectRef()); @@ -342,7 +342,7 @@ TVM_REGISTER_GLOBAL("node.MapCount").set_body([](TVMArgs args, TVMRetValue* ret) }); TVM_REGISTER_GLOBAL("node.MapItems").set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args[0].type_code(), kTVMObjectHandle); + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); Object* ptr = static_cast(args[0].value().v_handle); auto* n = static_cast(ptr); Array rkvs; diff --git a/src/node/reflection.cc b/src/node/reflection.cc index ec82c91bb652..9dc9d330bb77 100644 --- a/src/node/reflection.cc +++ b/src/node/reflection.cc @@ -50,7 +50,7 @@ class AttrGetter : public AttrVisitor { if (skey == key) *ret = value[0]; } void Visit(const char* key, uint64_t* value) final { - CHECK_LE(value[0], static_cast(std::numeric_limits::max())) + ICHECK_LE(value[0], static_cast(std::numeric_limits::max())) << "cannot return too big constant"; if (skey == key) *ret = static_cast(value[0]); } @@ -198,7 +198,7 @@ class NodeAttrSetter : public AttrVisitor { void InitNodeByPackedArgs(ReflectionVTable* reflection, Object* n, const TVMArgs& args) { NodeAttrSetter setter; setter.type_key = n->GetTypeKey(); - CHECK_EQ(args.size() % 2, 0); + ICHECK_EQ(args.size() % 2, 0); for (int i = 0; i < args.size(); i += 2) { setter.attrs.emplace(args[i].operator std::string(), args[i + 1]); } @@ -245,13 +245,13 @@ ObjectRef ReflectionVTable::CreateObject(const std::string& type_key, // Expose to FFI APIs. void NodeGetAttr(TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args[0].type_code(), kTVMObjectHandle); + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); Object* self = static_cast(args[0].value().v_handle); *ret = ReflectionVTable::Global()->GetAttr(self, args[1]); } void NodeListAttrNames(TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args[0].type_code(), kTVMObjectHandle); + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); Object* self = static_cast(args[0].value().v_handle); auto names = diff --git a/src/node/serialization.cc b/src/node/serialization.cc index 1f0e8c0f9b00..c7e4d27c8b2c 100644 --- a/src/node/serialization.cc +++ b/src/node/serialization.cc @@ -85,7 +85,7 @@ class NodeIndexer : public AttrVisitor { void Visit(const char* key, runtime::NDArray* value) final { DLTensor* ptr = const_cast((*value).operator->()); if (tensor_index_.count(ptr)) return; - CHECK_EQ(tensor_index_.size(), tensor_list_.size()); + ICHECK_EQ(tensor_index_.size(), tensor_list_.size()); tensor_index_[ptr] = tensor_list_.size(); tensor_list_.push_back(ptr); } @@ -97,10 +97,10 @@ class NodeIndexer : public AttrVisitor { // make index of all the children of node void MakeIndex(Object* node) { if (node == nullptr) return; - CHECK(node->IsInstance()); + ICHECK(node->IsInstance()); if (node_index_.count(node)) return; - CHECK_EQ(node_index_.size(), node_list_.size()); + ICHECK_EQ(node_index_.size(), node_list_.size()); node_index_[node] = node_list_.size(); node_list_.push_back(node); @@ -195,7 +195,7 @@ struct JSONNode { helper.ReadAllFields(reader); if (repr_str.size() != 0) { - CHECK_EQ(repr_b64.size(), 0U); + ICHECK_EQ(repr_b64.size(), 0U); repr_bytes = std::move(repr_str); } else if (repr_b64.size() != 0) { repr_bytes = Base64Decode(repr_b64); @@ -388,13 +388,13 @@ class JSONAttrSetter : public AttrVisitor { void Visit(const char* key, runtime::NDArray* value) final { size_t index; ParseValue(key, &index); - CHECK_LE(index, tensor_list_->size()); + ICHECK_LE(index, tensor_list_->size()); *value = tensor_list_->at(index); } void Visit(const char* key, ObjectRef* value) 
final { size_t index; ParseValue(key, &index); - CHECK_LE(index, node_list_->size()); + ICHECK_LE(index, node_list_->size()); *value = ObjectRef(node_list_->at(index)); } // set node to be current JSONNode @@ -421,13 +421,13 @@ class JSONAttrSetter : public AttrVisitor { if (jnode->type_key == MapNode::_type_key) { std::unordered_map container; if (jnode->keys.empty()) { - CHECK_EQ(jnode->data.size() % 2, 0U); + ICHECK_EQ(jnode->data.size() % 2, 0U); for (size_t i = 0; i < jnode->data.size(); i += 2) { container[ObjectRef(node_list_->at(jnode->data[i]))] = ObjectRef(node_list_->at(jnode->data[i + 1])); } } else { - CHECK_EQ(jnode->data.size(), jnode->keys.size()); + ICHECK_EQ(jnode->data.size(), jnode->keys.size()); for (size_t i = 0; i < jnode->data.size(); ++i) { container[String(jnode->keys[i])] = ObjectRef(node_list_->at(jnode->data[i])); } @@ -530,7 +530,7 @@ struct JSONGraph { } } } - CHECK_EQ(topo_order.size(), n_nodes) << "Cyclic reference detected in JSON file"; + ICHECK_EQ(topo_order.size(), n_nodes) << "Cyclic reference detected in JSON file"; std::reverse(std::begin(topo_order), std::end(topo_order)); return topo_order; } @@ -562,7 +562,7 @@ ObjectRef LoadJSON(std::string json_str) { support::Base64InStream b64strm(&mstrm); b64strm.InitPosition(); runtime::NDArray temp; - CHECK(temp.Load(&b64strm)); + ICHECK(temp.Load(&b64strm)); tensors.emplace_back(std::move(temp)); } } diff --git a/src/node/structural_equal.cc b/src/node/structural_equal.cc index e05cbbb60d1f..1fa72c92b6fc 100644 --- a/src/node/structural_equal.cc +++ b/src/node/structural_equal.cc @@ -90,7 +90,7 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler { void MarkGraphNode() final { // need to push to pending tasks in this case - CHECK(!allow_push_to_stack_ && !task_stack_.empty()); + ICHECK(!allow_push_to_stack_ && !task_stack_.empty()); task_stack_.back().graph_equal = true; } @@ -108,8 +108,8 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler { equal_map_lhs_.clear(); equal_map_rhs_.clear(); if (!SEqualReduce(lhs, rhs, map_free_vars)) return false; - CHECK_EQ(pending_tasks_.size(), 1U); - CHECK(allow_push_to_stack_); + ICHECK_EQ(pending_tasks_.size(), 1U); + ICHECK(allow_push_to_stack_); task_stack_.emplace_back(std::move(pending_tasks_.back())); pending_tasks_.clear(); return RunTasks(); @@ -141,7 +141,7 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler { // We can safely mark lhs and rhs as equal to each other. auto it = equal_map_lhs_.find(entry.lhs); if (it != equal_map_lhs_.end()) { - CHECK(it->second.same_as(entry.rhs)); + ICHECK(it->second.same_as(entry.rhs)); } // create the map if the quality is graph equal. if (entry.graph_equal) { @@ -156,7 +156,7 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler { // Expand the objects // The SEqual of the object can call into this->SEqualReduce // which populates the pending tasks. - CHECK_EQ(pending_tasks_.size(), 0U); + ICHECK_EQ(pending_tasks_.size(), 0U); allow_push_to_stack_ = false; if (!DispatchSEqualReduce(entry.lhs, entry.rhs, entry.map_free_vars)) return false; allow_push_to_stack_ = true; @@ -174,7 +174,7 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler { // The default equal as registered in the structural equal vtable. 
bool DispatchSEqualReduce(const ObjectRef& lhs, const ObjectRef& rhs, bool map_free_vars) { auto compute = [=]() { - CHECK(lhs.defined() && rhs.defined() && lhs->type_index() == rhs->type_index()); + ICHECK(lhs.defined() && rhs.defined() && lhs->type_index() == rhs->type_index()); // skip entries that already have equality maps. auto it = equal_map_lhs_.find(lhs); if (it != equal_map_lhs_.end()) { diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index cb576fa9c067..e0b729d3f103 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -79,7 +79,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler { void MarkGraphNode() final { // need to push to pending tasks in this case - CHECK(!allow_push_to_stack_ && !task_stack_.empty()); + ICHECK(!allow_push_to_stack_ && !task_stack_.empty()); task_stack_.back().graph_node_hash = true; } @@ -97,7 +97,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler { } void SHashReduceFreeVar(const runtime::Object* var, bool map_free_vars) final { - CHECK(!hash_memo_.count(GetRef(var))); + ICHECK(!hash_memo_.count(GetRef(var))); if (map_free_vars) { // use counter value. size_t value = std::hash()(free_var_counter_++); @@ -127,19 +127,19 @@ class VarCountingSHashHandler : public SHashReducer::Handler { } size_t Hash(const ObjectRef& object, bool map_free_vars) { - CHECK_EQ(task_stack_.size(), 0U); - CHECK_EQ(pending_tasks_.size(), 0U); - CHECK_EQ(result_stack_.size(), 0U); + ICHECK_EQ(task_stack_.size(), 0U); + ICHECK_EQ(pending_tasks_.size(), 0U); + ICHECK_EQ(result_stack_.size(), 0U); this->SHashReduce(object, map_free_vars); - CHECK_EQ(pending_tasks_.size(), 1U); - CHECK(allow_push_to_stack_); + ICHECK_EQ(pending_tasks_.size(), 1U); + ICHECK(allow_push_to_stack_); task_stack_.emplace_back(std::move(pending_tasks_.back())); pending_tasks_.clear(); this->RunTasks(); - CHECK_EQ(result_stack_.size(), 1U); + ICHECK_EQ(result_stack_.size(), 1U); size_t ret = result_stack_.back(); result_stack_.pop_back(); return ret; @@ -160,7 +160,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler { */ size_t ReduceHash(const Task& task) { size_t stack_begin = task.result_stack_index; - CHECK_LE(stack_begin, result_stack_.size()); + ICHECK_LE(stack_begin, result_stack_.size()); // combine in the reverse order of the stack. size_t reduced_hash = task.reduced_hash; @@ -210,7 +210,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler { entry.children_expanded = true; entry.result_stack_index = result_stack_.size(); - CHECK_EQ(pending_tasks_.size(), 0U); + ICHECK_EQ(pending_tasks_.size(), 0U); allow_push_to_stack_ = false; // dispatch hash, reduce to the current slot. this->DispatchSHash(entry.object, entry.map_free_vars); @@ -227,7 +227,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler { // The default equal as registered in the structural equal vtable. 
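
// Both RemapVarSEqualHandler (structural_equal.cc) and VarCountingSHashHandler
// (structural_hash.cc) drive the same explicit task-stack pattern: child
// comparisons are enqueued instead of recursed into, RunTasks drains the
// stack, and the ICHECKs pin the bookkeeping invariants (exactly one pending
// root task, a balanced result stack). User code reaches this machinery
// through facade functors; a hedged usage sketch, assuming TVM's public C++
// API of this period (SaveJSON/LoadJSON from src/node/serialization.cc above):

#include <tvm/ir/expr.h>
#include <tvm/node/serialization.h>
#include <tvm/node/structural_equal.h>
#include <tvm/node/structural_hash.h>

void StructuralRoundTripSketch() {
  tvm::PrimExpr a = tvm::IntImm(tvm::DataType::Int(32), 1);
  std::string json = tvm::SaveJSON(a);                         // object graph -> JSON
  auto b = tvm::Downcast<tvm::PrimExpr>(tvm::LoadJSON(json));  // JSON -> object graph
  ICHECK(tvm::StructuralEqual()(a, b));  // remapped-free-var equality
  ICHECK_EQ(tvm::StructuralHash()(a), tvm::StructuralHash()(b));
  // Structural equality implies hash equality; the converse need not hold.
}
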
void DispatchSHash(const ObjectRef& object, bool map_free_vars) { - CHECK(object.defined()); + ICHECK(object.defined()); vtable_->SHashReduce(object.get(), SHashReducer(this, map_free_vars)); } diff --git a/src/parser/meta_ref.cc b/src/parser/meta_ref.cc index d23892753c5f..c74b396900d8 100644 --- a/src/parser/meta_ref.cc +++ b/src/parser/meta_ref.cc @@ -72,9 +72,9 @@ struct MetaRefExpander : public ExprMutator { if (auto op_node = call->op.as()) { if (op_node->name == "parser.MetaRef") { auto meta_attrs = call->attrs.as(); - CHECK(meta_attrs) << "an internal error has occurred"; + ICHECK(meta_attrs) << "an internal error has occurred"; auto nodes = table.at(meta_attrs->node_type_key); - CHECK_LT(meta_attrs->node_index, nodes.size()); + ICHECK_LT(meta_attrs->node_index, nodes.size()); return Downcast(nodes[meta_attrs->node_index]); } } diff --git a/src/parser/parser.cc b/src/parser/parser.cc index 9c9965ca588f..987a6e20ec38 100644 --- a/src/parser/parser.cc +++ b/src/parser/parser.cc @@ -371,7 +371,7 @@ class Parser { * \return The Nth token. */ Token Lookahead(int n) { - CHECK_GE(n, 1) << "lookahead is only valid when n >= 1"; + ICHECK_GE(n, 1) << "lookahead is only valid when n >= 1"; // We intend to skip n - 1 tokens, then return the nth. auto old_pos = pos; @@ -822,7 +822,7 @@ class Parser { ctor = tvm::Constructor(ctor_name, arg_types, type_global); } - CHECK(ctor.defined()); + ICHECK(ctor.defined()); try { this->ctors.Add(ctor_name, ctor); @@ -944,7 +944,7 @@ class Parser { } } - CHECK_GE(exprs.size(), 1); + ICHECK_GE(exprs.size(), 1); if (exprs.size() == 1) { // ICHECK(exprs[0].defined() && exprs[0]->span.defined()) @@ -1258,7 +1258,7 @@ class Parser { auto op = opt_op[0]; Expr right = WithSpan([this] { return ParseCallExpr(); }); - CHECK(right->span.defined()); + ICHECK(right->span.defined()); // If the operator stack is empty // we parse an operator and expression @@ -1285,7 +1285,7 @@ class Parser { exprs.pop_back(); Expr left = exprs.back(); exprs.pop_back(); - CHECK(new_op.op.defined()) << "a call op must be set " << new_op.op; + ICHECK(new_op.op.defined()) << "a call op must be set " << new_op.op; exprs.push_back( relay::Call(new_op.op, {left, right}, Attrs(), {}, left->span.Merge(right->span))); } @@ -1301,7 +1301,7 @@ class Parser { exprs.pop_back(); Expr left = exprs.back(); exprs.pop_back(); - CHECK(new_op.op.defined()) << "a call op must be set " << new_op.op; + ICHECK(new_op.op.defined()) << "a call op must be set " << new_op.op; exprs.push_back( relay::Call(new_op.op, {left, right}, Attrs(), {}, left->span.Merge(right->span))); } @@ -1369,7 +1369,7 @@ class Parser { } Expr ParseCallArgs(Expr op) { - CHECK(op.defined()) << "the operator must be defined"; + ICHECK(op.defined()) << "the operator must be defined"; DLOG(INFO) << "Parser::ParseCallArgs"; Map raw_attrs; @@ -1401,7 +1401,7 @@ class Parser { if (is_op && op_key.size()) { auto attr_obj = tvm::ReflectionVTable::Global()->CreateObject(op_key, raw_attrs); - CHECK(attr_obj.defined()); + ICHECK(attr_obj.defined()); attrs = Downcast(attr_obj); } @@ -1500,7 +1500,7 @@ class Parser { auto spanned_idents = ParseHierarchicalName(); auto idents = spanned_idents.data; auto span = spanned_idents.span; - CHECK_NE(idents.size(), 0); + ICHECK_NE(idents.size(), 0); std::stringstream op_name; int i = 0; int periods = idents.size() - 1; diff --git a/src/parser/source_map.cc b/src/parser/source_map.cc index 40998b0c9dc4..7ac978cd6341 100644 --- a/src/parser/source_map.cc +++ b/src/parser/source_map.cc @@ -62,7 +62,7 @@ 
Source::Source(SourceName src_name, std::string source) { tvm::String Source::GetLine(int line) { DLOG(INFO) << "Source::GetLine: line=" << line; - CHECK(line - 1 < static_cast((*this)->line_map.size())) + ICHECK(line - 1 < static_cast((*this)->line_map.size())) << "requested line: " << line << "at index: " << (line - 1) << "line_map size: " << (*this)->line_map.size() << "source: " << (*this)->source; diff --git a/src/parser/tokenizer.h b/src/parser/tokenizer.h index 20ad1734e573..a9ae64ba8fb1 100644 --- a/src/parser/tokenizer.h +++ b/src/parser/tokenizer.h @@ -100,7 +100,7 @@ struct Tokenizer { bool More() { return this->pos < this->source.size(); } char Peek() { - CHECK(pos < this->source.size()); + ICHECK(pos < this->source.size()); return this->source.at(this->pos); } @@ -170,7 +170,7 @@ struct Tokenizer { } Token ParseNumber(bool is_pos, bool is_float, std::string number) { - CHECK(number.size() > 0) << "an empty string is an invalid number"; + ICHECK(number.size() > 0) << "an empty string is an invalid number"; try { if (is_float) { @@ -231,22 +231,22 @@ struct Tokenizer { int line = this->line; int column = this->col; - CHECK_EQ(Peek(), '['); + ICHECK_EQ(Peek(), '['); Next(); std::stringstream type_key; while (More() && Peek() != ']') { type_key << Next(); } - CHECK_EQ(Peek(), ']'); + ICHECK_EQ(Peek(), ']'); Next(); - CHECK_EQ(Peek(), '['); + ICHECK_EQ(Peek(), '['); Next(); std::stringstream str_index; while (More() && Peek() != ']') { str_index << Next(); } - CHECK_EQ(Peek(), ']'); + ICHECK_EQ(Peek(), ']'); Next(); // todo: add error handling around bad indices auto index = ParseNumber(true, false, str_index.str()).ToNumber(); @@ -266,7 +266,7 @@ struct Tokenizer { raw_attribute << Next(); } - CHECK_EQ(Next(), ']'); + ICHECK_EQ(Next(), ']'); auto attribute = raw_attribute.str(); // Clean up the white-space on both sides. 
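
The tokenizer touched below is a classic single-pass cursor: More() bounds-checks, Peek() inspects without consuming, Next() consumes, and the new ICHECK guards the invariant that Peek() is never called past end of input. A self-contained toy version of the same pattern (ToyTokenizer is an illustrative name, not TVM's):

#include <cassert>
#include <cctype>
#include <string>
#include <vector>

struct ToyTokenizer {
  std::string source;
  size_t pos = 0;

  bool More() const { return pos < source.size(); }
  char Peek() const {
    assert(pos < source.size());  // the invariant the ICHECK enforces
    return source[pos];
  }
  char Next() { return source[pos++]; }

  // Splits the input into digit runs and single punctuation tokens, e.g.
  // ToyTokenizer{"12 + 345"}.Tokenize() yields {"12", "+", "345"}.
  std::vector<std::string> Tokenize() {
    std::vector<std::string> out;
    while (More()) {
      if (std::isspace(static_cast<unsigned char>(Peek()))) { Next(); continue; }
      std::string tok;
      if (std::isdigit(static_cast<unsigned char>(Peek()))) {
        while (More() && std::isdigit(static_cast<unsigned char>(Peek()))) tok += Next();
      } else {
        tok += Next();
      }
      out.push_back(tok);
    }
    return out;
  }
};
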
@@ -537,7 +537,7 @@ struct Tokenizer { DLOG(INFO) << "tvm::parser::Tokenize"; while (this->More()) { auto token = TokenizeOnce(); - CHECK(token.defined()); + ICHECK(token.defined()); this->tokens.push_back(token); } this->tokens.push_back(NewToken(TokenType::kEndOfFile)); @@ -576,15 +576,15 @@ std::vector Condense(const std::vector& tokens, Token* table) { i += 1; // TODO(@jroesch): merge spans auto tok = Token(current->span, TokenType::kLocal, next->data); - CHECK(tok.defined()); + ICHECK(tok.defined()); out.push_back(tok); } else if (next->token_type == TokenType::kInteger) { i += 1; auto tok = Token(current->span, TokenType::kGraph, next->data); - CHECK(tok.defined()); + ICHECK(tok.defined()); out.push_back(tok); } else { - CHECK(current.defined()); + ICHECK(current.defined()); out.push_back(current); } continue; @@ -596,10 +596,10 @@ std::vector Condense(const std::vector& tokens, Token* table) { i += 1; // TODO(@jroesch): merge spans auto tok = Token(current->span, TokenType::kGlobal, next->data); - CHECK(tok.defined()); + ICHECK(tok.defined()); out.push_back(tok); } else { - CHECK(current.defined()); + ICHECK(current.defined()); out.push_back(current); } continue; @@ -638,7 +638,7 @@ std::pair, Token> Tokenize(const DiagnosticContext& ctx, cons Token meta_table(Span(), TokenType::kUnknown, ObjectRef()); auto tokens = Condense(tokenizer.tokens, &meta_table); for (auto token : tokens) { - CHECK(token.defined()); + ICHECK(token.defined()); } return {tokens, meta_table}; } diff --git a/src/printer/doc.cc b/src/printer/doc.cc index ab1eddbe7d1e..4b22d54448c2 100644 --- a/src/printer/doc.cc +++ b/src/printer/doc.cc @@ -85,7 +85,7 @@ class DocLine : public DocAtom { // DSL function implementations Doc& Doc::operator<<(const Doc& right) { - CHECK(this != &right); + ICHECK(this != &right); this->stream_.insert(this->stream_.end(), right.stream_.begin(), right.stream_.end()); return *this; } diff --git a/src/printer/meta_data.h b/src/printer/meta_data.h index df27d92170c6..233da1baffd8 100644 --- a/src/printer/meta_data.h +++ b/src/printer/meta_data.h @@ -99,7 +99,7 @@ class TextMetaDataContext { return it->second; } std::string type_key = node->GetTypeKey(); - CHECK(!type_key.empty()); + ICHECK(!type_key.empty()); Array& mvector = meta_data_[type_key]; int64_t index = static_cast(mvector.size()); mvector.push_back(node); diff --git a/src/printer/relay_text_printer.cc b/src/printer/relay_text_printer.cc index 555d335a51da..4132ab14ff29 100644 --- a/src/printer/relay_text_printer.cc +++ b/src/printer/relay_text_printer.cc @@ -322,7 +322,7 @@ Doc RelayTextPrinter::VisitExpr_(const ConstantNode* op) { if (op->is_scalar()) { std::ostringstream os; DataType dtype = DataType(op->data->dtype); - CHECK_EQ(op->data->ctx.device_type, kDLCPU); + ICHECK_EQ(op->data->ctx.device_type, kDLCPU); if (dtype == DataType::Int(32)) { return ScalarLiteral(dtype, static_cast(op->data->data)[0]); } else if (dtype == DataType::Int(64)) { @@ -831,7 +831,7 @@ std::vector RelayTextPrinter::PrintFuncAttrs(const Attrs& attrs) { std::vector docs; if (!attrs.defined()) return docs; const auto* dict_attrs = attrs.as(); - CHECK(dict_attrs); + ICHECK(dict_attrs); for (const auto& k : dict_attrs->dict) { Doc doc; doc << k.first << "=" << Print(k.second); diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index 7feb0b5031ab..107817db29b3 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -353,7 +353,7 @@ Doc TIRTextPrinter::VisitExpr_(const CallNode* op) { } else { // 
TODO(bohan): Print out the name by he global var in the module. auto* op_gvar = op->op.as(); - CHECK(op_gvar != nullptr); + ICHECK(op_gvar != nullptr); doc << "@" << Doc::Text(op_gvar->name_hint) << "("; } std::vector args; diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 5add7c17b04c..09f95e44b6d8 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -475,7 +475,7 @@ Doc TVMScriptPrinter::VisitExpr_(const CallNode* op) { doc << Doc::Text(ptr_op->name) << "("; } else { auto* op_gvar = op->op.as(); - CHECK(op_gvar != nullptr); + ICHECK(op_gvar != nullptr); doc << Doc::Text(op_gvar->name_hint) << "("; } std::vector args; @@ -566,7 +566,7 @@ Doc TVMScriptPrinter::VisitStmt_(const AttrStmtNode* op) { // concise thread env if (op->node->IsInstance() && op->attr_key == "thread_extent") { const auto* iter_var = Downcast(op->node).get(); - CHECK(!iter_var->dom.defined()); + ICHECK(!iter_var->dom.defined()); var_not_in_headers.insert(iter_var->var.get()); var_env_map_[iter_var->var] = iter_var->thread_tag; if (current_num_ != num_child_ - 1) { @@ -890,7 +890,7 @@ Doc TVMScriptPrinter::PrintBuffer(const BufferNode* op) { TVM_REGISTER_GLOBAL("script.AsTVMScript") .set_body_typed([](const ObjectRef& functions, bool show_meta) { - CHECK(functions.as() != nullptr || functions.as() != nullptr); + ICHECK(functions.as() != nullptr || functions.as() != nullptr); return "@tvm.script.tir\n" + TVMScriptPrinter(show_meta).Print(functions).str() + "\n"; }); diff --git a/src/relay/analysis/annotated_region_set.cc b/src/relay/analysis/annotated_region_set.cc index 587add36706f..04a18c4b7351 100644 --- a/src/relay/analysis/annotated_region_set.cc +++ b/src/relay/analysis/annotated_region_set.cc @@ -119,7 +119,7 @@ class AnnotatedRegionSet::Creator : protected MixedModeVisitor { } auto arg_region = region_set_->GetRegion(arg); - CHECK_EQ(region.defined(), arg_region.defined()) + ICHECK_EQ(region.defined(), arg_region.defined()) << "Arg regions are inconsistent: " << AsText(expr); if (region.defined() && region != arg_region) { region_set_->MergeRegions(arg_region, region); @@ -137,21 +137,21 @@ class AnnotatedRegionSet::Creator : protected MixedModeVisitor { AddToArgRegion(GetRef(call), call->args); } else if (call->op == begin_op_) { // The annotation node is inserted on edge so it must have only one argument. - CHECK_EQ(call->args.size(), 1U); + ICHECK_EQ(call->args.size(), 1U); std::string target = call->attrs.as()->compiler; // Check if the argument already belongs to a region auto region = region_set_->GetRegion(GetRef(call)); - CHECK(!region.defined()); + ICHECK(!region.defined()); // Create a new region. region = region_set_->MakeRegion(target); region->nodes_.insert(GetRef(call)); region->ins_.push_back(GetRef(call)); } else { - CHECK_EQ(call->op, end_op_); + ICHECK_EQ(call->op, end_op_); // The annotation node is inserted on edge so it must have only one argument. - CHECK_EQ(call->args.size(), 1U); + ICHECK_EQ(call->args.size(), 1U); std::string target = call->attrs.as()->compiler; // Check if the argument already belongs to a region @@ -162,7 +162,7 @@ class AnnotatedRegionSet::Creator : protected MixedModeVisitor { } else { // If the argument is belonged to a region, it must have the same target. // Otherwise we should see a region_begin op. 
- CHECK_EQ(region->GetTarget(), target); + ICHECK_EQ(region->GetTarget(), target); } region->nodes_.insert(GetRef(call)); region->outs_.push_back(GetRef(call)); diff --git a/src/relay/analysis/annotated_region_set.h b/src/relay/analysis/annotated_region_set.h index cbcf155350df..d9923cca99fc 100644 --- a/src/relay/analysis/annotated_region_set.h +++ b/src/relay/analysis/annotated_region_set.h @@ -114,7 +114,7 @@ class AnnotatedRegion : public ObjectRef { /*! \return Mutable pointers to the node. */ AnnotatedRegionNode* operator->() const { auto* ptr = get_mutable(); - CHECK(ptr != nullptr); + ICHECK(ptr != nullptr); return static_cast(ptr); } }; @@ -216,39 +216,39 @@ class AnnotatedRegionSet : public ObjectRef { /*! \return The begin iterator. */ iterator begin() { auto* n = operator->(); - CHECK(n); + ICHECK(n); return n->begin(); } /*! \return The end iterator. */ iterator end() { auto* n = operator->(); - CHECK(n); + ICHECK(n); return n->end(); } /*! \return The begin iterator. */ const_iterator begin() const { const auto* n = operator->(); - CHECK(n); + ICHECK(n); return n->begin(); } /*! \return The end iterator. */ const_iterator end() const { const auto* n = operator->(); - CHECK(n); + ICHECK(n); return n->end(); } /*! \return mutable pointers to the node. */ AnnotatedRegionSetNode* operator->() const { auto* ptr = get_mutable(); - CHECK(ptr != nullptr); + ICHECK(ptr != nullptr); return static_cast(ptr); } /*! \return The region an expression belongs to. */ AnnotatedRegion operator[](const Expr& expr) { const auto* n = operator->(); - CHECK(n); + ICHECK(n); return n->GetRegion(expr); } diff --git a/src/relay/analysis/call_graph.cc b/src/relay/analysis/call_graph.cc index 0d3fedcde0f7..9edb471f7f79 100644 --- a/src/relay/analysis/call_graph.cc +++ b/src/relay/analysis/call_graph.cc @@ -51,7 +51,7 @@ CallGraph::CallGraph(IRModule module) { } void CallGraphNode::AddToCallGraph(const GlobalVar& gv, const Function& func) { - CHECK(func.defined() && gv.defined()); + ICHECK(func.defined() && gv.defined()); // Add the current global function as an entry to the call grpah. CallGraphEntry* cg_node = LookupGlobalVar(gv); @@ -73,20 +73,20 @@ void CallGraphNode::AddToCallGraph(const GlobalVar& gv, const Function& func) { const CallGraphEntry* CallGraphNode::operator[](const GlobalVar& gv) const { const_iterator cit = call_graph_.find(gv); - CHECK(cit != call_graph_.end()) << "GlobalVar " << gv->name_hint - << " not found in the call graph!"; + ICHECK(cit != call_graph_.end()) + << "GlobalVar " << gv->name_hint << " not found in the call graph!"; return cit->second.get(); } CallGraphEntry* CallGraphNode::operator[](const GlobalVar& gv) { const_iterator cit = call_graph_.find(gv); - CHECK(cit != call_graph_.end()) << "GlobalVar " << gv->name_hint - << " not found in the call graph!"; + ICHECK(cit != call_graph_.end()) + << "GlobalVar " << gv->name_hint << " not found in the call graph!"; return cit->second.get(); } BaseFunc CallGraphNode::GetGlobalFunction(const GlobalVar& var) const { - CHECK(module->ContainGlobalVar(var->name_hint)) + ICHECK(module->ContainGlobalVar(var->name_hint)) << "GlobalVar " << var->name_hint << " not found in the current ir module"; return module->Lookup(var); } @@ -94,13 +94,13 @@ BaseFunc CallGraphNode::GetGlobalFunction(const GlobalVar& var) const { // Query the existence of a GlobalVar in the call graph. It creates an entry if // there is no such node available. 
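
// LookupGlobalVar below is the usual get-or-create idiom over a map of owning
// pointers: operator[] default-constructs an empty slot, the slot is populated
// at most once, and the ICHECK rejects names absent from the module. A hedged
// standalone sketch of the same idiom (Registry/Entry are invented names):

#include <memory>
#include <string>
#include <unordered_map>

struct Entry {
  std::string name;
};

class Registry {
 public:
  // Returns the entry for `key`, creating and owning it on first use.
  Entry* LookupOrCreate(const std::string& key) {
    std::unique_ptr<Entry>& slot = table_[key];  // inserts an empty slot if absent
    if (!slot) slot.reset(new Entry{key});
    return slot.get();
  }

 private:
  std::unordered_map<std::string, std::unique_ptr<Entry>> table_;
};
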
CallGraphEntry* CallGraphNode::LookupGlobalVar(const GlobalVar& gv) { - CHECK(gv.defined()); + ICHECK(gv.defined()); // This inserts an element to the call graph if it is not there yet. auto& call_graph_node = call_graph_[gv]; if (call_graph_node) return call_graph_node.get(); - CHECK(module->ContainGlobalVar(gv->name_hint)) + ICHECK(module->ContainGlobalVar(gv->name_hint)) << "GlobalVar " << gv->name_hint << " not found in the current ir module"; // Create the node for the inserted entry. @@ -118,7 +118,7 @@ void CallGraphNode::Print(std::ostream& os) const { GlobalVar CallGraphNode::RemoveGlobalVarFromModule(CallGraphEntry* cg_node, bool update_call_graph) { - CHECK(cg_node->empty() || (cg_node->IsRecursive() && cg_node->size() == 1)) + ICHECK(cg_node->empty() || (cg_node->IsRecursive() && cg_node->size() == 1)) << "Cannot remove global var " << cg_node->GetNameHint() << " from call graph, because it still calls " << cg_node->size() << " other global functions"; @@ -232,7 +232,7 @@ inline void CallGraphEntry::AddCalledGlobal(CallGraphEntry* cg_node) { // Remove an edge from the current global function to the callee. void CallGraphEntry::RemoveCallTo(const GlobalVar& callee) { for (auto it = begin();; ++it) { - CHECK(it != end()) << "Cannot find global function " << callee->name_hint << " to remove!"; + ICHECK(it != end()) << "Cannot find global function " << callee->name_hint << " to remove!"; if (it->second->GetGlobalVar() == callee) { // Only remove one occurrence of the call site. it->second->DecRef(); @@ -256,7 +256,7 @@ void CallGraphEntry::RemoveAllCallTo(CallGraphEntry* callee) { } } // Make sure all references to the callee are removed. - CHECK_EQ(callee->GetRefCount(), 0U) + ICHECK_EQ(callee->GetRefCount(), 0U) << "All references to " << callee->GetNameHint() << " should have been removed"; } @@ -291,7 +291,7 @@ TVM_REGISTER_NODE_TYPE(CallGraphNode); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { auto* node = static_cast(ref.get()); - CHECK(node); + ICHECK(node); p->stream << "CallGraph: \n" << GetRef(node); }); diff --git a/src/relay/analysis/call_graph.h b/src/relay/analysis/call_graph.h index 07b25278b1d6..7cc813ebbff1 100644 --- a/src/relay/analysis/call_graph.h +++ b/src/relay/analysis/call_graph.h @@ -218,25 +218,25 @@ class CallGraph : public ObjectRef { /*! \return The begin iterator. */ iterator begin() { auto* n = operator->(); - CHECK(n); + ICHECK(n); return n->begin(); } /*! \return The end iterator. */ iterator end() { auto* n = operator->(); - CHECK(n); + ICHECK(n); return n->end(); } /*! \return The begin iterator. */ const_iterator begin() const { const auto* n = operator->(); - CHECK(n); + ICHECK(n); return n->begin(); } /*! \return The end iterator. */ const_iterator end() const { const auto* n = operator->(); - CHECK(n); + ICHECK(n); return n->end(); } @@ -249,7 +249,7 @@ class CallGraph : public ObjectRef { */ const CallGraphEntry* operator[](const GlobalVar& gv) const { const auto* n = operator->(); - CHECK(n); + ICHECK(n); return (*n)[gv]; } /*! @@ -261,7 +261,7 @@ class CallGraph : public ObjectRef { */ CallGraphEntry* operator[](const GlobalVar& gv) { auto* n = operator->(); - CHECK(n); + ICHECK(n); return (*n)[gv]; } /*! @@ -273,7 +273,7 @@ class CallGraph : public ObjectRef { */ const CallGraphEntry* operator[](const std::string& gvar_name) const { const auto* n = operator->(); - CHECK(n); + ICHECK(n); return (*n)[gvar_name]; } /*! 
@@ -285,14 +285,14 @@ class CallGraph : public ObjectRef { */ CallGraphEntry* operator[](const std::string& gvar_name) { auto* n = operator->(); - CHECK(n); + ICHECK(n); return (*n)[gvar_name]; } /*! \return mutable pointers to the node. */ CallGraphNode* operator->() const { auto* ptr = get_mutable(); - CHECK(ptr != nullptr); + ICHECK(ptr != nullptr); return static_cast(ptr); } @@ -360,7 +360,7 @@ class CallGraphEntry { * \return The fetched CallGraphEntry. */ CallGraphEntry* operator[](size_t i) const { - CHECK_LT(i, called_globals_.size()) << "Invalid Index"; + ICHECK_LT(i, called_globals_.size()) << "Invalid Index"; return called_globals_[i].second; } @@ -452,7 +452,7 @@ class CallGraphEntry { private: /*! \brief Decrement the reference counter by 1. */ void DecRef() { - CHECK_GT(ref_cnt_, 0); + ICHECK_GT(ref_cnt_, 0); --ref_cnt_; } /*! \brief Increment the reference counter by 1. */ diff --git a/src/relay/analysis/context_analysis.cc b/src/relay/analysis/context_analysis.cc index 5fbd8a4d067f..a648b7af8fd3 100644 --- a/src/relay/analysis/context_analysis.cc +++ b/src/relay/analysis/context_analysis.cc @@ -151,7 +151,7 @@ DeviceDomainPtr Join(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs) { } else if (rhs->IsEmptyDomain()) { return lhs; } else { - CHECK(*lhs.get() == *rhs.get()) << "All expressions must have a singular device to unify"; + ICHECK(*lhs.get() == *rhs.get()) << "All expressions must have a singular device to unify"; return lhs; } } @@ -311,7 +311,7 @@ class ContextAnalyzer : public MixedModeVisitor { auto ty = let->value->checked_type(); if (ty->IsInstance()) { auto gv = ExtractClosure(let); - CHECK(gv.defined() && gv->IsInstance()); + ICHECK(gv.defined() && gv->IsInstance()); closures_[let->var] = Downcast(gv); } @@ -444,7 +444,7 @@ class ContextAnalyzer : public MixedModeVisitor { // Process device copy call node void UnifyDeviceCopyCall(const CallNode* call) { - CHECK_EQ(call->args.size(), 1U); + ICHECK_EQ(call->args.size(), 1U); std::vector inps{call->args[0]}; std::vector outs{GetRef(call)}; @@ -455,13 +455,13 @@ class ContextAnalyzer : public MixedModeVisitor { inps.push_back(fn->params[0]); outs.push_back(call->op); Expr body = fn->body; - CHECK(body->IsInstance() && IsDeviceCopy(body)); + ICHECK(body->IsInstance() && IsDeviceCopy(body)); Call call_body = Downcast(body); attrs = call_body->attrs.as(); } else { attrs = call->attrs.as(); } - CHECK(attrs != nullptr); + ICHECK(attrs != nullptr); src_dev_type = static_cast(attrs->src_dev_type); dst_dev_type = static_cast(attrs->dst_dev_type); @@ -474,7 +474,7 @@ class ContextAnalyzer : public MixedModeVisitor { void UnifyAllocStorageCall(const CallNode* call) { // [size, alignment] - CHECK_EQ(call->args.size(), 2U); + ICHECK_EQ(call->args.size(), 2U); // The arguments of alloc storage should be on CPU. 
for (int i = 0; i < 2; i++) { @@ -490,7 +490,7 @@ class ContextAnalyzer : public MixedModeVisitor { void UnifyAllocTensorCall(const CallNode* call) { // [storage, offset, shape] - CHECK_EQ(call->args.size(), 3U); + ICHECK_EQ(call->args.size(), 3U); Expr storage = call->args[0]; Expr shape = call->args[1]; @@ -503,7 +503,7 @@ class ContextAnalyzer : public MixedModeVisitor { void UnifyShapeFuncCall(const CallNode* call) { // [func, inputs, outputs] - CHECK_EQ(call->args.size(), 3U); + ICHECK_EQ(call->args.size(), 3U); auto shape_func_domain = DeviceType(cpu_ctx_); // No need to unify the op of a shape_func as shape_func doesn't @@ -523,7 +523,7 @@ class ContextAnalyzer : public MixedModeVisitor { void UnifyInvokeTVMOpCall(const CallNode* call) { // [op, inputs, outputs] - CHECK_EQ(call->args.size(), 3U); + ICHECK_EQ(call->args.size(), 3U); Tuple inps = Downcast(call->args[1]); Tuple outputs = Downcast(call->args[2]); UnifyCall(call->args[0], inps->fields, outputs->fields, Bottom()); @@ -532,7 +532,7 @@ class ContextAnalyzer : public MixedModeVisitor { void UnifyShapeOfCall(const CallNode* call) { // vm shape_of is always on the CPU. - CHECK_EQ(call->args.size(), 1U); + ICHECK_EQ(call->args.size(), 1U); MixedModeVisitor::VisitExpr(call->args[0]); // Note we don't unify the input of a shape_of with the cpu domain. This is // because vm.shape_of has a native instruction to compute the shape of @@ -544,7 +544,7 @@ class ContextAnalyzer : public MixedModeVisitor { void UnifyReshapeTensorCall(const CallNode* call) { // [data, shape] - CHECK_EQ(call->args.size(), 2U); + ICHECK_EQ(call->args.size(), 2U); Expr data = call->args[0]; Expr shape = call->args[1]; Unify(DeviceFor(GetRef(call)), DeviceFor(data)); @@ -583,10 +583,10 @@ class ContextAnalyzer : public MixedModeVisitor { // Invoke a global function. void UnifyGlobalVarCall(const CallNode* call) { auto device = DeviceFor(GetRef(call)); - CHECK(mod_.defined()) << "Cannot analyze context on a globalvar without module"; + ICHECK(mod_.defined()) << "Cannot analyze context on a globalvar without module"; GlobalVar gv = Downcast(call->op); auto func = Downcast(mod_->Lookup(gv)); - CHECK_EQ(call->args.size(), func->params.size()) + ICHECK_EQ(call->args.size(), func->params.size()) << "The number of arguments doesn't match the number of parameters of the function."; for (size_t i = 0; i < call->args.size(); i++) { @@ -596,14 +596,14 @@ class ContextAnalyzer : public MixedModeVisitor { // Save the the arg to function mapping for closures as it will // be invoked/unified later. - CHECK(arg->checked_type().defined()) + ICHECK(arg->checked_type().defined()) << "Type inference is required to run the context analysis passes."; if (arg->checked_type()->IsInstance()) { auto it = closures_.find(arg); if (it != closures_.end()) { closures_[param] = it->second; } else { - CHECK(arg->IsInstance()); + ICHECK(arg->IsInstance()); closures_[param] = Downcast(arg); } } @@ -631,9 +631,9 @@ class ContextAnalyzer : public MixedModeVisitor { // Unify the corresponding arguement and parameter. 
auto device = DeviceFor(GetRef(call)); auto it = closures_.find(call->op); - CHECK(it != closures_.end()) << "Cannot find var: " << call->op; + ICHECK(it != closures_.end()) << "Cannot find var: " << call->op; auto glb_var = it->second; - CHECK(mod_.defined()) << "Cannot analyze context on a globalvar without module"; + ICHECK(mod_.defined()) << "Cannot analyze context on a globalvar without module"; Function func = Downcast(mod_->Lookup(glb_var)); // Unify the underlying function for clousre or currying functions. while (IsClosure(func) || IsCurrying(func)) { @@ -648,7 +648,7 @@ class ContextAnalyzer : public MixedModeVisitor { } } - CHECK_EQ(call->args.size(), func->params.size()); + ICHECK_EQ(call->args.size(), func->params.size()); for (size_t i = 0; i < call->args.size(); i++) { Unify(DeviceFor(call->args[i]), DeviceFor(func->params[i])); MixedModeVisitor::VisitExpr(call->args[i]); diff --git a/src/relay/analysis/dependency_graph.cc b/src/relay/analysis/dependency_graph.cc index de61800d8c52..3a4fb59475a4 100644 --- a/src/relay/analysis/dependency_graph.cc +++ b/src/relay/analysis/dependency_graph.cc @@ -50,7 +50,7 @@ class DependencyGraph::Creator : private ExprFunctor { void Depend(DependencyGraph::Node* parent, const Expr& child) { VisitExpr(child); - CHECK_NE(graph_.expr_node.count(child), 0); + ICHECK_NE(graph_.expr_node.count(child), 0); Depend(parent, graph_.expr_node[child]); } diff --git a/src/relay/analysis/feature.cc b/src/relay/analysis/feature.cc index b3516e965b85..f72b4e105749 100644 --- a/src/relay/analysis/feature.cc +++ b/src/relay/analysis/feature.cc @@ -114,7 +114,7 @@ std::string FeatureSet::ToString() const { DETECT_FEATURE(fGraph); DETECT_FEATURE(fLetRec); #undef DETECT_FEATURE - CHECK(detected == feature_count) << "some feature not printed"; + ICHECK(detected == feature_count) << "some feature not printed"; ret += "]"; return ret; } @@ -139,8 +139,8 @@ TVM_REGISTER_GLOBAL("relay.analysis.detect_feature").set_body_typed(PyDetectFeat void CheckFeature(const Expr& expr, const FeatureSet& fs) { auto dfs = DetectFeature(expr); - CHECK(dfs.is_subset_of(fs)) << AsText(expr, false) - << "\nhas unsupported feature: " << (dfs - fs).ToString(); + ICHECK(dfs.is_subset_of(fs)) << AsText(expr, false) + << "\nhas unsupported feature: " << (dfs - fs).ToString(); } void CheckFeature(const IRModule& mod, const FeatureSet& fs) { diff --git a/src/relay/analysis/get_calibration_data.cc b/src/relay/analysis/get_calibration_data.cc index 34d0d0002b6a..70fe2a68f21e 100644 --- a/src/relay/analysis/get_calibration_data.cc +++ b/src/relay/analysis/get_calibration_data.cc @@ -52,7 +52,7 @@ class Collector : public ExprRewriter { // intrinsic functions are excluded for now if (call->op->IsInstance()) { auto var = Downcast(call->op); - CHECK(module_->ContainGlobalVar(var->name_hint)) << "Function " << var << " is not defined"; + ICHECK(module_->ContainGlobalVar(var->name_hint)) << "Function " << var << " is not defined"; // we only handle functions with Compiler attribute set auto func = Downcast(module_->Lookup(var)); if (func->GetAttr(attr::kCompiler)) { @@ -74,10 +74,10 @@ class Collector : public ExprRewriter { Expr FlattenOutputTuple(const Array& exprs) { Array fields; for (const auto& it : exprs) { - CHECK(it->checked_type_.defined()); + ICHECK(it->checked_type_.defined()); if (auto* tn = it->checked_type_.as()) { // TODO(seanlatias): for now input argument cannot be a tuple - CHECK(it->IsInstance()); + ICHECK(it->IsInstance()); for (size_t i = 0; i < tn->fields.size(); i++) { 
fields.push_back(TupleGetItem(it, i)); } @@ -140,8 +140,8 @@ class OutputMapper : public ExprRewriter { Expr Rewrite_(const CallNode* call, const Expr& post) final { if (call->op->IsInstance()) { auto var = Downcast(call->op); - CHECK(module_->ContainGlobalVar(var->name_hint)) << "Function " << var << " is not defined"; - CHECK_EQ(output_map_->count(var), 0) + ICHECK(module_->ContainGlobalVar(var->name_hint)) << "Function " << var << " is not defined"; + ICHECK_EQ(output_map_->count(var), 0) << "Repeated function call " << var << " is not supported."; auto func = Downcast(module_->Lookup(var)); // we only handle functions with Compiler attribute set diff --git a/src/relay/analysis/mac_count.cc b/src/relay/analysis/mac_count.cc index 5e35ab7ba62d..29edf55812cc 100644 --- a/src/relay/analysis/mac_count.cc +++ b/src/relay/analysis/mac_count.cc @@ -65,24 +65,24 @@ int64_t ConvMacCount(const Call& call_node) { return 0; } Array args = call_node->args; - CHECK_EQ(args.size(), 2) << "The number of input arguments of a CONV 2D node should be 2."; + ICHECK_EQ(args.size(), 2) << "The number of input arguments of a CONV 2D node should be 2."; const auto* conv_2d_attr = call_node->attrs.as(); const auto* data_type = args[0]->checked_type().as(); Array data_shape = data_type->shape; std::string data_layout = conv_2d_attr->data_layout; int32_t C_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('C')); int32_t c_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('c')); - CHECK_NE(C_ind, -1) << "There is no input channel dimension."; + ICHECK_NE(C_ind, -1) << "There is no input channel dimension."; int64_t input_channel = static_cast(data_shape[C_ind].as()->value); if (c_ind != -1) input_channel *= static_cast(data_shape[c_ind].as()->value); Array kernel_size = conv_2d_attr->kernel_size; - CHECK_EQ(kernel_size.size(), 2) << "The dimension of the kernel in Conv 2D should be 2."; + ICHECK_EQ(kernel_size.size(), 2) << "The dimension of the kernel in Conv 2D should be 2."; const auto* expr = call_node->checked_type().as(); Array output_tensor = expr->shape; - CHECK(output_tensor.size() == 4 || output_tensor.size() == 5) + ICHECK(output_tensor.size() == 4 || output_tensor.size() == 5) << "The dimension of the output tensor in Conv 2D should be 4 or 5."; int64_t count = GetCartesianProd(output_tensor) * GetCartesianProd(kernel_size); - CHECK_EQ(input_channel % conv_2d_attr->groups, 0) + ICHECK_EQ(input_channel % conv_2d_attr->groups, 0) << "The number of input channels is not divisble by groups."; count *= input_channel / conv_2d_attr->groups; return count; @@ -94,7 +94,7 @@ int64_t Conv2dTransposeMacCount(const Call& call_node) { return 0; } Array args = call_node->args; - CHECK_EQ(args.size(), 2) + ICHECK_EQ(args.size(), 2) << "The number of input arguments of a CONV 2D Transpose node should be 2."; const auto* conv_2d_transpose_attr = call_node->attrs.as(); const auto* data_type = args[0]->checked_type().as(); @@ -102,18 +102,18 @@ int64_t Conv2dTransposeMacCount(const Call& call_node) { std::string data_layout = conv_2d_transpose_attr->data_layout; int32_t C_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('C')); int32_t c_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('c')); - CHECK_NE(C_ind, -1) << "There is no input channel dimension."; + ICHECK_NE(C_ind, -1) << "There is no input channel dimension."; int64_t input_channel = static_cast(data_shape[C_ind].as()->value); if (c_ind != -1) input_channel *= static_cast(data_shape[c_ind].as()->value); Array kernel_size = conv_2d_transpose_attr->kernel_size; 
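Before the transpose variant finishes below, a worked instance of the count ConvMacCount just computed: MACs = |output| * kh * kw * (C_in / groups). The layer shape is hypothetical (NCHW input 1x64x56x56, 128 filters of 3x3, groups = 1, so the output is 1x128x56x56):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Every output element is one dot product: N * C_out * H_out * W_out.
      int64_t output_elems = 1LL * 128 * 56 * 56;
      // Each dot product covers kh * kw * (C_in / groups) input values.
      int64_t per_output = 3LL * 3 * (64 / 1);
      std::cout << output_elems * per_output << "\n";  // 231211008, ~0.23 GMACs
      return 0;
    }

(Conv2dTransposeMacCount continues below.)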
- CHECK_EQ(kernel_size.size(), 2) + ICHECK_EQ(kernel_size.size(), 2) << "The dimension of the kernel in Conv 2D Transpose should be 2."; const auto* expr = call_node->checked_type().as(); Array output_tensor = expr->shape; - CHECK(output_tensor.size() == 4 || output_tensor.size() == 5) + ICHECK(output_tensor.size() == 4 || output_tensor.size() == 5) << "The dimension of the output tensor in Conv 2D Transpose should be 4 or 5."; int64_t count = GetCartesianProd(output_tensor) * GetCartesianProd(kernel_size); - CHECK_EQ(input_channel % conv_2d_transpose_attr->groups, 0) + ICHECK_EQ(input_channel % conv_2d_transpose_attr->groups, 0) << "The number of input channels is not divisble by groups."; count *= input_channel / conv_2d_transpose_attr->groups; return count; @@ -125,18 +125,18 @@ int64_t DenseMacCount(const Call& call_node) { return 0; } Array args = call_node->args; - CHECK_EQ(args.size(), 2) << "The number of input arguments of a Dense node should be 2."; + ICHECK_EQ(args.size(), 2) << "The number of input arguments of a Dense node should be 2."; const auto* data_type = args[0]->checked_type().as(); const auto* weight_type = args[1]->checked_type().as(); Array data_shape = data_type->shape; Array weight_shape = weight_type->shape; - CHECK(data_shape.size() == 2 && weight_shape.size() == 2) + ICHECK(data_shape.size() == 2 && weight_shape.size() == 2) << "The dimension of an input tensor to Dense node should be 2."; int64_t d1 = static_cast(data_shape[0].as()->value); int64_t d2 = static_cast(data_shape[1].as()->value); int64_t d3 = static_cast(weight_shape[0].as()->value); int64_t d4 = static_cast(weight_shape[1].as()->value); - CHECK_EQ(d2, d4) << "The dimensions of input arguments do not match."; + ICHECK_EQ(d2, d4) << "The dimensions of input arguments do not match."; int64_t count = d1 * d2 * d3; return count; } @@ -147,7 +147,7 @@ int64_t BatchMatmulMacCount(const Call& call_node) { return 0; } Array args = call_node->args; - CHECK_EQ(args.size(), 2); + ICHECK_EQ(args.size(), 2); Array x_shape = args[0]->checked_type().as()->shape; Array y_shape = args[1]->checked_type().as()->shape; int64_t batch = x_shape[0].as()->value; diff --git a/src/relay/analysis/match_exhaustion.cc b/src/relay/analysis/match_exhaustion.cc index e852c40dfeba..bb6e8f14ca09 100644 --- a/src/relay/analysis/match_exhaustion.cc +++ b/src/relay/analysis/match_exhaustion.cc @@ -68,7 +68,7 @@ class CandidateChecker : public PatternFunctorpatterns.size(), ctor_cand->patterns.size()); + ICHECK_EQ(op->patterns.size(), ctor_cand->patterns.size()); bool unspecified = false; for (size_t i = 0; i < op->patterns.size(); i++) { MatchResult submatch = this->Check(op->patterns[i], ctor_cand->patterns[i]); @@ -95,7 +95,7 @@ class CandidateChecker : public PatternFunctorpatterns.size(), tuple_cand->patterns.size()); + ICHECK_EQ(op->patterns.size(), tuple_cand->patterns.size()); bool unspecified = false; for (size_t i = 0; i < op->patterns.size(); i++) { MatchResult submatch = this->Check(op->patterns[i], tuple_cand->patterns[i]); @@ -126,7 +126,7 @@ class CandidateChecker : public PatternFunctor> CartesianProduct(Array> fields) { - CHECK_NE(fields.size(), 0); + ICHECK_NE(fields.size(), 0); Array field_vals = fields[fields.size() - 1]; Array> ret; diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc index 8c1cc92fe009..55f736895018 100644 --- a/src/relay/analysis/type_solver.cc +++ b/src/relay/analysis/type_solver.cc @@ -114,14 +114,14 @@ class TypeSolver::Unifier : public TypeFunctor { } if 
(lhs->resolved_type.as()) { - CHECK(!OccursCheck(lhs, rhs->resolved_type)) + ICHECK(!OccursCheck(lhs, rhs->resolved_type)) << "Incomplete type " << lhs->resolved_type << " occurs in " << rhs->resolved_type << ", cannot unify"; solver_->MergeFromTo(lhs, rhs); return rhs->resolved_type; } else if (rhs->resolved_type.as()) { - CHECK(!OccursCheck(rhs, lhs->resolved_type)) + ICHECK(!OccursCheck(rhs, lhs->resolved_type)) << "Incomplete type " << rhs->resolved_type << " occurs in " << lhs->resolved_type << ", cannot unify"; solver_->MergeFromTo(rhs, lhs); @@ -242,7 +242,7 @@ class TypeSolver::Unifier : public TypeFunctor { std::vector> mismatches; - CHECK_EQ(tt1->shape.size(), tt2->shape.size()); + ICHECK_EQ(tt1->shape.size(), tt2->shape.size()); for (size_t i = 0; i < tt1->shape.size(); i++) { auto dim = UnifyDim(tt1->shape[i], tt2->shape[i]); if (!dim.defined()) { @@ -328,8 +328,8 @@ class TypeSolver::Unifier : public TypeFunctor { for (size_t i = 0; i < ft1->type_constraints.size(); ++i) { Type unified_constraint = Unify(ft1->type_constraints[i], ft2->type_constraints[i]); const auto* tcn = unified_constraint.as(); - CHECK(tcn) << "Two type constraints unified into a non-constraint?" - << ft1->type_constraints[i] << " and " << ft2->type_constraints[i]; + ICHECK(tcn) << "Two type constraints unified into a non-constraint?" + << ft1->type_constraints[i] << " and " << ft2->type_constraints[i]; type_constraints.push_back(GetRef(tcn)); } @@ -527,7 +527,7 @@ TypeSolver::TypeSolver(const GlobalVar& current_func, DiagnosticContext diag_ctx current_func(current_func), diag_ctx_(diag_ctx), module_(diag_ctx->module) { - CHECK(module_.defined()); + ICHECK(module_.defined()); } // destructor @@ -593,12 +593,12 @@ bool TypeSolver::Solve() { RelationNode* rnode = update_queue_.front(); const auto& rel = rnode->rel; update_queue_.pop(); - CHECK(!rnode->resolved); + ICHECK(!rnode->resolved); // update the relation with given evidence. 
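The occurs check guarding both branches at the top of this hunk is the standard Hindley-Milner safeguard: an unresolved IncompleteType must not unify with a type that contains it, or resolution would produce an infinite type (the classic a = List[a]). A self-contained sketch of the check on a toy type representation; TVM's Unifier performs the same walk with a visitor over TypeNode:

    #include <memory>
    #include <vector>

    // Toy type: a variable when var_id >= 0, otherwise a constructor whose
    // children live in args (function parameters, tuple fields, ...).
    struct Ty {
      int var_id = -1;
      std::vector<std::shared_ptr<Ty>> args;
    };

    // True if variable v occurs anywhere inside t; binding v := t in that
    // case must be rejected, exactly as the ICHECKs above do.
    bool Occurs(int v, const std::shared_ptr<Ty>& t) {
      if (t->var_id == v) return true;
      for (const auto& child : t->args) {
        if (Occurs(v, child)) return true;
      }
      return false;
    }

(TypeSolver::Solve resumes below.)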
Array args; for (auto* tlink = rnode->type_list.head; tlink != nullptr; tlink = tlink->next) { args.push_back(Resolve(tlink->value->FindRoot()->resolved_type)); - CHECK_LE(args.size(), rel->args.size()); + ICHECK_LE(args.size(), rel->args.size()); } // We need to set this in order to understand where unification diff --git a/src/relay/analysis/type_solver.h b/src/relay/analysis/type_solver.h index 1fc0525d6bca..4ae2e6a2b07b 100644 --- a/src/relay/analysis/type_solver.h +++ b/src/relay/analysis/type_solver.h @@ -208,7 +208,7 @@ class TypeSolver { */ void AddToQueue(RelationNode* rel) { if (rel->inqueue) return; - CHECK(!rel->resolved); + ICHECK(!rel->resolved); rel->inqueue = true; update_queue_.push(rel); } diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc index edf8fb644c57..bcfbc83da514 100644 --- a/src/relay/analysis/util.cc +++ b/src/relay/analysis/util.cc @@ -358,9 +358,9 @@ std::unordered_map GetExprRefCount(const Expr& body) { template bool IsNDArrayAllGreaterEqual(const runtime::NDArray& tensor, T value) { - CHECK_EQ(tensor->ctx.device_type, kDLCPU); - CHECK(tensor->strides == nullptr); - CHECK_EQ(tensor->byte_offset, 0); + ICHECK_EQ(tensor->ctx.device_type, kDLCPU); + ICHECK(tensor->strides == nullptr); + ICHECK_EQ(tensor->byte_offset, 0); const T* data = static_cast(tensor->data); int64_t num_elems = 1; for (int i = 0; i < tensor->ndim; ++i) { @@ -446,10 +446,10 @@ Expr TypeSubst(const Expr& expr, const tvm::Map& subst_map) { private: const tvm::Map& subst_map_; }; - CHECK(WellFormed(expr)); + ICHECK(WellFormed(expr)); auto ret = TypeSubstMutator(subst_map).VisitExpr(expr); - CHECK_EQ(FreeVars(expr).size(), FreeVars(ret).size()); - CHECK(WellFormed(ret)); + ICHECK_EQ(FreeVars(expr).size(), FreeVars(ret).size()); + ICHECK(WellFormed(ret)); return ret; } diff --git a/src/relay/analysis/well_formed.cc b/src/relay/analysis/well_formed.cc index 0b6e043a0d21..856c5dc7aac1 100644 --- a/src/relay/analysis/well_formed.cc +++ b/src/relay/analysis/well_formed.cc @@ -59,9 +59,9 @@ class WellFormedChecker : private MixedModeVisitor, PatternVisitor { WellFormedChecker* wfc; explicit Scope(WellFormedChecker* wfc) : wfc(wfc) { wfc->scope.push_back({{}}); } ~Scope() { - CHECK_GE(wfc->scope.size(), 0); + ICHECK_GE(wfc->scope.size(), 0); for (const Var& v : wfc->scope.back()) { - CHECK_GE(wfc->current_bound.count(v), 0); + ICHECK_GE(wfc->current_bound.count(v), 0); wfc->current_bound.erase(v); } wfc->scope.pop_back(); @@ -73,7 +73,7 @@ class WellFormedChecker : private MixedModeVisitor, PatternVisitor { Illformed(Diagnostic::Error(v->span) << "the variable " << v->name_hint() << "is bound more then once, this is not valid IR"); } - CHECK_GE(scope.size(), 0); + ICHECK_GE(scope.size(), 0); scope.back().insert(v); current_bound.insert(v); total_bound.insert(v); @@ -120,14 +120,14 @@ class WellFormedChecker : private MixedModeVisitor, PatternVisitor { } void VisitExpr_(const CallNode* call) final { - CHECK(call->op.defined()); + ICHECK(call->op.defined()); for (auto arg : call->args) { - CHECK(arg.defined()); + ICHECK(arg.defined()); } - // CHECK(call->attrs.defined()); - CHECK(call->type_args.defined()); + // ICHECK(call->attrs.defined()); + ICHECK(call->type_args.defined()); MixedModeVisitor::VisitExpr_(call); } diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 64f1253ff9db..ddea5456585b 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -124,7 +124,7 @@ class RelayBuildModule : public runtime::ModuleNode 
{ [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetModule(); }); } else if (name == "build") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.num_args, 3); + ICHECK_EQ(args.num_args, 3); this->Build(args[0], args[1], args[2]); }); } else if (name == "list_params") { @@ -150,7 +150,7 @@ class RelayBuildModule : public runtime::ModuleNode { }); } else if (name == "optimize") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.num_args, 2); + ICHECK_EQ(args.num_args, 2); *rv = this->Optimize(args[0], args[1], this->params_); }); } else { @@ -244,7 +244,7 @@ class RelayBuildModule : public runtime::ModuleNode { ICHECK(relay_module.defined()) << "The IRModule must be defined for the Relay compiler."; if (params.size()) { - CHECK(relay_module->ContainGlobalVar("main")) << "Missing the main entry function"; + ICHECK(relay_module->ContainGlobalVar("main")) << "Missing the main entry function"; GlobalVar main_glb_var = relay_module->GetGlobalVar("main"); Function main_func = Downcast(relay_module->Lookup(main_glb_var)); auto new_main = BindParamsByName(main_func, params); @@ -319,7 +319,7 @@ class RelayBuildModule : public runtime::ModuleNode { Optional opt_fallback_dev = pass_ctx->GetConfig("relay.fallback_device_type", Integer(static_cast(kDLCPU))); auto fallback_dev = opt_fallback_dev.value(); - CHECK_GT(fallback_dev->value, 0U); + ICHECK_GT(fallback_dev->value, 0U); relay_module = RunDeviceAnnotationPass(relay_module, fallback_dev->value); } @@ -335,7 +335,7 @@ class RelayBuildModule : public runtime::ModuleNode { relay_module = transform::Inline()(relay_module); relay_module = transform::InferType()(relay_module); - CHECK(relay_module.defined()); + ICHECK(relay_module.defined()); return relay_module; } @@ -383,7 +383,7 @@ class RelayBuildModule : public runtime::ModuleNode { UpdateHeterogeneousInputs(fallback_device); auto rewrite = transform::RewriteAnnotatedOps(fallback_device); auto updated_module = rewrite(relay_module); - CHECK(updated_module.defined()); + ICHECK(updated_module.defined()); tvm::Map device_map; for (const auto& it : updated_module->functions) { @@ -408,11 +408,11 @@ class RelayBuildModule : public runtime::ModuleNode { break; } for (auto kv : annotation_map) { - CHECK_EQ(kv.second->value, dev_type) << "Expressions in the function are " - << "annotated with various device types," - << "but not device copy operators " - << "found. Please check the " - << "RewriteAnnotation pass."; + ICHECK_EQ(kv.second->value, dev_type) << "Expressions in the function are " + << "annotated with various device types," + << "but not device copy operators " + << "found. 
Please check the " + << "RewriteAnnotation pass."; } targets_.Set(0, CreateDefaultTarget(dev_type)); } diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index d720e94ddc75..556687c453ac 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -79,8 +79,8 @@ Array GetShape(const Array& shape) { const int64_t* pval = tir::as_const_int(val); if (pval != nullptr) { #ifndef TVM_INDEX_DEFAULT_I64 - CHECK_LE(pval[0], std::numeric_limits::max()); - CHECK_GE(pval[0], std::numeric_limits::min()); + ICHECK_LE(pval[0], std::numeric_limits::max()); + ICHECK_GE(pval[0], std::numeric_limits::min()); res.push_back(IntImm(DataType::Int(32), *pval)); #else res.push_back(val); @@ -116,7 +116,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> for (Type field : tuple_type->fields) { const auto* ttype = field.as(); // TODO(@icemelon): Allow recursive tuple - CHECK(ttype != nullptr); + ICHECK(ttype != nullptr); tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype); cache_node->inputs.push_back(tensor); inputs.push_back(tensor); @@ -135,7 +135,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> candidate_name = truncated_name.str(); } cache_node->func_name = candidate_name; - CHECK(anchor_op_.defined()); + ICHECK(anchor_op_.defined()); // Fusion over tupled results may leave identity relationships // between inputs and outputs, and those should not be scheduled. // Hence schedule only non PlaceholderOp outputs. @@ -148,7 +148,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> te::Schedule schedule; // No need to register schedule for device copy op. if (anchor_attrs_.as() == nullptr) { - CHECK(anchor_implementation_.defined()); + ICHECK(anchor_implementation_.defined()); schedule = anchor_implementation_.Schedule(anchor_attrs_, tensor_outs, target_); for (const auto& scalar : scalars_) { if (schedule->Contain(scalar)) { @@ -167,7 +167,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> Array VisitExpr_(const ConstantNode* op) final { using tir::make_const; - CHECK(op->is_scalar()); + ICHECK(op->is_scalar()); void* data = op->data->data; DataType dtype = DataType(op->data->dtype); auto value = te::compute( @@ -196,7 +196,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> Array VisitExpr_(const CallNode* call_node) final { static auto fpattern = Op::GetAttrMap("TOpPattern"); static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call"); - CHECK(flower_call) << "relay.backend.lower_call is not registered."; + ICHECK(flower_call) << "relay.backend.lower_call is not registered."; Array inputs; int count_tuple = 0; @@ -209,10 +209,10 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> } } if (count_tuple) { - CHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input"; + ICHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input"; } - CHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; + ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; Op op = Downcast(call_node->op); Array outputs; @@ -229,7 +229,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> int op_pattern = fpattern[op]; if (op_pattern >= kCommReduce) { - CHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) + ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) << "Two 
complicated op in a primitive function " << " anchor=" << anchor_op_ << " current=" << op; } @@ -241,8 +241,8 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> } if (outputs.size() != 1) { const auto* tuple_type = call_node->checked_type().as(); - CHECK(tuple_type) << "Expect output to be a tuple type"; - CHECK_EQ(tuple_type->fields.size(), outputs.size()); + ICHECK(tuple_type) << "Expect output to be a tuple type"; + ICHECK_EQ(tuple_type->fields.size(), outputs.size()); } // Set the name to `__copy`. It will be detected in graph runtime to perform // data copy across devices. @@ -262,7 +262,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> Array VisitExpr_(const LetNode* op) final { Array val = VisitExpr(op->value); - CHECK(!memo_.count(op->var)); + ICHECK(!memo_.count(op->var)); memo_[op->var] = val; return VisitExpr(op->body); } @@ -270,9 +270,9 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> Array VisitExpr_(const TupleNode* op) final { Array fields; for (Expr field : op->fields) { - CHECK(field->checked_type().as()) << "Only allow Tuple of Tensor"; + ICHECK(field->checked_type().as()) << "Only allow Tuple of Tensor"; Array res = VisitExpr(field); - CHECK_EQ(res.size(), 1); + ICHECK_EQ(res.size(), 1); fields.push_back(res[0]); } return fields; @@ -281,9 +281,9 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> Array VisitExpr_(const TupleGetItemNode* op) final { const auto* tuple_type = op->tuple->type_as(); Array tuple = VisitExpr(op->tuple); - CHECK_EQ(tuple_type->fields.size(), tuple.size()); - CHECK_GE(op->index, 0); - CHECK_LT(static_cast(op->index), tuple.size()); + ICHECK_EQ(tuple_type->fields.size(), tuple.size()); + ICHECK_GE(op->index, 0); + ICHECK_LT(static_cast(op->index), tuple.size()); return {tuple[op->index]}; } @@ -332,10 +332,10 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> // flatten tuple of tensor type. 
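Both ScheduleGetter above and MakeShapeFunc below flatten a tuple-typed parameter into one te::placeholder per tensor field; recursive tuples are still a TODO at this point in the codebase. A simplified sketch of that flattening step, assuming the header layout of this era of TVM:

    #include <tvm/ir/tensor_type.h>
    #include <tvm/ir/type.h>
    #include <tvm/support/logging.h>
    #include <tvm/te/operation.h>

    // One placeholder per tensor field; the real code also clamps 64-bit
    // shape constants and records the param -> tensors mapping in memo_.
    tvm::Array<tvm::te::Tensor> FlattenTupleParam(const tvm::TupleTypeNode* tuple_type) {
      tvm::Array<tvm::te::Tensor> inputs;
      for (tvm::Type field : tuple_type->fields) {
        const auto* ttype = field.as<tvm::TensorTypeNode>();
        ICHECK(ttype != nullptr) << "recursive tuples are not supported here";
        inputs.push_back(tvm::te::placeholder(ttype->shape, ttype->dtype));
      }
      return inputs;
    }

(MakeShapeFunc's own flattening resumes below.)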
const auto* tuple_type = param->type_as(); // TODO(@icemelon): Support recursive tuple - CHECK(tuple_type); + ICHECK(tuple_type); for (Type field : tuple_type->fields) { const auto* ttype = field.as(); - CHECK(ttype); + ICHECK(ttype); add_placeholder(ttype); } } @@ -405,7 +405,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> LOG(FATAL) << "Free variable " << var->name_hint(); return {}; } else { - CHECK(data_dependants_.size()); + ICHECK(data_dependants_.size()); bool data_dependant = data_dependants_.back(); if (data_dependant) { param_states_[var] |= kNeedInputData; @@ -419,8 +419,8 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> Array VisitExpr_(const ConstantNode* op) final { using tir::make_const; - CHECK(data_dependants_.size()); - CHECK(op->is_scalar()); + ICHECK(data_dependants_.size()); + ICHECK(op->is_scalar()); bool data_dependant = data_dependants_.back(); if (data_dependant) { void* data = op->data->data; @@ -458,13 +458,13 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> Array VisitExpr_(const CallNode* call_node) final { static auto fshape_func = Op::GetAttrMap("FShapeFunc"); static auto tshape_data_dependant = Op::GetAttrMap("TShapeDataDependant"); - CHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; + ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; Op op = Downcast(call_node->op); - CHECK(data_dependants_.empty() || !data_dependants_.back()) + ICHECK(data_dependants_.empty() || !data_dependants_.back()) << "Error in op fusion: output of the shape func is fed to a " << "data-dependant shape func"; - CHECK_GT(fshape_func.count(op), 0) << "Internal error, cannot find ShapeFunc for " << op->name; - CHECK_GT(tshape_data_dependant.count(op), 0) + ICHECK_GT(fshape_func.count(op), 0) << "Internal error, cannot find ShapeFunc for " << op->name; + ICHECK_GT(tshape_data_dependant.count(op), 0) << "Internal error, cannot find TShapeDataDependant for " << op->name; data_dependants_.push_back(IsDataDependant(call_node)); @@ -480,7 +480,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> } } if (count_tuple) { - CHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input"; + ICHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input"; } // Get output ndims auto ret_type = call_node->checked_type(); @@ -490,10 +490,10 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> } else { auto rtype = ret_type.as(); // TODO(@icemelon): Allow recursive tuple - CHECK(rtype); + ICHECK(rtype); for (size_t i = 0; i < rtype->fields.size(); ++i) { auto ttype = rtype->fields[i].as(); - CHECK(ttype); + ICHECK(ttype); out_ndims.push_back(IntImm(DataType::Int(32), ttype->shape.size())); } } @@ -511,7 +511,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> Array VisitExpr_(const LetNode* op) final { Array val = VisitExpr(op->value); - CHECK(!memo_.count(op->var)); + ICHECK(!memo_.count(op->var)); memo_[op->var] = val; return VisitExpr(op->body); } @@ -519,9 +519,9 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> Array VisitExpr_(const TupleNode* op) final { Array fields; for (Expr field : op->fields) { - CHECK(field->checked_type().as()) << "Only allow Tuple of Tensor"; + ICHECK(field->checked_type().as()) << "Only allow Tuple of Tensor"; Array res = VisitExpr(field); - CHECK_EQ(res.size(), 1); + ICHECK_EQ(res.size(), 1); fields.push_back(res[0]); } return fields; @@ 
-579,34 +579,34 @@ class CompileEngineImpl : public CompileEngineNode { std::vector cached_ext_funcs; for (const auto& it : cache_) { auto src_func = it.first->source_func; - CHECK(src_func.defined()); + ICHECK(src_func.defined()); if (src_func->GetAttr(attr::kCompiler).defined()) { auto code_gen = src_func->GetAttr(attr::kCompiler); - CHECK(code_gen.defined()) << "No external codegen is set"; + ICHECK(code_gen.defined()) << "No external codegen is set"; std::string code_gen_name = code_gen.value(); cached_ext_funcs.push_back(it.first); auto symbol_name = src_func->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(symbol_name.defined()) << "No external symbol is set for:\n" - << AsText(src_func, false); + ICHECK(symbol_name.defined()) << "No external symbol is set for:\n" + << AsText(src_func, false); std::string sn = symbol_name.value(); if (cached_symbol.count(sn)) { cached_symbol[sn] = code_gen_name; } else { - CHECK_NE(sn, code_gen_name) + ICHECK_NE(sn, code_gen_name) << "Found duplicated symbol: " << sn << " for: " << code_gen_name; } std::string ext_name = "relay.ext." + code_gen_name; auto pf = tvm::runtime::Registry::Get(ext_name); - CHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n"; + ICHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n"; // No need to keep compiler attribute at this point, functions have been // extracted for specific codegen. src_func = WithAttr(std::move(src_func), attr::kCompiler, NullValue()); runtime::Module ext_mod = (*pf)(src_func); - CHECK(ext_mod.defined()) << "No external runtime is generated."; + ICHECK(ext_mod.defined()) << "No external runtime is generated."; ret.push_back(ext_mod); } } @@ -661,7 +661,7 @@ class CompileEngineImpl : public CompileEngineNode { if (key->source_func->GetAttr(attr::kCompiler).defined()) { auto cache_node = make_object(); const auto name_node = key->source_func->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(name_node.defined()) << "External function has not been attached a name yet."; + ICHECK(name_node.defined()) << "External function has not been attached a name yet."; cache_node->func_name = std::string(name_node.value()); cache_node->target = Target("ext_dev"); cache_node->funcs->Add(GlobalVar(cache_node->func_name), key->source_func); @@ -671,7 +671,7 @@ class CompileEngineImpl : public CompileEngineNode { // Enforce use the target. With target_scope(key->target); - CHECK(!value->cached_func.defined()); + ICHECK(!value->cached_func.defined()); auto cfunc = CreateSchedule(key->source_func, key->target); auto cache_node = make_object(*(cfunc.operator->())); @@ -720,7 +720,7 @@ class CompileEngineImpl : public CompileEngineNode { // Enforce use the target. 
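LowerExternalFunctions above resolves a function's kCompiler attribute to a packed function named "relay.ext." plus that attribute, so wiring in a new external codegen amounts to one registration. A hedged sketch with a made-up backend; MyBackendCompiler and "my_backend" are illustrative only:

    #include <tvm/relay/function.h>
    #include <tvm/runtime/module.h>
    #include <tvm/runtime/registry.h>

    // Hypothetical BYOC entry point: receives the partitioned Relay function
    // and must hand back a runtime::Module that can execute it.
    tvm::runtime::Module MyBackendCompiler(const tvm::ObjectRef& ref) {
      auto func = tvm::Downcast<tvm::relay::Function>(ref);
      // ... serialize func into whatever artifact the backend runtime loads ...
      return tvm::runtime::Module();  // a real backend returns its module here
    }

    // Functions annotated with kCompiler = "my_backend" now resolve here.
    TVM_REGISTER_GLOBAL("relay.ext.my_backend").set_body_typed(MyBackendCompiler);

(LowerShapeFuncInternal continues below.)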
With target_scope(key->target); - CHECK(!value->cached_func.defined()); + ICHECK(!value->cached_func.defined()); auto spair = MakeShapeFunc().Create(key->source_func); auto cache_node = make_object(*(spair.second.operator->())); cache_node->func_name = GetUniqueName(cache_node->func_name); diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index 95166c74f891..55822917b6b7 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -154,7 +154,7 @@ class CCacheKey : public ObjectRef { const CCacheKeyNode* operator->() const { return static_cast(get()); } // comparator inline bool operator==(const CCacheKey& other) const { - CHECK(defined() && other.defined()); + ICHECK(defined() && other.defined()); return (*this)->Equal(other.operator->()); } using ContainerType = CCacheKeyNode; @@ -272,7 +272,7 @@ namespace std { template <> struct hash<::tvm::relay::CCacheKey> { size_t operator()(const ::tvm::relay::CCacheKey& key) const { - CHECK(key.defined()); + ICHECK(key.defined()); return key->Hash(); } }; diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc index 087c895f4614..a963242f82d5 100644 --- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc +++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc @@ -87,7 +87,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { } auto fn = cn->op.as(); auto comp = fn->GetAttr(attr::kComposite); - CHECK(comp.defined()) << "Arm Compute Library JSON runtime only supports composite functions."; + ICHECK(comp.defined()) << "Arm Compute Library JSON runtime only supports composite functions."; const std::string name = comp.value(); std::shared_ptr json_node; if (name == "arm_compute_lib.conv2d" || name == "arm_compute_lib.qnn_conv2d") { @@ -114,7 +114,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { static CompositeConvNode UnpackCompositeConvolution(const CallNode* cn) { CompositeConvNode nodes{}; const auto* fn = cn->op.as(); - CHECK(fn); + ICHECK(fn); // Traverse composite convolution function from child to parent const auto* current_call = fn->body.as(); @@ -132,9 +132,9 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { } // Enforce a convolution node exists at this point during traversal if (nodes.requantize) { - CHECK(backend::IsOp(current_call, "qnn.conv2d")); + ICHECK(backend::IsOp(current_call, "qnn.conv2d")); } else { - CHECK(backend::IsOp(current_call, "nn.conv2d")); + ICHECK(backend::IsOp(current_call, "nn.conv2d")); } nodes.conv = current_call; if (!current_call->args.empty() && current_call->args[0]->IsInstance()) { @@ -157,8 +157,8 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { std::string name = "nn.conv2d"; const auto* conv_attr = nodes.conv->attrs.as(); - CHECK(conv_attr); - CHECK(conv_attr->kernel_layout == "OHWI") + ICHECK(conv_attr); + ICHECK(conv_attr->kernel_layout == "OHWI") << "Kernel layout must be OHWI, has the module been pre-processed correctly?"; // Inputs must be added in the same order they appear in the relay graph. @@ -186,7 +186,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { // Override attributes if (nodes.pad) { const auto* pad_attr = nodes.pad->attrs.as(); - CHECK(pad_attr); + ICHECK(pad_attr); auto p = pad_attr->pad_width; // Convert to TVM layout for now, conversion to ACL layout takes place in runtime. 
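The std::hash specialization for CCacheKey above is the standard recipe for using a custom key in unordered containers: give the key an operator== and specialize std::hash for it inside namespace std. The same recipe on a toy key; MyKey is illustrative, and CCacheKey itself just forwards to its node's cached Hash():

    #include <cstddef>
    #include <functional>
    #include <string>
    #include <unordered_map>

    struct MyKey {
      std::string name;
      int version;
      bool operator==(const MyKey& o) const {
        return name == o.name && version == o.version;
      }
    };

    namespace std {
    template <>
    struct hash<MyKey> {
      size_t operator()(const MyKey& k) const {
        // Any reasonable mix of the field hashes works for a cache key.
        return hash<string>()(k.name) ^ (hash<int>()(k.version) << 1);
      }
    };
    }  // namespace std

    int main() {
      std::unordered_map<MyKey, int> cache;
      cache[{"conv2d", 1}] = 42;  // usable because == and hash are defined
      return 0;
    }

(The ACL pad-attribute handling continues below.)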
// Standard convolution pad layout for TVM: top, left, bottom, right. @@ -216,7 +216,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { static CompositeDenseNode UnpackCompositeDense(const CallNode* cn) { CompositeDenseNode nodes{}; const auto* fn = cn->op.as(); - CHECK(fn); + ICHECK(fn); // Traverse composite dense function from child to parent const auto* current_call = fn->body.as(); @@ -230,9 +230,9 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { } // Enforce a dense node exists at this point during traversal if (nodes.requantize) { - CHECK(backend::IsOp(current_call, "qnn.dense")); + ICHECK(backend::IsOp(current_call, "qnn.dense")); } else { - CHECK(backend::IsOp(current_call, "nn.dense")); + ICHECK(backend::IsOp(current_call, "nn.dense")); } nodes.dense = current_call; return nodes; @@ -282,13 +282,13 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { */ std::shared_ptr CreateCompositeAvgPool2DJSONNode(const CallNode* cn) { const auto* fn = cn->op.as(); - CHECK(fn); + ICHECK(fn); const auto* cast = fn->body.as(); - CHECK(cast); + ICHECK(cast); const auto* avg_pool = cast->args[0].as(); - CHECK(avg_pool); + ICHECK(avg_pool); const auto* avg_pool_op = avg_pool->op.as(); - CHECK(avg_pool_op); + ICHECK(avg_pool_op); const std::string name = avg_pool_op->name; std::vector inputs; @@ -310,16 +310,16 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { std::shared_ptr CreateCompositeL2Pool2DJSONNode(const CallNode* cn) { const std::string name = "nn.l2_pool2d"; const auto* fn = cn->op.as(); - CHECK(fn); + ICHECK(fn); const auto* sqrt = fn->body.as(); - CHECK(sqrt); + ICHECK(sqrt); const auto* avg_pool = sqrt->args[0].as(); - CHECK(avg_pool); + ICHECK(avg_pool); const auto* pow = avg_pool->args[0].as(); - CHECK(pow); + ICHECK(pow); const auto* exponent = pow->args[1].as(); - CHECK(exponent); - CHECK_EQ(*static_cast(exponent->data->data), 2) << "Exponent must be 2 for L2 pooling"; + ICHECK(exponent); + ICHECK_EQ(*static_cast(exponent->data->data), 2) << "Exponent must be 2 for L2 pooling"; std::vector inputs; inputs.push_back(VisitExpr(cn->args[0])[0]); @@ -363,7 +363,7 @@ TVM_REGISTER_GLOBAL("relay.ext.arm_compute_lib.optimize").set_body_typed(PreProc * \return A runtime module. 
*/ runtime::Module ACLCompiler(const ObjectRef& ref) { - CHECK(ref->IsInstance()) << "The input ref is expected to be a Relay function."; + ICHECK(ref->IsInstance()) << "The input ref is expected to be a Relay function."; Function func = Downcast(ref); std::string func_name = backend::GetExtSymbol(func); @@ -372,7 +372,7 @@ runtime::Module ACLCompiler(const ObjectRef& ref) { std::string graph_json = serializer.GetJSON(); auto param_names = serializer.GetParams(); const auto* pf = runtime::Registry::Get("runtime.arm_compute_lib_runtime_create"); - CHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; + ICHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; runtime::Module lib = (*pf)(func_name, graph_json, param_names); return lib; } diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc index c7b5a8da1fed..935ac16efb23 100644 --- a/src/relay/backend/contrib/codegen_c/codegen.cc +++ b/src/relay/backend/contrib/codegen_c/codegen.cc @@ -61,7 +61,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code std::vector outs; for (auto field : node->fields) { auto res = VisitExpr(field); - CHECK_EQ(res.size(), 1U) << "Do not support tuple nest"; + ICHECK_EQ(res.size(), 1U) << "Do not support tuple nest"; outs.push_back(res[0]); } return outs; @@ -69,7 +69,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code std::vector VisitExpr_(const TupleGetItemNode* op) final { auto res = VisitExpr(op->tuple); - CHECK_GT(res.size(), static_cast(op->index)); + ICHECK_GT(res.size(), static_cast(op->index)); // Only keep the item we want for the child node. // FIXME(@comaniac): The other items should still be requried for the primary outputs. @@ -84,7 +84,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code // Get const: static_cast(gcc_0_consts[0]->data) output.name = CreateDataReference(ext_func_id_, const_idx_); const auto* type_node = cn->checked_type().as(); - CHECK(type_node); + ICHECK(type_node); const auto& dtype = GetDtypeString(type_node); // Generate the global variable for needed ndarrays @@ -94,7 +94,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code ext_func_body_.insert(ext_func_body_.begin(), checker); } - CHECK(dtype == "float" || dtype == "int") << "Only float and int are supported for now."; + ICHECK(dtype == "float" || dtype == "int") << "Only float and int are supported for now."; output.dtype = dtype; std::string const_var_name = CreateConstVar(ext_func_id_, const_idx_); @@ -130,7 +130,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code } const auto* type_node = call->checked_type().as(); - CHECK(type_node); + ICHECK(type_node); const auto& dtype = GetDtypeString(type_node); macro_stream << ", " << dtype; @@ -216,7 +216,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code class CSourceCodegen : public CSourceModuleCodegenBase { public: std::pair> GenCFunc(const Function& func) { - CHECK(func.defined()) << "Input error: expect a Relay function."; + ICHECK(func.defined()) << "Input error: expect a Relay function."; // Record the external symbol for runtime lookup. 
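ACLCompiler above and the C-source codegen below finish the same way: fetch a module-creator from the global registry and ICHECK the pointer, because Registry::Get returns nullptr whenever that runtime was compiled out of the build. The idiom in isolation, with a hypothetical creator name standing in for entries like "runtime.arm_compute_lib_runtime_create":

    #include <string>
    #include <tvm/runtime/container.h>
    #include <tvm/runtime/registry.h>
    #include <tvm/support/logging.h>

    tvm::runtime::Module CreateBackendModule(
        const std::string& func_name, const std::string& graph_json,
        const tvm::Array<tvm::runtime::String>& params) {
      // A null result means the matching runtime was not built into libtvm.
      const auto* pf = tvm::runtime::Registry::Get("runtime.my_backend_runtime_create");
      ICHECK(pf != nullptr) << "my_backend runtime is not enabled in this build";
      return (*pf)(func_name, graph_json, params);
    }

(GenCFunc continues below.)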
auto sid = GetExtSymbol(func); @@ -260,7 +260,7 @@ class CSourceCodegen : public CSourceModuleCodegenBase { code_stream_ << operator_macro << "\n\n"; - CHECK(ref->IsInstance()); + ICHECK(ref->IsInstance()); auto res = GenCFunc(Downcast(ref)); std::string code = code_stream_.str(); @@ -269,7 +269,7 @@ class CSourceCodegen : public CSourceModuleCodegenBase { // Create a CSource module const auto* pf = runtime::Registry::Get("runtime.CSourceModuleCreate"); - CHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module"; + ICHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module"; return (*pf)(code, "c", sym, variables); } diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h index 0d395b7977b2..9448b4d0738d 100644 --- a/src/relay/backend/contrib/codegen_c/codegen_c.h +++ b/src/relay/backend/contrib/codegen_c/codegen_c.h @@ -85,7 +85,7 @@ class CodegenCBase { * \brief Exit a scope. */ void ExitScope() { - CHECK_GE(indent_, 2U) << "Wrong ident found."; + ICHECK_GE(indent_, 2U) << "Wrong ident found."; indent_ -= 2; } @@ -262,7 +262,7 @@ class CodegenCBase { */ std::string GetDtypeString(const Var& var) { auto ttype = var->checked_type().as(); - CHECK(ttype) << "Expect TensorTypeNode"; + ICHECK(ttype) << "Expect TensorTypeNode"; return GetDtypeString(ttype); } @@ -297,7 +297,7 @@ class CodegenCBase { */ std::string CreateInitChecker(const std::string& symbol) const { std::ostringstream oss; - oss << "CHECK(!" << symbol + oss << "ICHECK(!" << symbol << "_consts.empty()) << \"C source module hasn't been initialized.\";\n"; return oss.str(); } diff --git a/src/relay/backend/contrib/codegen_json/codegen_json.h b/src/relay/backend/contrib/codegen_json/codegen_json.h index 9ed15a88c72a..859ef8c9bdb2 100644 --- a/src/relay/backend/contrib/codegen_json/codegen_json.h +++ b/src/relay/backend/contrib/codegen_json/codegen_json.h @@ -197,8 +197,8 @@ class JSONSerializer : public MemoizedExprTranslator()) { for (size_t i = 0; i < tuple_type->fields.size(); ++i) { const auto* tensor_type = tuple_type->fields[i].as(); - CHECK(tensor_type) << "Expect TensorType, but received: ." - << tuple_type->fields[i]->GetTypeKey(); + ICHECK(tensor_type) << "Expect TensorType, but received: ." 
+ << tuple_type->fields[i]->GetTypeKey(); ret.push_back(JSONGraphNodeEntry(node_id, i)); shape.emplace_back(GetIntShape(tensor_type->shape)); dtype.emplace_back(DType2String(tensor_type->dtype)); @@ -206,7 +206,7 @@ class JSONSerializer : public MemoizedExprTranslatorSetNumOutput(tuple_type->fields.size()); } else { const auto* tensor_type = checked_type.as(); - CHECK(tensor_type) << "Expect TensorType, but received: " << checked_type->GetTypeKey(); + ICHECK(tensor_type) << "Expect TensorType, but received: " << checked_type->GetTypeKey(); shape.emplace_back(GetIntShape(tensor_type->shape)); dtype.emplace_back(DType2String(tensor_type->dtype)); ret.push_back(JSONGraphNodeEntry(node_id, 0)); @@ -228,7 +228,7 @@ class JSONSerializer : public MemoizedExprTranslator(call_attr)); } else if (const auto* fn = cn->op.as()) { auto pattern = fn->GetAttr(attr::kPartitionedFromPattern); - CHECK(pattern.defined()); + ICHECK(pattern.defined()); std::vector values; values.push_back(pattern.value()); std::vector attr; @@ -243,7 +243,7 @@ class JSONSerializer : public MemoizedExprTranslator VisitExpr_(const VarNode* vn) { - CHECK(memo_.count(GetRef(vn))); + ICHECK(memo_.count(GetRef(vn))); return memo_[GetRef(vn)]; } @@ -270,7 +270,7 @@ class JSONSerializer : public MemoizedExprTranslatorname; } else if (const auto* fn = cn->op.as()) { auto comp = fn->GetAttr(attr::kComposite); - CHECK(comp.defined()) << "JSON runtime only supports composite functions."; + ICHECK(comp.defined()) << "JSON runtime only supports composite functions."; name = comp.value(); } else { LOG(FATAL) << "JSON runtime does not support calls to " << cn->op->GetTypeKey(); @@ -289,7 +289,7 @@ class JSONSerializer : public MemoizedExprTranslator VisitExpr_(const LetNode* ln) { - CHECK_EQ(memo_.count(ln->var), 0); + ICHECK_EQ(memo_.count(ln->var), 0); memo_[ln->var] = VisitExpr(ln->value); return VisitExpr(ln->body); } @@ -300,7 +300,7 @@ class JSONSerializer : public MemoizedExprTranslator VisitExpr_(const FunctionNode* fn) { - CHECK(fn->GetAttr(attr::kComposite).defined()) + ICHECK(fn->GetAttr(attr::kComposite).defined()) << "JSON runtime only supports composite functions"; // FunctionNode should be handled by the caller. return {}; diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index bec9af0cf83f..bfc5c77d116b 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -57,7 +57,7 @@ inline size_t GetShape1DSize(const Type& type) { std::vector Conv2d(const CallNode* call) { std::vector args; const auto* conv2d_attr = call->attrs.as(); - CHECK(conv2d_attr); + ICHECK(conv2d_attr); auto ishape = GetShape(call->args[0]->checked_type()); auto wshape = GetShape(call->args[1]->checked_type()); @@ -155,7 +155,7 @@ class CodegenDNNL : public MemoizedExprTranslator>, public C std::vector outs; for (auto field : node->fields) { auto res = VisitExpr(field); - CHECK_EQ(res.size(), 1U) << "Do not support tuple nest"; + ICHECK_EQ(res.size(), 1U) << "Do not support tuple nest"; outs.push_back(res[0]); } return outs; @@ -163,7 +163,7 @@ class CodegenDNNL : public MemoizedExprTranslator>, public C std::vector VisitExpr_(const TupleGetItemNode* op) final { auto res = VisitExpr(op->tuple); - CHECK_GT(res.size(), static_cast(op->index)); + ICHECK_GT(res.size(), static_cast(op->index)); // Only keep the item we want for the child node. // FIXME(@comaniac): The other items should still be requried for the primary outputs. 
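A guard repeated across every serializer in this patch: call targets must be composite functions stamped with attr::kComposite by the pattern partitioner, and anything else is rejected outright. The guard in isolation, assuming the function already comes from a partitioned module:

    #include <string>
    #include <tvm/relay/function.h>
    #include <tvm/support/logging.h>

    // Returns the composite pattern name (e.g. "dnnl.conv2d_bias_relu"),
    // failing loudly in the same way as the ICHECKs above.
    std::string CompositeName(const tvm::relay::Function& fn) {
      auto comp = fn->GetAttr<tvm::runtime::String>(tvm::relay::attr::kComposite);
      ICHECK(comp.defined()) << "JSON runtime only supports composite functions";
      return comp.value();
    }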
@@ -190,8 +190,8 @@ class CodegenDNNL : public MemoizedExprTranslator>, public C const_idx_++; const auto* type_node = cn->checked_type().as(); - CHECK(type_node); - CHECK_EQ(GetDtypeString(type_node), "float") << "Only float is supported for now."; + ICHECK(type_node); + ICHECK_EQ(GetDtypeString(type_node), "float") << "Only float is supported for now."; return {output}; } @@ -233,7 +233,7 @@ class CodegenDNNL : public MemoizedExprTranslator>, public C GenerateBodyOutput GenerateOpCall(const CallNode* call) { const auto* op_node = call->op.as(); - CHECK(op_node) << "Expect OpNode, but got " << call->op->GetTypeKey(); + ICHECK(op_node) << "Expect OpNode, but got " << call->op->GetTypeKey(); using ArgFunType = std::function(const CallNode*)>; static const std::map> op_map = { @@ -257,7 +257,7 @@ class CodegenDNNL : public MemoizedExprTranslator>, public C GenerateBodyOutput GenerateCompositeFunctionCall(const FunctionNode* callee, const CallNode* caller) { const auto pattern_name = callee->GetAttr(attr::kComposite); - CHECK(pattern_name.defined()) << "Only functions with composite attribute supported"; + ICHECK(pattern_name.defined()) << "Only functions with composite attribute supported"; if (pattern_name == "dnnl.conv2d_bias_relu") { const auto* conv_call = @@ -283,7 +283,7 @@ class CodegenDNNL : public MemoizedExprTranslator>, public C const std::vector& func_args, const std::vector& attribute_args) { // Make function call with input buffers when visiting arguments - CHECK_GT(func_args.size(), 0); + ICHECK_GT(func_args.size(), 0); std::ostringstream decl_stream; decl_stream << "(" << func_args[0]; for (size_t i = 1; i < func_args.size(); ++i) { @@ -295,11 +295,11 @@ class CodegenDNNL : public MemoizedExprTranslator>, public C if (root_call->checked_type()->IsInstance()) { auto type_node = root_call->checked_type().as(); for (auto field : type_node->fields) { - CHECK(field->IsInstance()); + ICHECK(field->IsInstance()); out_types.push_back(field); } } else if (root_call->checked_type()->IsInstance()) { - CHECK(root_call->checked_type()->IsInstance()); + ICHECK(root_call->checked_type()->IsInstance()); out_types.push_back(root_call->checked_type()); } else { LOG(FATAL) << "Unrecognized type node: " << AsText(root_call->checked_type(), false); @@ -363,7 +363,7 @@ class DNNLModuleCodegen : public CSourceModuleCodegenBase { public: // Create a corresponding DNNL function for the given relay Function. std::pair> GenDNNLFunc(const Function& func) { - CHECK(func.defined()) << "Input error: expect a Relay function."; + ICHECK(func.defined()) << "Input error: expect a Relay function."; // Record the external symbol for runtime lookup. 
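Both the composite handling above and the DNNL JSON serializer below recover the root operator of a fused pattern by walking a fixed number of steps down the composite body (nn.relu wraps add wraps nn.conv2d for "dnnl.conv2d_bias_relu"). A simplified sketch of that walk; the real GetRootCall also verifies the expected op name at every level:

    #include <tvm/relay/expr.h>
    #include <tvm/support/logging.h>

    // Follow argument 0 depth times from the outermost call, e.g. two steps
    // from nn.relu through add down to the nn.conv2d root.
    const tvm::relay::CallNode* RootCall(const tvm::relay::CallNode* current, int depth) {
      for (int i = 0; i < depth; ++i) {
        ICHECK(!current->args.empty());
        current = current->args[0].as<tvm::relay::CallNode>();
        ICHECK(current != nullptr) << "composite body is shallower than expected";
      }
      return current;
    }

(GenDNNLFunc continues below.)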
auto sid = GetExtSymbol(func); @@ -404,7 +404,7 @@ class DNNLModuleCodegen : public CSourceModuleCodegenBase { code_stream_ << "using namespace tvm::runtime::contrib;\n"; code_stream_ << "\n"; - CHECK(ref->IsInstance()); + ICHECK(ref->IsInstance()); auto res = GenDNNLFunc(Downcast(ref)); std::string code = code_stream_.str(); String sym = std::get<0>(res); @@ -412,7 +412,7 @@ class DNNLModuleCodegen : public CSourceModuleCodegenBase { // Create a CSource module const auto* pf = runtime::Registry::Get("runtime.CSourceModuleCreate"); - CHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module"; + ICHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module"; return (*pf)(code, "c", sym, variables); } @@ -441,14 +441,14 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { name = op_node->name; } else if (const auto* fn = cn->op.as()) { auto comp = fn->GetAttr(attr::kComposite); - CHECK(comp.defined()) << "DNNL JSON runtime only supports composite functions."; + ICHECK(comp.defined()) << "DNNL JSON runtime only supports composite functions."; name = comp.value(); if (name == "dnnl.conv2d_bias_relu") { call = GetRootCall(fn->body.as(), 2, {"nn.conv2d", "add", "nn.relu"}); } else if (name == "dnnl.conv2d_relu") { call = GetRootCall(fn->body.as(), 1, {"nn.conv2d", "nn.relu"}); - CHECK(call->op.as()) << "Not op node"; + ICHECK(call->op.as()) << "Not op node"; } else { LOG(FATAL) << "Unrecognized DNNL pattern: " << name; } @@ -476,7 +476,7 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { */ runtime::Module DNNLCompiler(const ObjectRef& ref) { #ifdef USE_JSON_RUNTIME - CHECK(ref->IsInstance()); + ICHECK(ref->IsInstance()); auto func = Downcast(ref); auto func_name = GetExtSymbol(func); DNNLJSONSerializer serializer(func_name, func); @@ -485,7 +485,7 @@ runtime::Module DNNLCompiler(const ObjectRef& ref) { auto params = serializer.GetParams(); const auto* pf = runtime::Registry::Get("runtime.DNNLJSONRuntimeCreate"); - CHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; + ICHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; auto mod = (*pf)(func_name, graph_json, params); return mod; #else diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index dd92c6bfe723..3097a300a0d9 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -43,7 +43,7 @@ sl::TensorInfo GetTensorInfo(std::map> tensor_ bool IsEthosnOp(const Call& call, const std::string& op_name) { if (call->op->IsInstance()) { Op op = Downcast(call->op); - CHECK(op.defined()); + ICHECK(op.defined()); return op == Op::Get(op_name); } else { return false; @@ -53,7 +53,7 @@ bool IsEthosnOp(const Call& call, const std::string& op_name) { bool IsEthosnFunc(const Call& call, const std::string& op_name) { if (call->op->IsInstance()) { Function func = Downcast(call->op); - CHECK(func.defined()); + ICHECK(func.defined()); auto name_node = func->GetAttr(attr::kComposite); return name_node.value() == op_name; } @@ -62,7 +62,7 @@ bool IsEthosnFunc(const Call& call, const std::string& op_name) { std::map> InferTensorsVisitor::Infer(const Expr& expr) { tensor_table_.clear(); - CHECK(expr->checked_type().defined()); + ICHECK(expr->checked_type().defined()); size_t output_size = 1; if (auto tuple = expr->checked_type().as()) { output_size = tuple->fields.size(); @@ -162,7 +162,7 @@ void InferTensorsVisitor::VisitExpr_(const 
CallNode* cn) { void InferTensorsVisitor::VisitExpr_(const TupleNode* tn) { auto tuple = GetRef(tn); - CHECK(tensor_table_.find(tuple) != tensor_table_.end()); + ICHECK(tensor_table_.find(tuple) != tensor_table_.end()); for (size_t i = 0; i < tn->fields.size(); i++) { tensor_table_[tn->fields[i]] = {tensor_table_[tuple][i]}; } @@ -176,7 +176,7 @@ void InferTensorsVisitor::VisitExpr_(const TupleGetItemNode* tgn) { // Don't assume it must be targeting a TupleNode // Vars and calls can still have TupleType auto tg = GetRef(tgn); - CHECK(tensor_table_.find(tg) != tensor_table_.end()); + ICHECK(tensor_table_.find(tg) != tensor_table_.end()); auto tuple = tg->tuple; auto type = tuple->checked_type().as(); int index = tg->index; @@ -517,7 +517,7 @@ runtime::Module EthosnCompiler::CreateRuntimeModule(const ObjectRef& ref) { IRModule mod; Function func = Downcast(ref); auto name_node = func->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(name_node.defined()) << "Failed to retrieved external symbol."; + ICHECK(name_node.defined()) << "Failed to retrieved external symbol."; GlobalVar gvar = GlobalVar(name_node.value()); mod->Add(gvar, func); Function mod_func = Downcast(mod->functions.at(gvar)); @@ -539,7 +539,7 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const // Finally compile the network std::vector> compiled_networks = sl::Compile(*network_with_ids.network, options); - CHECK_GE(compiled_networks.size(), 1) << "Ethos-N compiler failed to compile network"; + ICHECK_GE(compiled_networks.size(), 1) << "Ethos-N compiler failed to compile network"; auto compiled_network = std::move(compiled_networks[0]); // Determine the order that the inputs/outputs are in and how that corresponds to the // order that the TVM runtime will expect them in diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index f692da3f31ac..26f674dcd7b5 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -109,7 +109,7 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { void SetPadNodeAttribute(std::shared_ptr node, const CallNode* cn) { const auto* pad_attr = cn->attrs.as(); - CHECK(pad_attr); + ICHECK(pad_attr); auto p = pad_attr->pad_width; const int dim_h = (p.size() == 5) ? 3 : 2; const int dim_w = (p.size() == 5) ? 4 : 3; @@ -124,7 +124,7 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { void SetStridedSliceNodeAttribute(std::shared_ptr node, const CallNode* cn) { const auto* attrs = cn->attrs.as(); - CHECK(attrs && attrs->begin && attrs->end && attrs->strides) + ICHECK(attrs && attrs->begin && attrs->end && attrs->strides) << "StridedSlice must have static begin, end, and strides."; const bool default_strides = !attrs->strides.value().defined() || attrs->strides.value().size() == 0; @@ -145,10 +145,10 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { !attrs->strides.value()[i].defined()) ? 
1 : attrs->strides.value()[i].as()->value; - CHECK_GT(stride_value, 0); + ICHECK_GT(stride_value, 0); const int size_value = (end_value - begin_value + stride_value - 1) / stride_value; - CHECK_GE(begin_value, 0); - CHECK_GT(size_value, 0); + ICHECK_GE(begin_value, 0); + ICHECK_GT(size_value, 0); start.push_back(std::to_string(begin_value)); size.push_back(std::to_string(size_value)); strides.push_back(std::to_string(stride_value)); @@ -168,7 +168,7 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { if (!cfg.defined()) { cfg = AttrsWithDefaultValues(); } - CHECK_EQ(cfg.value()->tensorrt_version.size(), 3); + ICHECK_EQ(cfg.value()->tensorrt_version.size(), 3); std::vector tensorrt_version = {std::to_string(cfg.value()->tensorrt_version[0]), std::to_string(cfg.value()->tensorrt_version[1]), std::to_string(cfg.value()->tensorrt_version[2])}; @@ -190,7 +190,7 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { * \return A runtime module. */ runtime::Module TensorRTCompiler(const ObjectRef& ref) { - CHECK(ref->IsInstance()) << "The input ref is expected to be a Relay function."; + ICHECK(ref->IsInstance()) << "The input ref is expected to be a Relay function."; Function func = Downcast(ref); std::string func_name = backend::GetExtSymbol(func); @@ -199,7 +199,7 @@ runtime::Module TensorRTCompiler(const ObjectRef& ref) { std::string graph_json = serializer.GetJSON(); auto param_names = serializer.GetParams(); const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); - CHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; + ICHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; runtime::Module lib = (*pf)(func_name, graph_json, param_names); return lib; } diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 2b08f45b2582..bf58c8d5be41 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -83,7 +83,7 @@ class StorageAllocaBaseVisitor : public ExprVisitor { std::vector fields; for (Expr field : op->fields) { auto tok = GetToken(field); - CHECK_EQ(tok.size(), 1U); + ICHECK_EQ(tok.size(), 1U); fields.push_back(tok[0]); } token_map_[op] = fields; @@ -91,7 +91,7 @@ class StorageAllocaBaseVisitor : public ExprVisitor { void VisitExpr_(const TupleGetItemNode* op) final { const auto& tok = GetToken(op->tuple); - CHECK_LT(static_cast(op->index), tok.size()); + ICHECK_LT(static_cast(op->index), tok.size()); token_map_[op] = {tok[op->index]}; } @@ -115,7 +115,7 @@ class StorageAllocaBaseVisitor : public ExprVisitor { const std::vector& GetToken(const Expr& expr) { this->VisitExpr(expr); auto it = token_map_.find(expr.operator->()); - CHECK(it != token_map_.end()); + ICHECK(it != token_map_.end()); return it->second; } /*! @@ -142,14 +142,14 @@ class StorageAllocaInit : protected StorageAllocaBaseVisitor { using StorageAllocaBaseVisitor::VisitExpr_; void CreateToken(const ExprNode* op, bool can_realloc) final { - CHECK(!token_map_.count(op)); + ICHECK(!token_map_.count(op)); std::vector tokens; int device_type = node_device_map_.count(GetRef(op)) ? 
node_device_map_[GetRef(op)]->value : 0; if (const auto* tuple_type = op->checked_type().as()) { for (Type t : tuple_type->fields) { const auto* ttype = t.as(); - CHECK(ttype); + ICHECK(ttype); StorageToken* token = arena_->make(); token->ttype = ttype; token->device_type = device_type; @@ -157,7 +157,7 @@ class StorageAllocaInit : protected StorageAllocaBaseVisitor { } } else { const auto* ttype = op->checked_type().as(); - CHECK(ttype); + ICHECK(ttype); StorageToken* token = arena_->make(); token->ttype = ttype; token->device_type = device_type; @@ -233,9 +233,9 @@ class StorageAllocator : public StorageAllocaBaseVisitor { using StorageAllocaBaseVisitor::VisitExpr_; // override create token by getting token as prototype requirements. void CreateToken(const ExprNode* op, bool can_realloc) final { - CHECK(!token_map_.count(op)); + ICHECK(!token_map_.count(op)); auto it = prototype_.find(op); - CHECK(it != prototype_.end()); + ICHECK(it != prototype_.end()); std::vector tokens; for (StorageToken* tok : it->second) { if (can_realloc) { @@ -286,12 +286,12 @@ class StorageAllocator : public StorageAllocaBaseVisitor { */ size_t GetMemorySize(StorageToken* prototype) { const TensorTypeNode* ttype = prototype->ttype; - CHECK(ttype != nullptr); + ICHECK(ttype != nullptr); size_t size = 1; for (IndexExpr dim : ttype->shape) { const int64_t* pval = tir::as_const_int(dim); - CHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; - CHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; + ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; + ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; size *= static_cast(pval[0]); } size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); @@ -316,7 +316,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { for (auto it = mid; it != end; ++it) { StorageToken* tok = it->second; if (tok->device_type != prototype->device_type) continue; - CHECK_EQ(tok->ref_counter, 0); + ICHECK_EQ(tok->ref_counter, 0); // Use exect matching strategy tok->max_bytes = std::max(size, tok->max_bytes); tok->ref_counter = prototype->ref_counter; @@ -329,7 +329,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { --it; StorageToken* tok = it->second; if (tok->device_type != prototype->device_type) continue; - CHECK_EQ(tok->ref_counter, 0); + ICHECK_EQ(tok->ref_counter, 0); // Use exect matching strategy tok->max_bytes = std::max(size, tok->max_bytes); tok->ref_counter = prototype->ref_counter; @@ -356,8 +356,8 @@ class StorageAllocator : public StorageAllocaBaseVisitor { * \param tok The token to be released. 
  */
  void CheckForRelease(StorageToken* tok) {
-   CHECK_GE(tok->storage_id, 0);
-   CHECK_GE(tok->ref_counter, 0);
+   ICHECK_GE(tok->storage_id, 0);
+   ICHECK_GE(tok->ref_counter, 0);
    if (tok->ref_counter == 0) {
      free_.insert({tok->max_bytes, tok});
    }
diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc
index acc99c51b69b..7b71e34b777b 100644
--- a/src/relay/backend/graph_runtime_codegen.cc
+++ b/src/relay/backend/graph_runtime_codegen.cc
@@ -243,9 +243,9 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>> {
   std::vector<GraphNodeRef> AddNode(GraphObjectPtr node, Expr expr) {
     auto checked_type = expr->checked_type();
     size_t count = storage_device_map_.count(expr);
-    CHECK_GT(count, 0) << "Expr is not existing in storage plan";
+    ICHECK_GT(count, 0) << "Expr does not exist in the storage plan";
     auto storage_device_info = storage_device_map_[expr];
-    CHECK_EQ(storage_device_info.size(), 2);
+    ICHECK_EQ(storage_device_info.size(), 2);
     // storage
     std::vector<int64_t> storage_info;
     for (auto& v : storage_device_info[0]) {
@@ -282,7 +282,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>> {
         LOG(FATAL) << "type " << checked_type->GetTypeKey() << " not supported";
       }
     }
-    CHECK_EQ(node->Type(), kGraphOpNode);
+    ICHECK_EQ(node->Type(), kGraphOpNode);
     auto op_nd = std::dynamic_pointer_cast<GraphOpNode>(node);
     op_nd->attrs_["shape"] = shape;
     op_nd->attrs_["dtype"] = dtype;
@@ -367,7 +367,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>> {
       return GraphAddCallNode(op, ext_func->func_name, ext_func->func_name);
     }
-    CHECK_GE(storage_device_map_.count(expr), 0);
+    ICHECK_GE(storage_device_map_.count(expr), 0);
     auto& device_type = storage_device_map_[expr][1];
     auto call_dev_type = device_type[0]->value;
     // Normal Relay Function
@@ -410,7 +410,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>> {
   std::vector<GraphNodeRef> VisitExpr_(const LetNode* op) override {
-    CHECK_EQ(var_map_.count(op->var.get()), 0);
+    ICHECK_EQ(var_map_.count(op->var.get()), 0);
     var_map_[op->var.get()] = VisitExpr(op->value);
     return VisitExpr(op->body);
   }
@@ -431,7 +431,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>> {
   std::vector<GraphNodeRef> VisitExpr_(const FunctionNode* op) override {
-    CHECK(op->GetAttr<String>(attr::kCompiler).defined())
+    ICHECK(op->GetAttr<String>(attr::kCompiler).defined())
         << "Only functions supported by custom codegen";
     return {};
   }
@@ -479,7 +479,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>> {
       const auto& shape_vec = dmlc::get<ShapeVector>(node->attrs_["shape"]);
       const auto& storage_id = dmlc::get<std::vector<int64_t>>(node->attrs_["storage_id"]);
       const auto& dtype_vec = dmlc::get<std::vector<std::string>>(node->attrs_["dtype"]);
-      CHECK_EQ(node->num_outputs_, shape_vec.size());
+      ICHECK_EQ(node->num_outputs_, shape_vec.size());
       num_entry += node->num_outputs_;
       shapes.insert(shapes.end(), shape_vec.begin(), shape_vec.end());
@@ -556,14 +556,14 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode {
   virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) {
     if (name == "init") {
       return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-        CHECK_EQ(args.num_args, 2) << "The expected of arguments are: "
-                                   << "runtime::Module mod and Map<int, Target> targets";
+        ICHECK_EQ(args.num_args, 2) << "The expected arguments are: "
+                                    << "runtime::Module mod and Map<int, Target> targets";
        void* mod = args[0];
        Map<Integer, tvm::Target> tmp = args[1];
        TargetsMap targets;
        for (const auto& it : tmp) {
          auto dev_type = it.first.as<tir::IntImmNode>();
-         CHECK(dev_type);
+         ICHECK(dev_type);
          targets[dev_type->value] = it.second;
        }
        codegen_ =
@@ -588,7 +588,7 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode {
    } else if (name == "get_param_by_name") {
      return PackedFunc([sptr_to_self, this](TVMArgs args,
TVMRetValue* rv) { String key = args[0]; - CHECK_GT(this->output_.params.count(key), 0); + ICHECK_GT(this->output_.params.count(key), 0); *rv = this->output_.params[key]; }); } else if (name == "get_irmodule") { diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index e58c23b76670..993fb1a62787 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -54,7 +54,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) inline const PackedFunc& GetPackedFunc(const std::string& name) { const PackedFunc* pf = tvm::runtime::Registry::Get(name); - CHECK(pf != nullptr) << "Cannot find function " << name << " in registry"; + ICHECK(pf != nullptr) << "Cannot find function " << name << " in registry"; return *pf; } @@ -347,12 +347,12 @@ class Interpreter : public ExprFunctor, } } } - CHECK_EQ(arg_counter, cfunc->inputs.size()) << "Shape function input sizes mismatch"; + ICHECK_EQ(arg_counter, cfunc->inputs.size()) << "Shape function input sizes mismatch"; auto fset_shape_output = [&](size_t i, Type val_type) { // TODO(@icemelon): allow recursive tuple const TensorTypeNode* rtype = val_type.as(); - CHECK(rtype != nullptr); + ICHECK(rtype != nullptr); int64_t ndim = rtype->shape.size(); auto arr = NDArray::Empty({ndim}, DataType::Int(64), cpu_ctx); outputs[i] = arr; @@ -371,7 +371,7 @@ class Interpreter : public ExprFunctor, auto tt = Downcast(ret_type); fset_shape_output(0, tt); } - CHECK_EQ(cfunc->outputs.size(), out_cnt) << "Shape function output sizes mismatch"; + ICHECK_EQ(cfunc->outputs.size(), out_cnt) << "Shape function output sizes mismatch"; PackedFunc shape_func; Module m; @@ -428,7 +428,7 @@ class Interpreter : public ExprFunctor, if (const auto* tuple_type = func->body->checked_type().as()) { arg_len += tuple_type->fields.size(); } else { - CHECK(func->body->checked_type().as()) << func->body->checked_type(); + ICHECK(func->body->checked_type().as()) << func->body->checked_type(); arg_len += 1; } std::vector values(arg_len); @@ -439,7 +439,7 @@ class Interpreter : public ExprFunctor, const auto nd_array = Downcast(val); setter(i, nd_array); DLContext arg_ctx = nd_array->ctx; - CHECK(arg_ctx.device_type == context_.device_type && arg_ctx.device_id == context_.device_id) + ICHECK(arg_ctx.device_type == context_.device_type && arg_ctx.device_id == context_.device_id) << "Interpreter expect context to be " << context_ << ", but get " << arg_ctx; }; @@ -461,12 +461,12 @@ class Interpreter : public ExprFunctor, // return type. auto fset_output = [&](size_t i, Type val_type) { const TensorTypeNode* rtype = val_type.as(); - CHECK(rtype != nullptr); + ICHECK(rtype != nullptr); // Allocate output tensor. 
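Background for the mechanical CHECK -> ICHECK swap running through all of these hunks: ICHECK is TVM's internal-invariant assertion. It behaves like CHECK, but a failure is reported as an internal error with a request to file a bug, since it can only fire when the compiler itself (not user input) is wrong. A rough, self-contained sketch of the idea, assuming nothing from TVM (the real macros live in TVM's logging header):

#include <cstdlib>
#include <iostream>
#include <sstream>

// Accumulates streamed message parts; reports and aborts when the temporary
// dies at the end of the full expression, i.e. after all `<<` pieces ran.
struct FatalStream {
  std::ostringstream os;
  template <typename T>
  FatalStream& operator<<(const T& v) {
    os << v;
    return *this;
  }
  ~FatalStream() {
    std::cerr << os.str() << "\nAn internal invariant was violated; please file a bug report.\n";
    std::abort();
  }
};

// Supports the `ICHECK(cond) << "extra context";` syntax used in these hunks.
#define ICHECK_SKETCH(cond)                        \
  if (!(cond))                                     \
  FatalStream() << __FILE__ << ":" << __LINE__     \
                << " InternalError: Check failed: (" #cond ") is false: "

Usage mirrors the patched code, e.g. `ICHECK_SKETCH(pad_attr) << "must be the pad attrs";`.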
          std::vector<int64_t> shape;
          for (auto dim : rtype->shape) {
            const auto* ivalue = tir::as_const_int(dim);
-           CHECK(ivalue) << "expected concrete dimensions";
+           ICHECK(ivalue) << "expected concrete dimensions";
            shape.push_back(ivalue[0]);
          }
          DLDataType dtype = rtype->dtype;
@@ -480,14 +480,14 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
    bool is_dyn = IsDynamic(ret_type);
    if (is_dyn) {
-     CHECK(func->HasNonzeroAttr(attr::kPrimitive));
+     ICHECK(func->HasNonzeroAttr(attr::kPrimitive));
      out_shapes = ComputeDynamicShape(func, args);
    }
    PackedFunc packed_func = engine_->JIT(CCacheKey(func, target_));
    TVMRetValue rv;
    if (const TupleTypeNode* rtype = func->body->checked_type().as<TupleTypeNode>()) {
-     CHECK(!is_dyn || out_shapes.size() == rtype->fields.size());
+     ICHECK(!is_dyn || out_shapes.size() == rtype->fields.size());
      std::vector<ObjectRef> fields;
      for (size_t i = 0; i < rtype->fields.size(); ++i) {
        if (is_dyn) {
@@ -503,7 +503,7 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
    } else {
      ObjectRef out_tensor;
      if (is_dyn) {
-       CHECK_EQ(out_shapes.size(), 1);
+       ICHECK_EQ(out_shapes.size(), 1);
        auto sh = out_shapes[0];
        auto tt = Downcast<TensorType>(ret_type);
        out_tensor = fset_output(0, TensorType(sh, tt->dtype));
@@ -526,16 +526,16 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
    // Allocate a frame with the parameters and free variables.
    tvm::Map<Var, ObjectRef> locals;
-   CHECK_EQ(func->params.size(), args.size());
+   ICHECK_EQ(func->params.size(), args.size());
    for (size_t i = 0; i < func->params.size(); i++) {
-     CHECK_EQ(locals.count(func->params[i]), 0);
+     ICHECK_EQ(locals.count(func->params[i]), 0);
      locals.Set(func->params[i], args[i]);
    }
    // Add the var to value mappings from the Closure's environment.
    for (auto it = closure->env.begin(); it != closure->env.end(); ++it) {
-     CHECK_EQ(locals.count((*it).first), 0);
+     ICHECK_EQ(locals.count((*it).first), 0);
      locals.Set((*it).first, (*it).second);
    }
@@ -593,9 +593,9 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
  ObjectRef VisitExpr_(const TupleGetItemNode* op) final {
    ObjectRef val = Eval(op->tuple);
    const auto* adt_obj = val.as<ADTObj>();
-   CHECK(adt_obj) << "interal error: when evaluating TupleGetItem expected an ADT value";
+   ICHECK(adt_obj) << "internal error: when evaluating TupleGetItem expected an ADT value";
    auto adt = GetRef<ADT>(adt_obj);
-   CHECK_LT(static_cast<size_t>(op->index), adt.size()) << "internal error: index out of bounds";
+   ICHECK_LT(static_cast<size_t>(op->index), adt.size()) << "internal error: index out of bounds";
    return adt[op->index];
  }
@@ -607,7 +607,7 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
    cpu_ctx.device_type = kDLCPU;
    cpu_ctx.device_id = 0;
    NDArray cpu_array = nd_array.CopyTo(cpu_ctx);
-   CHECK_EQ(DataType(cpu_array->dtype), DataType::Bool());
+   ICHECK_EQ(DataType(cpu_array->dtype), DataType::Bool());
    // TODO(@jroesch, @MK): Refactor code into helper from DCE.
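The interpreter hunk above pins down how an If condition is evaluated: the value must be a Bool scalar, copied to CPU, before its single byte is read. A minimal sketch of that read path, assuming the TVM 0.7-era runtime API used in the surrounding code:

#include <tvm/runtime/ndarray.h>

// Copies a (possibly device-resident) scalar tensor to CPU and reads it as a
// bool; the dtype check is the invariant the ICHECK_EQ above enforces.
bool ReadBoolScalar(const tvm::runtime::NDArray& nd_array) {
  DLContext cpu_ctx;
  cpu_ctx.device_type = kDLCPU;
  cpu_ctx.device_id = 0;
  tvm::runtime::NDArray cpu_array = nd_array.CopyTo(cpu_ctx);
  ICHECK_EQ(tvm::DataType(cpu_array->dtype), tvm::DataType::Bool());
  return reinterpret_cast<uint8_t*>(cpu_array->data)[0] != 0;
}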
if (reinterpret_cast(cpu_array->data)[0]) { return Eval(op->true_branch); @@ -656,11 +656,11 @@ class Interpreter : public ExprFunctor, bool VisitPattern_(const PatternConstructorNode* op, const ObjectRef& v) final { const ConstructorValueObj* cvn = v.as(); - CHECK(cvn) << "need to be a constructor for match"; - CHECK_NE(op->constructor->tag, -1); - CHECK_NE(cvn->tag, -1); + ICHECK(cvn) << "need to be a constructor for match"; + ICHECK_NE(op->constructor->tag, -1); + ICHECK_NE(cvn->tag, -1); if (op->constructor->tag == cvn->tag) { - CHECK_EQ(op->patterns.size(), cvn->fields.size()); + ICHECK_EQ(op->patterns.size(), cvn->fields.size()); for (size_t i = 0; i < op->patterns.size(); ++i) { if (!VisitPattern(op->patterns[i], cvn->fields[i])) { return false; @@ -673,7 +673,7 @@ class Interpreter : public ExprFunctor, bool VisitPattern_(const PatternTupleNode* op, const ObjectRef& v) final { auto adt = Downcast(v); - CHECK_EQ(op->patterns.size(), adt.size()); + ICHECK_EQ(op->patterns.size(), adt.size()); for (size_t i = 0; i < op->patterns.size(); ++i) { if (!VisitPattern(op->patterns[i], adt[i])) { return false; @@ -730,7 +730,7 @@ TypedPackedFunc CreateInterpreter(IRModule mod, DLContext conte auto intrp = std::make_shared(mod, context, target); auto packed = [intrp](Expr expr) { auto f = DetectFeature(expr); - CHECK(f.is_subset_of(FeatureSet::All() - fGraph)); + ICHECK(f.is_subset_of(FeatureSet::All() - fGraph)); return intrp->Eval(expr); }; return TypedPackedFunc(packed); diff --git a/src/relay/backend/param_dict.cc b/src/relay/backend/param_dict.cc index ef4b6589bdba..1d7e08abcdde 100644 --- a/src/relay/backend/param_dict.cc +++ b/src/relay/backend/param_dict.cc @@ -37,7 +37,7 @@ namespace relay { using namespace runtime; TVM_REGISTER_GLOBAL("tvm.relay._save_param_dict").set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.size() % 2, 0u); + ICHECK_EQ(args.size() % 2, 0u); // `args` is in the form "key, value, key, value, ..." 
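The `_save_param_dict` / `_load_param_dict` hunks here pin the on-disk layout: a magic word, a reserved field, the parameter names, then a count followed by the tensor payloads -- exactly what the load-side ICHECKs below verify. A writer-side sketch under those assumptions (kTVMNDArrayListMagic and NDArray::Save are the real TVM symbols; SaveParamsSketch itself is illustrative):

#include <tvm/runtime/ndarray.h>
#include <dmlc/io.h>
#include <string>
#include <vector>

// Serializes params in the layout the loader checks:
// header | reserved | names | count | arrays.
void SaveParamsSketch(dmlc::Stream* strm, const std::vector<std::string>& names,
                      const std::vector<tvm::runtime::NDArray>& arrays) {
  uint64_t header = kTVMNDArrayListMagic;
  uint64_t reserved = 0;
  strm->Write(header);
  strm->Write(reserved);
  strm->Write(names);
  uint64_t sz = static_cast<uint64_t>(arrays.size());
  strm->Write(sz);
  for (const auto& nd : arrays) {
    nd.Save(strm);  // per-tensor payload, matching the loader's NDArray reads
  }
}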
size_t num_params = args.size() / 2; std::vector names; @@ -74,14 +74,14 @@ TVM_REGISTER_GLOBAL("tvm.relay._load_param_dict").set_body([](TVMArgs args, TVMR dmlc::MemoryStringStream memstrm(&bytes); dmlc::Stream* strm = &memstrm; uint64_t header, reserved; - CHECK(strm->Read(&header)) << "Invalid parameters file format"; - CHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; - CHECK(strm->Read(&reserved)) << "Invalid parameters file format"; - CHECK(strm->Read(&names)) << "Invalid parameters file format"; + ICHECK(strm->Read(&header)) << "Invalid parameters file format"; + ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; + ICHECK(strm->Read(&reserved)) << "Invalid parameters file format"; + ICHECK(strm->Read(&names)) << "Invalid parameters file format"; uint64_t sz; strm->Read(&sz, sizeof(sz)); size_t size = static_cast(sz); - CHECK(size == names.size()) << "Invalid parameters file format"; + ICHECK(size == names.size()) << "Invalid parameters file format"; tvm::Array ret; for (size_t i = 0; i < size; ++i) { tvm::runtime::NDArray temp; diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 07f42266b831..3def6359c615 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -81,7 +81,7 @@ class MemoizedExprTranslator : public ::tvm::relay::ExprFunctorsecond; @@ -115,7 +115,7 @@ inline const PackedFunc* GetPackedFunc(const std::string& func_name) { template inline const runtime::TypedPackedFunc GetTypedPackedFunc(const std::string& func_name) { auto* pf = GetPackedFunc(func_name); - CHECK(pf != nullptr) << "can not find packed function"; + ICHECK(pf != nullptr) << "can not find packed function"; return runtime::TypedPackedFunc(*pf); } @@ -129,7 +129,7 @@ inline std::vector GetIntShape(const Array& shape) { std::vector ret; for (const auto& dim : shape) { const int64_t* pval = tir::as_const_int(dim); - CHECK(pval) << "Expect integer, but received: " << dim->GetTypeKey(); + ICHECK(pval) << "Expect integer, but received: " << dim->GetTypeKey(); ret.push_back(*pval); } return ret; @@ -192,8 +192,8 @@ inline relay::Function BindParamsByName( } Expr bound_expr = relay::Bind(func, bind_dict); Function ret = Downcast(bound_expr); - CHECK(ret.defined()) << "The returning type is expected to be a Relay Function." - << "\n"; + ICHECK(ret.defined()) << "The returning type is expected to be a Relay Function." 
+ << "\n"; return ret; } @@ -204,11 +204,11 @@ inline relay::Function BindParamsByName( */ inline std::vector GetShape(const Type& type) { const auto* ttype = type.as(); - CHECK(ttype) << "Expect TensorTypeNode"; + ICHECK(ttype) << "Expect TensorTypeNode"; std::vector shape; for (size_t i = 0; i < ttype->shape.size(); ++i) { auto* val = ttype->shape[i].as(); - CHECK(val); + ICHECK(val); shape.push_back(val->value); } return shape; @@ -223,7 +223,7 @@ inline std::vector GetShape(const Type& type) { */ inline bool IsOp(const CallNode* call, const std::string& op_name) { const auto* op_node = call->op.as(); - CHECK(op_node) << "Expects a single op."; + ICHECK(op_node) << "Expects a single op."; Op op = GetRef(op_node); return op == Op::Get(op_name); } @@ -239,14 +239,14 @@ inline bool IsOp(const CallNode* call, const std::string& op_name) { inline const CallNode* GetRootCall(const CallNode* current_call, int depth, const std::vector& expected_op_names) { - CHECK(current_call && depth >= 0 && static_cast(depth) < expected_op_names.size() && - IsOp(current_call, expected_op_names[depth])); + ICHECK(current_call && depth >= 0 && static_cast(depth) < expected_op_names.size() && + IsOp(current_call, expected_op_names[depth])); if (depth == 0) { return current_call; } - CHECK_GT(current_call->args.size(), 0); + ICHECK_GT(current_call->args.size(), 0); const auto* next_call = current_call->args[0].as(); return GetRootCall(next_call, depth - 1, expected_op_names); @@ -260,7 +260,7 @@ inline const CallNode* GetRootCall(const CallNode* current_call, int depth, */ inline std::string GetExtSymbol(const Function& func) { const auto name_node = func->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(name_node.defined()) << "Fail to retrieve external symbol."; + ICHECK(name_node.defined()) << "Fail to retrieve external symbol."; return std::string(name_node.value()); } diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index c3bf80571638..4a7e5eec17bc 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -60,19 +60,19 @@ Pass InlinePrimitives(); Pass ManifestAlloc(Target target_host, vm::TargetsMap targets) { auto f = tvm::runtime::Registry::Get("relay.transform.ManifestAlloc"); - CHECK(f != nullptr) << "unable to load allocation manifestation pass"; + ICHECK(f != nullptr) << "unable to load allocation manifestation pass"; return (*f)(target_host, targets); } Pass MemoryPlan() { auto f = tvm::runtime::Registry::Get("relay.transform.MemoryPlan"); - CHECK(f != nullptr) << "unable to load the memory planning pass"; + ICHECK(f != nullptr) << "unable to load the memory planning pass"; return (*f)(); } Pass LiftConstants() { auto f = tvm::runtime::Registry::Get("relay.transform.LiftConstants"); - CHECK(f != nullptr) << "unable to load the constant lifting pass"; + ICHECK(f != nullptr) << "unable to load the constant lifting pass"; return (*f)(); } @@ -178,7 +178,7 @@ TreeObjectPtr BuildDecisionTreeFromPattern(MatchValuePtr data, Pattern pattern, return TreeBranchNode::Make(cond, then_branch, else_branch); } else { const auto* pt = pattern.as(); - CHECK(pt) << "unhandled case: " << AsText(pattern, false); + ICHECK(pt) << "unhandled case: " << AsText(pattern, false); size_t field_index = 0; for (auto& p : pt->patterns) { auto d = std::make_shared(data, field_index++); @@ -209,10 +209,10 @@ std::vector ToAllocTensorShape(NDArray shape) { if (shape->ndim == 0) { return raw_shape; } - CHECK_EQ(shape->ndim, 1u); - CHECK_EQ(shape->dtype.code, 0U) << "The dtype of 
constant shape must be int32 or int64, but got " - << DLDataType2String(shape->dtype); - CHECK(shape->dtype.bits == 64 || shape->dtype.bits == 32) + ICHECK_EQ(shape->ndim, 1u); + ICHECK_EQ(shape->dtype.code, 0U) << "The dtype of constant shape must be int32 or int64, but got " + << DLDataType2String(shape->dtype); + ICHECK(shape->dtype.bits == 64 || shape->dtype.bits == 32) << "The dtype of constant shape must be int32 or int64, but got" << DLDataType2String(shape->dtype); @@ -247,7 +247,7 @@ int GetFallbackDevice() { Optional opt_fallback_dev = pass_ctx->GetConfig("relay.fallback_device_type", Integer(static_cast(kDLCPU))); auto fallback_dev = opt_fallback_dev.value(); - CHECK_GT(fallback_dev->value, 0U); + ICHECK_GT(fallback_dev->value, 0U); return fallback_dev->value; } @@ -271,7 +271,7 @@ class VMFunctionCompiler : ExprFunctor { // We then assign register num to the free variables for (auto param : func->params) { auto arg_register = NewRegister(); - CHECK_EQ(i, arg_register); + ICHECK_EQ(i, arg_register); var_register_map_.insert({param, arg_register}); params_.push_back(param->name_hint()); ++i; @@ -281,7 +281,7 @@ class VMFunctionCompiler : ExprFunctor { Function inner_func = Downcast(func->body); for (auto param : inner_func->params) { auto arg_register = NewRegister(); - CHECK_EQ(i, arg_register); + ICHECK_EQ(i, arg_register); var_register_map_.insert({param, arg_register}); params_.push_back(param->name_hint()); ++i; @@ -295,10 +295,10 @@ class VMFunctionCompiler : ExprFunctor { std::vector params_device_type; for (const auto& it : func->params) { if (!expr_device_map_.empty()) { - CHECK_GT(expr_device_map_.count(it), 0U); + ICHECK_GT(expr_device_map_.count(it), 0U); params_device_type.push_back(expr_device_map_[it].device_type); } else { - CHECK_EQ(targets_.size(), 1U); + ICHECK_EQ(targets_.size(), 1U); params_device_type.push_back((targets_.begin())->first); } } @@ -311,7 +311,7 @@ class VMFunctionCompiler : ExprFunctor { inline void Emit(const Instruction& instr) { DLOG(INFO) << "VMCompiler::Emit: instr=" << instr; - CHECK((int)instr.op < 100) << "Invalid opcode " << (int)instr.op; + ICHECK((int)instr.op < 100) << "Invalid opcode " << (int)instr.op; switch (instr.op) { case Opcode::AllocADT: case Opcode::AllocTensor: @@ -348,7 +348,7 @@ class VMFunctionCompiler : ExprFunctor { context_->const_device_type.push_back(targets_.begin()->first); } else { auto con = GetRef(const_node); - CHECK_GT(expr_device_map_.count(con), 0U); + ICHECK_GT(expr_device_map_.count(con), 0U); context_->const_device_type.push_back(expr_device_map_[con].device_type); } context_->constants.push_back(const_node->data); @@ -358,7 +358,7 @@ class VMFunctionCompiler : ExprFunctor { void VisitExpr_(const VarNode* var_node) { auto var = GetRef(var_node); auto reg_it = this->var_register_map_.find(var); - CHECK(reg_it != this->var_register_map_.end()); + ICHECK(reg_it != this->var_register_map_.end()); last_register_ = reg_it->second; } @@ -400,7 +400,7 @@ class VMFunctionCompiler : ExprFunctor { auto var = GetRef(gvar); auto func = context_->module->Lookup(var); auto it = context_->global_map.find(var); - CHECK(it != context_->global_map.end()); + ICHECK(it != context_->global_map.end()); // Allocate closure with zero free vars Emit(Instruction::AllocClosure(it->second, 0, {}, NewRegister())); } @@ -458,7 +458,7 @@ class VMFunctionCompiler : ExprFunctor { auto cfunc = engine_->LowerShapeFunc(key); int op_index = -1; // pick the only function inside the context - CHECK_EQ(cfunc->funcs->functions.size(), 1); + 
ICHECK_EQ(cfunc->funcs->functions.size(), 1); auto pfunc = Downcast((*cfunc->funcs->functions.begin()).second); if (context_->seen_funcs.count(pfunc) == 0) { op_index = context_->cached_funcs.size(); @@ -477,7 +477,7 @@ class VMFunctionCompiler : ExprFunctor { for (auto output : outputs) { auto reg = var_register_map_.find(Downcast(output)); - CHECK(reg != var_register_map_.end()) + ICHECK(reg != var_register_map_.end()) << "internal error: all variables should be in the register mapping"; argument_registers.push_back(reg->second); } @@ -489,16 +489,16 @@ class VMFunctionCompiler : ExprFunctor { void EmitInvokeTVMOp(const Function& func, const Expr& inputs, const Expr& outputs) { std::vector argument_registers; - CHECK(func->GetAttr(attr::kPrimitive, 0) != 0) + ICHECK(func->GetAttr(attr::kPrimitive, 0) != 0) << "internal error: invoke_tvm_op requires the first argument to be a relay::Function"; auto input_tuple = inputs.as(); - CHECK(input_tuple) << "internal error: invoke_tvm_op inputs must be a tuple," - << "please file a bug in the memory manifestation pass"; + ICHECK(input_tuple) << "internal error: invoke_tvm_op inputs must be a tuple," + << "please file a bug in the memory manifestation pass"; auto output_tuple = outputs.as(); - CHECK(output_tuple) << "internal error: invoke_tvm_op outputs must be a tuple," - << "please file a bug in the memory manifestation pass"; + ICHECK(output_tuple) << "internal error: invoke_tvm_op outputs must be a tuple," + << "please file a bug in the memory manifestation pass"; for (auto input : input_tuple->fields) { VisitExpr(input); @@ -507,7 +507,7 @@ class VMFunctionCompiler : ExprFunctor { for (auto output : output_tuple->fields) { auto reg = var_register_map_.find(Downcast(output)); - CHECK(reg != var_register_map_.end()) + ICHECK(reg != var_register_map_.end()) << "internal error: all variables should be in the register mapping"; argument_registers.push_back(reg->second); } @@ -520,11 +520,11 @@ class VMFunctionCompiler : ExprFunctor { // Next generate the invoke instruction. if (expr_device_map_.empty()) { // homogeneous execution. - CHECK_EQ(targets_.size(), 1U); + ICHECK_EQ(targets_.size(), 1U); const auto& it = targets_.begin(); target = (*it).second; } else { - CHECK_GT(expr_device_map_.count(func), 0U) + ICHECK_GT(expr_device_map_.count(func), 0U) << "Found not annotated expression, please make sure " "context analysis has been executed"; int dev_type = expr_device_map_[func].device_type; @@ -545,7 +545,7 @@ class VMFunctionCompiler : ExprFunctor { context_->cached_funcs.push_back(cfunc); } else { // TODO(jroesch): support lowered funcs for multiple targets - CHECK_EQ(cfunc->funcs->functions.size(), 1); + ICHECK_EQ(cfunc->funcs->functions.size(), 1); auto pfunc = Downcast((*cfunc->funcs->functions.begin()).second); if (context_->seen_funcs.find(pfunc) == context_->seen_funcs.end()) { op_index = context_->cached_funcs.size(); @@ -571,16 +571,16 @@ class VMFunctionCompiler : ExprFunctor { matcher .Match("vm.invoke_tvm_op", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { - CHECK_EQ(args.size(), 3); + ICHECK_EQ(args.size(), 3); EmitInvokeTVMOp(Downcast(args[0]), args[1], args[2]); }) .Match("memory.alloc_tensor", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { - CHECK_EQ(args.size(), 3); + ICHECK_EQ(args.size(), 3); // Get the attributes. 
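Every matcher arm in this VM compiler follows the same convention: tensor operands travel through VM registers, while static metadata (dtype, alignment hints) rides on the call's Attrs node, so a failed attrs downcast can only mean the compiler built a malformed op. That idiom, sketched as a hypothetical helper (ExpectAttrs is not a TVM function, just an illustration):

#include <tvm/ir/attrs.h>

// Downcasts a call's Attrs to the expected node type; failure is an internal
// compiler bug, hence ICHECK instead of a user-facing diagnostic.
template <typename TAttrs>
const TAttrs* ExpectAttrs(const tvm::Attrs& attrs, const char* what) {
  const TAttrs* node = attrs.as<TAttrs>();
  ICHECK(node != nullptr) << "must be the " << what << " attrs";
  return node;
}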
auto alloc_attrs = attrs.as(); - CHECK(alloc_attrs != nullptr) << "must be the alloc tensor attrs"; + ICHECK(alloc_attrs != nullptr) << "must be the alloc tensor attrs"; auto dtype = alloc_attrs->dtype; // The storage will be passed dynamically. @@ -612,22 +612,22 @@ class VMFunctionCompiler : ExprFunctor { .Match("memory.alloc_storage", [this, call_node](const Array& args, const Attrs& attrs, const Array& type_arg) { - CHECK_EQ(args.size(), 2); + ICHECK_EQ(args.size(), 2); // Compute the size of the allocation. this->VisitExpr(args[0]); auto size_register = last_register_; - CHECK(args[1].as()); + ICHECK(args[1].as()); NDArray alignment_arr = args[1].as()->data; - CHECK_EQ(alignment_arr->dtype.code, 0U) + ICHECK_EQ(alignment_arr->dtype.code, 0U) << "The dtype of constant shape must be int32 or int64, but got " << DLDataType2String(alignment_arr->dtype); - CHECK_EQ(alignment_arr->dtype.bits, 64U); + ICHECK_EQ(alignment_arr->dtype.bits, 64U); Index alignment = reinterpret_cast(alignment_arr->data)[0]; // Get the dtype hint from the attributes. auto alloc_attrs = attrs.as(); - CHECK(alloc_attrs != nullptr) << "must be the AllocStorage attrs"; + ICHECK(alloc_attrs != nullptr) << "must be the AllocStorage attrs"; auto dtype = alloc_attrs->dtype; Index device_type; @@ -637,7 +637,7 @@ class VMFunctionCompiler : ExprFunctor { auto& kv = *(targets_.begin()); device_type = kv.first; } else { - CHECK_GT(expr_device_map_.count(GetRef(call_node)), 0U) + ICHECK_GT(expr_device_map_.count(GetRef(call_node)), 0U) << " The alloc_storage node is not annotated"; device_type = expr_device_map_[GetRef(call_node)].device_type; } @@ -647,7 +647,7 @@ class VMFunctionCompiler : ExprFunctor { }) .Match("vm.shape_func", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { - CHECK_EQ(args.size(), 3); + ICHECK_EQ(args.size(), 3); auto shape_func = Downcast(args[0]); auto inputs = Downcast(args[1]); auto outputs = Downcast(args[2]); @@ -655,11 +655,11 @@ class VMFunctionCompiler : ExprFunctor { }) .Match("vm.shape_of", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { - CHECK_EQ(args.size(), 1U); + ICHECK_EQ(args.size(), 1U); // Get the attributes. 
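The `memory.alloc_storage` arm above extracts the alignment from a scalar constant by first checking the dtype code (0 means a signed integer in DLPack) and the bit width, then reading the host buffer directly. The same read, sketched standalone:

#include <tvm/runtime/ndarray.h>

// Validates and reads a scalar int64 constant straight out of its host buffer.
int64_t ReadInt64Scalar(const tvm::runtime::NDArray& arr) {
  ICHECK_EQ(arr->dtype.code, 0U) << "expected a signed integer dtype";
  ICHECK_EQ(arr->dtype.bits, 64U) << "expected a 64-bit constant";
  return reinterpret_cast<int64_t*>(arr->data)[0];
}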
const auto* shape_of_attrs = attrs.as(); - CHECK(shape_of_attrs) << "Must be the shape_of attrs"; - CHECK_EQ(shape_of_attrs->dtype.bits(), 64) + ICHECK(shape_of_attrs) << "Must be the shape_of attrs"; + ICHECK_EQ(shape_of_attrs->dtype.bits(), 64) << "The dtype of shape of must be int64, but got" << DLDataType2String(shape_of_attrs->dtype); this->VisitExpr(args[0]); @@ -667,7 +667,7 @@ class VMFunctionCompiler : ExprFunctor { }) .Match("vm.reshape_tensor", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { - CHECK_EQ(args.size(), 2u); + ICHECK_EQ(args.size(), 2u); this->VisitExpr(args[0]); auto tensor_reg = last_register_; this->VisitExpr(args[1]); @@ -676,12 +676,12 @@ class VMFunctionCompiler : ExprFunctor { }) .Match("device_copy", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { - CHECK_EQ(args.size(), 1U); + ICHECK_EQ(args.size(), 1U); this->VisitExpr(args[0]); auto src_reg = last_register_; auto device_copy_attrs = attrs.as(); - CHECK(device_copy_attrs != nullptr) << "Must be the device copy attrs"; + ICHECK(device_copy_attrs != nullptr) << "Must be the device copy attrs"; Index src_device_type = device_copy_attrs->src_dev_type; Index dst_device_type = device_copy_attrs->dst_dev_type; Emit(Instruction::DeviceCopy(src_reg, src_device_type, dst_device_type, @@ -711,7 +711,7 @@ class VMFunctionCompiler : ExprFunctor { // calling convention. auto global = GetRef(global_node); auto it = context_->global_map.find(global); - CHECK(it != context_->global_map.end()); + ICHECK(it != context_->global_map.end()); DLOG(INFO) << "VisitExpr_: generating invoke for " << global->name_hint << " with func_index=" << it->second; @@ -855,13 +855,13 @@ class VMFunctionCompiler : ExprFunctor { PackedFunc VMCompiler::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (name == "lower") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.num_args, 3); + ICHECK_EQ(args.num_args, 3); IRModule mod = args[0]; this->Lower(mod, args[1], args[2]); }); } else if (name == "codegen") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.num_args, 0); + ICHECK_EQ(args.num_args, 0); this->Codegen(); }); } else if (name == "get_executable") { @@ -884,7 +884,7 @@ PackedFunc VMCompiler::GetFunction(const std::string& name, const ObjectPtrOptimizeModule(args[0], args[1], args[2]); }); } else { @@ -900,7 +900,7 @@ void VMCompiler::SetParam(const std::string& name, runtime::NDArray data_in) { void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Target& target_host) { if (params_.size()) { BaseFunc base_func = mod->Lookup("main"); - CHECK(base_func->IsInstance()) + ICHECK(base_func->IsInstance()) << "VM compiler expects to compile relay::Function"; auto f = relay::backend::BindParamsByName(Downcast(base_func), params_); auto gvar = mod->GetGlobalVar("main"); @@ -936,7 +936,7 @@ void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Targe auto vm_func = func_compiler.Compile(gvar, func); size_t func_index = context_.global_map.at(gvar); - CHECK(func_index < exec_->functions.size()); + ICHECK(func_index < exec_->functions.size()); exec_->functions[func_index] = vm_func; } } @@ -1123,7 +1123,7 @@ void VMCompiler::Codegen() { if (target_str == "ext_dev") { // Collect metadata in functions that are handled by external codegen. 
- CHECK(mod->ContainGlobalVar(cfunc->func_name)); + ICHECK(mod->ContainGlobalVar(cfunc->func_name)); backend::ConstantUpdater const_visit(cfunc->func_name, ¶ms_); const_visit(Downcast(mod->Lookup(cfunc->func_name))); continue; diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc index 22b8364534c8..f21d0967701a 100644 --- a/src/relay/backend/vm/lambda_lift.cc +++ b/src/relay/backend/vm/lambda_lift.cc @@ -82,7 +82,7 @@ class LambdaLifter : public ExprMutator { auto var = GetRef(var_node); if (!letrec_.empty() && var == letrec_.back()) { auto it = lambda_map_.find(var); - CHECK(it != lambda_map_.end()); + ICHECK(it != lambda_map_.end()); return Call(it->second, call->args, call_node->attrs, call_node->type_args); } } @@ -154,11 +154,12 @@ class LambdaLifter : public ExprMutator { lifted_func = MarkClosure(lifted_func); } - CHECK(lifted_func.defined()); + ICHECK(lifted_func.defined()); if (module_->ContainGlobalVar(name)) { const auto existing_func = module_->Lookup(name); - CHECK(tvm::StructuralEqual()(lifted_func, existing_func)) << "lifted function hash collision"; + ICHECK(tvm::StructuralEqual()(lifted_func, existing_func)) + << "lifted function hash collision"; // If an identical function already exists, use its global var. global = module_->GetGlobalVar(name); } else { diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc index 50c05f2923bc..536e65979ee4 100644 --- a/src/relay/ir/dataflow_matcher.cc +++ b/src/relay/ir/dataflow_matcher.cc @@ -85,7 +85,7 @@ void DFPatternMatcher::ClearMap(size_t watermark) { bool DFPatternMatcher::VisitDFPattern(const DFPattern& pattern, const Expr& expr) { if (memoize_ && memo_.count(pattern)) { - CHECK_EQ(memo_[pattern].size(), 1); + ICHECK_EQ(memo_[pattern].size(), 1); return expr.same_as(memo_[pattern][0]); } else { auto watermark = matched_nodes_.size(); @@ -133,7 +133,7 @@ bool MatchRetValue(const ObjectRef& lhs, const TVMRetValue& rhs) { } break; default: - CHECK(false) << "Unsupported type code in Pattern Node " << rhs.type_code(); + ICHECK(false) << "Unsupported type code in Pattern Node " << rhs.type_code(); } return false; } @@ -644,7 +644,7 @@ class PatternGrouper { auto body = extractor.Mutate(expr); // Verify the pattern still holds - CHECK(DFPatternMatcher(body).Match(pattern_, body)); + ICHECK(DFPatternMatcher(body).Match(pattern_, body)); group.function = Function(params, body, NullValue(), Array()); group.name = extractor.GetName(); // Check to make sure we aren't overlapping with another group or creating an invalid fusion @@ -765,7 +765,7 @@ class PatternRewriter : protected MixedModeMutator { int count = 0; bool equal = true; static auto* structural_equal = runtime::Registry::Get("node.StructuralEqual"); - CHECK(structural_equal) << "node.StructuralEqual is not registered."; + ICHECK(structural_equal) << "node.StructuralEqual is not registered."; do { last = post; for (auto callback : callbacks) { diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 237cb35d8455..f2e0b363eb2b 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -47,7 +47,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { auto* node = static_cast(ref.get()); const PackedFunc* fprint = Registry::Get("relay._constant_repr"); - CHECK(fprint) << "unable to find printing function for constants"; + ICHECK(fprint) << "unable to find printing function for constants"; std::string data = (*fprint)(GetRef(node)); p->stream << "Constant(" << data 
<< ")"; }); @@ -56,8 +56,8 @@ TensorType ConstantNode::tensor_type() const { auto dtype = DataType(data->dtype); Array shape; for (int i = 0; i < data->ndim; i++) { - CHECK_LE(data->shape[i], std::numeric_limits::max()); - CHECK_GE(data->shape[i], std::numeric_limits::min()); + ICHECK_LE(data->shape[i], std::numeric_limits::max()); + ICHECK_GE(data->shape[i], std::numeric_limits::min()); shape.push_back(tvm::IntImm(DataType::Int(32), data->shape[i])); } diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc index a09179bcc585..a22b69c4ed1b 100644 --- a/src/relay/ir/expr_functor.cc +++ b/src/relay/ir/expr_functor.cc @@ -102,8 +102,8 @@ void ExpandDataflow(Expr expr, FCheckVisited fcheck_visited, FVisitLeaf fvisit_l } MixedModeVisitor::MixedModeVisitor(int visit_limit) { - CHECK(visit_limit > 0) << "Dataflow visit limit must be greater than 0"; - CHECK(visit_limit < 10) << "Dataflow visit limit must be less than 10"; + ICHECK(visit_limit > 0) << "Dataflow visit limit must be greater than 0"; + ICHECK(visit_limit < 10) << "Dataflow visit limit must be less than 10"; visit_limit_ = visit_limit; } @@ -524,13 +524,13 @@ class ExprBinder : public MixedModeMutator, PatternMutator { using MixedModeMutator::VisitExpr_; Expr VisitExpr_(const LetNode* op) final { - CHECK(!args_map_.count(op->var)) << "Cannot bind an internel variable in let"; + ICHECK(!args_map_.count(op->var)) << "Cannot bind an internel variable in let"; return ExprMutator::VisitExpr_(op); } Expr VisitExpr_(const FunctionNode* op) final { for (Var param : op->params) { - CHECK(!args_map_.count(param)) << "Cannnot bind an internal function parameter"; + ICHECK(!args_map_.count(param)) << "Cannnot bind an internal function parameter"; } return ExprMutator::VisitExpr_(op); } @@ -553,7 +553,7 @@ class ExprBinder : public MixedModeMutator, PatternMutator { } Var VisitVar(const Var& v) final { - CHECK(!args_map_.count(v)) << "Cannnot bind an internal pattern variable"; + ICHECK(!args_map_.count(v)) << "Cannnot bind an internal pattern variable"; return v; } @@ -584,7 +584,7 @@ Expr Bind(const Expr& expr, const tvm::Map& args_map) { } } ret = Function(new_params, new_body, func->ret_type, func->type_params, func->attrs); - CHECK_EQ(FreeVars(expr).size(), FreeVars(ret).size()); + ICHECK_EQ(FreeVars(expr).size(), FreeVars(ret).size()); return std::move(ret); } else { return ExprBinder(args_map).VisitExpr(expr); @@ -596,7 +596,7 @@ TVM_REGISTER_GLOBAL("relay.ir.Bind").set_body([](TVMArgs args, TVMRetValue* ret) if (input->IsInstance()) { *ret = Bind(Downcast(input), args[1]); } else { - CHECK(input->IsInstance()); + ICHECK(input->IsInstance()); *ret = Bind(Downcast(input), args[1]); } }); diff --git a/src/relay/ir/function.cc b/src/relay/ir/function.cc index 1439e8b59cf0..c9920a621b56 100644 --- a/src/relay/ir/function.cc +++ b/src/relay/ir/function.cc @@ -29,8 +29,8 @@ namespace relay { Function::Function(tvm::Array params, Expr body, Type ret_type, tvm::Array type_params, DictAttrs attrs, Span span) { ObjectPtr n = make_object(); - CHECK(params.defined()); - CHECK(type_params.defined()); + ICHECK(params.defined()); + ICHECK(type_params.defined()); n->params = std::move(params); n->body = std::move(body); n->ret_type = std::move(ret_type); diff --git a/src/relay/ir/indexed_graph.h b/src/relay/ir/indexed_graph.h index 70508279af21..4bbb741b760d 100644 --- a/src/relay/ir/indexed_graph.h +++ b/src/relay/ir/indexed_graph.h @@ -115,8 +115,8 @@ class IndexedGraph { return nullptr; } while (lhs != rhs) { - CHECK(lhs); - 
CHECK(rhs); + ICHECK(lhs); + ICHECK(rhs); if (lhs->depth_ < rhs->depth_) { rhs = rhs->dominator_parent_; } else if (lhs->depth_ > rhs->depth_) { diff --git a/src/relay/ir/transform.cc b/src/relay/ir/transform.cc index b5f4d152ee00..596f812e25af 100644 --- a/src/relay/ir/transform.cc +++ b/src/relay/ir/transform.cc @@ -128,7 +128,7 @@ IRModule FunctionPassNode::operator()(IRModule mod, const PassContext& pass_ctx) const PassInfo& pass_info = Info(); - CHECK(mod.defined()); + ICHECK(mod.defined()); DLOG(INFO) << "Executing function pass : " << pass_info->name << " with opt level: " << pass_info->opt_level; diff --git a/src/relay/op/algorithm/argsort.cc b/src/relay/op/algorithm/argsort.cc index a24097420873..455d413c2746 100644 --- a/src/relay/op/algorithm/argsort.cc +++ b/src/relay/op/algorithm/argsort.cc @@ -33,10 +33,10 @@ bool ArgsortRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, result] const ArgsortAttrs* param = attrs.as(); - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "Argsort: expect input type to be TensorType but get " << types[0]; return false; } diff --git a/src/relay/op/algorithm/topk.cc b/src/relay/op/algorithm/topk.cc index 14308dd592d6..b0e4b5dc6b4e 100644 --- a/src/relay/op/algorithm/topk.cc +++ b/src/relay/op/algorithm/topk.cc @@ -34,15 +34,15 @@ bool TopKRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, result] const TopKAttrs* param = attrs.as(); - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); - CHECK(data); + ICHECK(data); int ndim = data->shape.size(); int axis = param->axis; if (axis < 0) { axis += ndim; } - CHECK(axis >= 0 && axis < ndim); + ICHECK(axis >= 0 && axis < ndim); Array out_shape; for (int i = 0; i < ndim; ++i) { if (i != axis) { diff --git a/src/relay/op/dyn/algorithm/topk.cc b/src/relay/op/dyn/algorithm/topk.cc index 1c88730a5463..0ce0a18b2170 100644 --- a/src/relay/op/dyn/algorithm/topk.cc +++ b/src/relay/op/dyn/algorithm/topk.cc @@ -33,31 +33,31 @@ bool TopKRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, k, result] const TopKAttrs* param = attrs.as(); - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* k = types[1].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "tile: expect input type to be TensorType but get " << types[0]; return false; } if (k == nullptr) { - CHECK(types[1].as()) + ICHECK(types[1].as()) << "tile: expect input type to be TensorType but get " << types[1]; return false; } - CHECK(k->shape.size() <= 1) << "Parameter k must be a Scalar or a Tensor of shape (1, )"; + ICHECK(k->shape.size() <= 1) << "Parameter k must be a Scalar or a Tensor of shape (1, )"; if (k->shape.size() == 1) { const IntImmNode* k_shape = k->shape[0].as(); - CHECK(k_shape) << "Parameter k must have static shape"; - CHECK_EQ(k_shape->value, 1) << "Parameter k must be a Scalar or a Tensor of shape (1, )"; + ICHECK(k_shape) << "Parameter k must have static shape"; + ICHECK_EQ(k_shape->value, 1) << "Parameter k must be a Scalar or a Tensor of shape (1, )"; } int ndim = data->shape.size(); int axis = param->axis; if (axis < 0) { axis += ndim; } - CHECK(axis >= 0 && axis < ndim); + ICHECK(axis >= 0 && axis < ndim); 
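TopKRel above (like ArgsortRel before it) normalizes a possibly negative axis and then asserts that it names a real dimension; by this point the attrs were produced by the compiler, so an out-of-range axis is an internal bug. The recurring idiom as a standalone sketch:

// Negative axes count from the end; after normalization the axis must fall
// inside the tensor rank, or attribute handling upstream is broken.
int NormalizeAxis(int axis, int ndim) {
  if (axis < 0) axis += ndim;
  ICHECK(axis >= 0 && axis < ndim) << "axis " << axis << " out of range for rank " << ndim;
  return axis;
}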
Array out_shape; for (int i = 0; i < ndim; ++i) { if (i != axis) { diff --git a/src/relay/op/dyn/image/resize.cc b/src/relay/op/dyn/image/resize.cc index 23e17400f29d..6581250db0cd 100644 --- a/src/relay/op/dyn/image/resize.cc +++ b/src/relay/op/dyn/image/resize.cc @@ -36,17 +36,17 @@ TVM_REGISTER_NODE_TYPE(ResizeAttrs); bool ResizeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // {data, size, out} - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNCHW("NCHW"); const ResizeAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(layout_converter.defined()) + ICHECK(layout_converter.defined()) << "Resize only support input layouts that are convertible from NCHW." << " But got " << in_layout; diff --git a/src/relay/op/dyn/nn/pad.cc b/src/relay/op/dyn/nn/pad.cc index 73daccbd97fd..42ec784f8c15 100644 --- a/src/relay/op/dyn/nn/pad.cc +++ b/src/relay/op/dyn/nn/pad.cc @@ -41,7 +41,7 @@ namespace dyn { bool PadRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types = [data_type, pad_width_type, pad_value_type, ret_type] - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const auto* data = types[0].as(); if (data == nullptr) return false; @@ -52,13 +52,13 @@ bool PadRel(const Array& types, int num_inputs, const Attrs& attrs, if (pad_value == nullptr) return false; int data_rank = data->shape.size(); - CHECK(data_rank) << "Data shape must have static rank"; + ICHECK(data_rank) << "Data shape must have static rank"; int pad_width_rank = pad_width->shape.size(); - CHECK_EQ(pad_width_rank, 2) << "Pad width must be 2D"; + ICHECK_EQ(pad_width_rank, 2) << "Pad width must be 2D"; const PadAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); std::vector oshape; for (int i = 0; i < data_rank; i++) { @@ -72,7 +72,7 @@ bool PadRel(const Array& types, int num_inputs, const Attrs& attrs, Array PadCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); - CHECK(param); + ICHECK(param); auto data = inputs[0]; auto pad_width = inputs[1]; @@ -88,7 +88,7 @@ Array PadCompute(const Attrs& attrs, const Array& inputs } const auto* out_ttype = out_type.as(); - CHECK(out_ttype != nullptr); + ICHECK(out_ttype != nullptr); return Array{topi::pad(inputs[0], pad_before, pad_after, pad_value, "T_pad", topi::kElementWise, param->pad_mode, diff --git a/src/relay/op/dyn/nn/upsampling.cc b/src/relay/op/dyn/nn/upsampling.cc index 8a28475eacd5..93869757e96f 100644 --- a/src/relay/op/dyn/nn/upsampling.cc +++ b/src/relay/op/dyn/nn/upsampling.cc @@ -41,7 +41,7 @@ namespace dyn { bool UpSamplingRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types = [data_type, scale_h_type, scale_w_type, ret_type] - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const auto* data = types[0].as(); const auto* scale_h = types[1].as(); const auto* scale_w = types[2].as(); @@ -49,16 +49,16 @@ bool UpSamplingRel(const Array& types, int num_inputs, const Attrs& attrs, if (scale_h == nullptr) return false; if (scale_w == nullptr) return false; - CHECK_EQ(scale_h->shape.size(), 0); - CHECK_EQ(scale_w->shape.size(), 0); + ICHECK_EQ(scale_h->shape.size(), 0); + ICHECK_EQ(scale_w->shape.size(), 0); static 
const Layout kNCHW("NCHW"); const UpSamplingAttrs* param = attrs.as(); - CHECK(param); + ICHECK(param); const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(layout_converter.defined()) + ICHECK(layout_converter.defined()) << "UpSampling only supports input layouts that are convertible from NCHW." << " But got " << in_layout; @@ -122,18 +122,18 @@ RELAY_REGISTER_OP("dyn.nn.upsampling") bool UpSampling3DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types = [data_type, scale_d_type, scale_h_type, scale_w_type, ret_type] - CHECK_EQ(types.size(), 5); + ICHECK_EQ(types.size(), 5); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNCDHW("NCDHW"); const UpSampling3DAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCDHW); - CHECK(layout_converter.defined()) + ICHECK(layout_converter.defined()) << "UpSampling3D only support input layouts that are convertible from NCDHW." << " But got " << in_layout; diff --git a/src/relay/op/dyn/nn/upsampling.h b/src/relay/op/dyn/nn/upsampling.h index 79ed65bba36b..acdc54174913 100644 --- a/src/relay/op/dyn/nn/upsampling.h +++ b/src/relay/op/dyn/nn/upsampling.h @@ -43,7 +43,7 @@ Array > UpsamplingInferCorrectLayout(const Attrs& attrs, // NOTE: Discard "const" qualifier here. T* params = const_cast(attrs.as()); if (new_in_layouts.defined()) { - CHECK_GT(new_in_layouts.size(), 0); + ICHECK_GT(new_in_layouts.size(), 0); Layout raw_layout(params->layout); Layout input = new_in_layouts[0]; diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc index 863ad643f0da..119eba3da188 100644 --- a/src/relay/op/dyn/tensor/transform.cc +++ b/src/relay/op/dyn/tensor/transform.cc @@ -47,11 +47,11 @@ namespace dyn { bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types: [data, newshape, result] - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "reshape: expect input type to be TensorType but get " << types[0]; return false; } @@ -59,7 +59,7 @@ bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, Array oshape; const auto* newshape = types[1].as(); if (newshape == nullptr) { - CHECK(types[1].as()) + ICHECK(types[1].as()) << "reshape: expect input type to be TensorType but get " << types[1]; return false; } @@ -76,7 +76,7 @@ bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, Array ReshapeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* out_ttype = out_type.as(); - CHECK(out_ttype != nullptr); + ICHECK(out_ttype != nullptr); Array newshape; for (auto val : out_ttype->shape) { if (val->IsInstance()) { @@ -149,21 +149,21 @@ RELAY_REGISTER_OP("dyn.reshape") bool TileRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, reps, result] - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* reps = types[1].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "tile: expect input type to be TensorType but get " << types[0]; return false; } if (reps == nullptr) { - CHECK(types[1].as()) + ICHECK(types[1].as()) 
<< "tile: expect input type to be TensorType but get " << types[1]; return false; } const IntImmNode* reps_shape = reps->shape[0].as(); - CHECK(reps_shape) << "Parameter reps must have static shape"; + ICHECK(reps_shape) << "Parameter reps must have static shape"; const size_t ndim = data->shape.size(); const size_t rndim = reps_shape->value; size_t tndim = (ndim > rndim) ? ndim : rndim; @@ -178,7 +178,7 @@ bool TileRel(const Array& types, int num_inputs, const Attrs& attrs, Array TileCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { - CHECK_EQ(inputs.size(), 2); + ICHECK_EQ(inputs.size(), 2); const auto* out_ttype = out_type.as(); size_t rndim = inputs[1]->shape[0].as()->value; return {topi::dyn_tile(inputs[0], out_ttype->shape, rndim)}; @@ -212,7 +212,7 @@ RELAY_REGISTER_OP("dyn.tile") bool BroadCastToRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types = [data_type, broadcast_shape_type, ret_type] - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* input_type = types[0].as(); const auto* target_type = types[1].as(); @@ -225,8 +225,9 @@ bool BroadCastToRel(const Array& types, int num_inputs, const Attrs& attrs auto out_dtype = input_type->dtype; // rank must be static const IntImmNode* rank = target_type->shape[0].as(); - CHECK(rank) << "Target shape must have static rank"; // rank must be static even in dyn pass - // could add support for dyn rank in futures + ICHECK(rank) + << "Target shape must have static rank"; // rank must be static even in dyn pass + // could add support for dyn rank in futures std::vector oshape; for (int i = 0; i < rank->value; ++i) { @@ -266,13 +267,13 @@ RELAY_REGISTER_OP("dyn.broadcast_to") bool InitOpRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types = [zeros_shape, ret_type] - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const InitOpAttrs* param = attrs.as(); const auto* fill_shape = types[0].as(); DataType out_dtype = param->dtype; const IntImmNode* shape_shape = fill_shape->shape[0].as(); - CHECK(shape_shape) << "Parameter shape must have static rank"; + ICHECK(shape_shape) << "Parameter shape must have static rank"; std::vector oshape; for (int i = 0; i < shape_shape->value; ++i) { @@ -324,9 +325,9 @@ RELAY_REGISTER_OP("dyn.ones") bool OneHotRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [indices, on_value, off_value, result] - CHECK_EQ(types.size(), 5); + ICHECK_EQ(types.size(), 5); const auto* indices = types[0].as(); - CHECK(indices); + ICHECK(indices); const auto param = attrs.as(); @@ -349,7 +350,7 @@ bool OneHotRel(const Array& types, int num_inputs, const Attrs& attrs, Array OneHotCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const auto* out_ttype = out_type.as(); return Array{topi::one_hot(inputs[0], inputs[1](), inputs[2](), -1, param->axis, param->dtype, out_ttype->shape)}; @@ -393,7 +394,7 @@ RELAY_REGISTER_OP("dyn.one_hot") bool FullRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const InitOpAttrs* param = attrs.as(); const auto* fill_value = types[0].as(); const auto* fill_shape = types[1].as(); @@ -406,11 +407,11 @@ bool FullRel(const Array& types, int num_inputs, const Attrs& attrs, out_dtype = fill_value->dtype; 
} - CHECK_EQ(fill_value->shape.size(), 0) + ICHECK_EQ(fill_value->shape.size(), 0) << "Fill value should be a scalar but has dimension " << fill_value->shape.size() << "."; const IntImmNode* rank = fill_shape->shape[0].as(); - CHECK(rank) << "Parameter shape must have static rank"; + ICHECK(rank) << "Parameter shape must have static rank"; std::vector oshape; for (int i = 0; i < rank->value; ++i) { @@ -449,7 +450,7 @@ RELAY_REGISTER_OP("dyn.full") bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // [data, begin, end, strides, out] - CHECK_EQ(types.size(), 5); + ICHECK_EQ(types.size(), 5); const StridedSliceAttrs* param = attrs.as(); if (param == nullptr) { return false; @@ -501,9 +502,9 @@ Array StridedSliceCompute(const Attrs& attrs, const Arrayshape.size(); - CHECK(begin->shape[0].as()->value == data_rank && - end->shape[0].as()->value == data_rank && - strides->shape[0].as()->value == data_rank) + ICHECK(begin->shape[0].as()->value == data_rank && + end->shape[0].as()->value == data_rank && + strides->shape[0].as()->value == data_rank) << "begin, end, and strides are required to have the same length" << " if they are dynamic variables."; return Array{DynamicStridedSlice(data, begin, end, strides)}; diff --git a/src/relay/op/image/dilation2d.cc b/src/relay/op/image/dilation2d.cc index 462f11f56d0d..1f8c7ec732d9 100644 --- a/src/relay/op/image/dilation2d.cc +++ b/src/relay/op/image/dilation2d.cc @@ -62,7 +62,7 @@ Expr MakeDilation2D(Expr data, Expr weight, Array strides, Array bool Dilation2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr) return false; @@ -70,23 +70,23 @@ bool Dilation2DRel(const Array& types, int num_inputs, const Attrs& attrs, static const Layout kOIHW("IHW"); const AttrType* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(trans_in_layout.defined()) + ICHECK(trans_in_layout.defined()) << "Dilation2D only support input layouts that are convertible from NCHW." << " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); - CHECK(trans_kernel_layout.defined()) + ICHECK(trans_kernel_layout.defined()) << "Dilation2D only support kernel layouts that are convertible from OIHW." << " But got " << kernel_layout; Layout out_layout(param->data_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); - CHECK(trans_out_layout.defined()) + ICHECK(trans_out_layout.defined()) << "Dilation2D only support output layouts that are convertible from NCHW." 
<< " But got " << out_layout; diff --git a/src/relay/op/image/grid_sample.cc b/src/relay/op/image/grid_sample.cc index bc6989155323..d5fa68aed82a 100644 --- a/src/relay/op/image/grid_sample.cc +++ b/src/relay/op/image/grid_sample.cc @@ -35,21 +35,21 @@ TVM_REGISTER_NODE_TYPE(AffineGridAttrs); bool AffineGridRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; auto batch_size = data->shape[0]; const AffineGridAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Array oshape; - CHECK(data->shape.size() == 3U && reporter->AssertEQ(data->shape[1], 2) && - reporter->AssertEQ(data->shape[2], 3)) + ICHECK(data->shape.size() == 3U && reporter->AssertEQ(data->shape[1], 2) && + reporter->AssertEQ(data->shape[2], 3)) << "data should be an" "affine matrix with shape [batch_size, 2, 3]"; - CHECK(param->target_shape.defined() && param->target_shape.size() == 2) + ICHECK(param->target_shape.defined() && param->target_shape.size() == 2) << "target_shape should be 2D"; oshape.push_back(batch_size); oshape.push_back(2); @@ -97,12 +97,12 @@ TVM_REGISTER_NODE_TYPE(GridSampleAttrs); bool GridSampleRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* grid = types[1].as(); if (!data || !grid) return false; const auto* param = attrs.as(); - CHECK(param); + ICHECK(param); static const Layout kNCHW("NCHW"); const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCHW); diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc index 41b7afe6d00c..b8875e48ed0f 100644 --- a/src/relay/op/image/resize.cc +++ b/src/relay/op/image/resize.cc @@ -35,17 +35,17 @@ TVM_REGISTER_NODE_TYPE(ResizeAttrs); bool ResizeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNCHW("NCHW"); const ResizeAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(layout_converter.defined()) + ICHECK(layout_converter.defined()) << "Resize only support input layouts that are convertible from NCHW." << " But got " << in_layout; @@ -104,17 +104,17 @@ TVM_REGISTER_NODE_TYPE(Resize3dAttrs); bool Resize3dRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNCDHW("NCDHW"); const Resize3dAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCDHW); - CHECK(layout_converter.defined()) + ICHECK(layout_converter.defined()) << "Resize3d only support input layouts that are convertible from NCDHW." 
<< " But got " << in_layout; @@ -175,14 +175,14 @@ TVM_REGISTER_NODE_TYPE(CropAndResizeAttrs); bool CropAndResizeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const auto* data = types[0].as(); const auto* boxes = types[1].as(); const auto* box_indices = types[2].as(); if (data == nullptr || boxes == nullptr || box_indices == nullptr) return false; const CropAndResizeAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); auto crop_size = param->crop_size; DataType out_dtype = param->out_dtype; diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index b853ef635b12..dc5a1ebd3c73 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -54,19 +54,19 @@ TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage") bool AllocStorageRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3u); + ICHECK_EQ(types.size(), 3u); auto size_type = types[0]; auto tensor_type = size_type.as(); - CHECK(tensor_type != nullptr); - CHECK_EQ(tensor_type->dtype, DataType::Int(64)); - CHECK_EQ(tensor_type->shape.size(), 0); + ICHECK(tensor_type != nullptr); + ICHECK_EQ(tensor_type->dtype, DataType::Int(64)); + ICHECK_EQ(tensor_type->shape.size(), 0); auto align_type = types[1]; auto align_ttype = align_type.as(); - CHECK(align_ttype != nullptr); - CHECK_EQ(align_ttype->dtype, DataType::Int(64)); - CHECK_EQ(align_ttype->shape.size(), 0); + ICHECK(align_ttype != nullptr); + ICHECK_EQ(align_ttype->dtype, DataType::Int(64)); + ICHECK_EQ(align_ttype->shape.size(), 0); auto mod = reporter->GetModule(); - CHECK(mod.defined()); + ICHECK(mod.defined()); auto storage_name = mod->GetGlobalTypeVar("Storage"); auto storage = TypeCall(storage_name, {}); reporter->Assign(types[2], storage); @@ -107,10 +107,10 @@ TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_tensor") std::vector FromConstShape(Constant konst) { runtime::NDArray shape = konst->data; std::vector raw_shape; - CHECK_EQ(shape->ndim, 1u); - CHECK_EQ(shape->dtype.code, 0U) << "The dtype of constant shape must be int32 or int64, but got " - << runtime::DLDataType2String(shape->dtype); - CHECK(shape->dtype.bits == 64 || shape->dtype.bits == 32) + ICHECK_EQ(shape->ndim, 1u); + ICHECK_EQ(shape->dtype.code, 0U) << "The dtype of constant shape must be int32 or int64, but got " + << runtime::DLDataType2String(shape->dtype); + ICHECK(shape->dtype.bits == 64 || shape->dtype.bits == 32) << "The dtype of constant shape must be int32 or int64, but got" << runtime::DLDataType2String(shape->dtype); @@ -131,28 +131,28 @@ std::vector FromConstShape(Constant konst) { bool AllocTensorRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4u); + ICHECK_EQ(types.size(), 4u); auto alloc_attrs = attrs.as(); - CHECK(alloc_attrs != nullptr) << "must be alloc_tensor attributes"; + ICHECK(alloc_attrs != nullptr) << "must be alloc_tensor attributes"; // First argument should be storage. auto mod = reporter->GetModule(); - CHECK(mod.defined()); + ICHECK(mod.defined()); auto storage_name = mod->GetGlobalTypeVar("Storage"); auto storage = relay::TypeCall(storage_name, {}); reporter->Assign(types[0], storage); // Second argument should be the offset. 
auto offset_type = types[1].as(); - CHECK(offset_type != nullptr) << "must be a scalar type"; + ICHECK(offset_type != nullptr) << "must be a scalar type"; // Third argument should be shape tensor. auto tt = types[2].as(); - CHECK(tt != nullptr) << "must be tensor type"; + ICHECK(tt != nullptr) << "must be tensor type"; // Be careful about having to allocate scalars. int64_t dims = 0; if (tt->shape.size() != 0) { auto rank = tt->shape[0].as(); - CHECK(rank != nullptr); + ICHECK(rank != nullptr); dims = rank->value; } @@ -161,14 +161,14 @@ bool AllocTensorRel(const Array& types, int num_inputs, const Attrs& attrs if (alloc_attrs->const_shape.defined()) { auto con = alloc_attrs->const_shape; auto sh = FromConstShape(con); - CHECK_EQ(sh.size(), dims); + ICHECK_EQ(sh.size(), dims); Array out_shape; for (auto i = 0u; i < dims; i++) { out_shape.push_back(tvm::Integer(sh[i])); } alloc_type = TensorType(out_shape, alloc_attrs->dtype); } else { - CHECK(alloc_attrs->assert_shape.defined()) + ICHECK(alloc_attrs->assert_shape.defined()) << "the assert_shape must be set when const_shape is not"; alloc_type = TensorType(alloc_attrs->assert_shape, alloc_attrs->dtype); return true; @@ -198,7 +198,7 @@ RELAY_REGISTER_OP("memory.alloc_tensor") bool KillRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2u); + ICHECK_EQ(types.size(), 2u); // TODO(@jroesch): should only support tensors. reporter->Assign(types[1], TupleType::Empty()); return true; diff --git a/src/relay/op/nn/bitserial.cc b/src/relay/op/nn/bitserial.cc index 61a1b8fdf289..853807997a4d 100644 --- a/src/relay/op/nn/bitserial.cc +++ b/src/relay/op/nn/bitserial.cc @@ -50,9 +50,9 @@ Array> BinaryConv2DInferCorrectLayout(const Attrs& attrs, bool BitPackRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { const BitPackAttrs* param = attrs.as(); - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); - CHECK(data); + ICHECK(data); int ndim = data->shape.size(); int bits = param->bits; int pack_axis = param->pack_axis; @@ -120,20 +120,20 @@ TVM_REGISTER_NODE_TYPE(BinaryConv2DAttrs); bool BinaryConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; const BinaryConv2DAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); static const Layout kNCHW("NCHW"); const Layout in_layout(param->data_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); - CHECK(param->channels.defined()); - CHECK(param->kernel_size.defined()); + ICHECK(param->channels.defined()); + ICHECK(param->kernel_size.defined()); Array oshape({dshape_nchw[0], param->channels, 0, 0}); IndexExpr pad_h, pad_w; GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); @@ -199,15 +199,15 @@ TVM_REGISTER_NODE_TYPE(BinaryDenseAttrs); bool BinaryDenseRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; const BinaryDenseAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); - CHECK(static_cast(data->shape.size()) != 0); - CHECK(param->units.defined()); + ICHECK(static_cast(data->shape.size()) != 
0); + ICHECK(param->units.defined()); Array oshape = data->shape; oshape.Set((oshape.size() - 1), param->units); diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 935058c1a5b3..f0112227153d 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -40,7 +40,7 @@ namespace relay { template bool Conv1DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr) return false; @@ -48,23 +48,23 @@ bool Conv1DRel(const Array& types, int num_inputs, const Attrs& attrs, static const Layout kOIW("OIW"); const AttrType* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCW); - CHECK(trans_in_layout.defined()) + ICHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCW." << " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIW); - CHECK(trans_kernel_layout.defined()) + ICHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIW." << " But got " << kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCW); - CHECK(trans_out_layout.defined()) + ICHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCW." << " But got " << out_layout; @@ -92,17 +92,17 @@ bool Conv1DRel(const Array& types, int num_inputs, const Attrs& attrs, auto wshape = trans_kernel_layout.ForwardShape(weight->shape); if (param->kernel_size.defined()) { // check the size - CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2])) + ICHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2])) << "Conv1D: shape of weight is inconsistent with kernel_size, " << " kernel_size=" << param->kernel_size << " wshape=" << wshape; } if (param->channels.defined()) { - CHECK(reporter->AssertEQ(param->channels, wshape[0])) + ICHECK(reporter->AssertEQ(param->channels, wshape[0])) << "Conv1D: shape of weight is inconsistent with channels, " << " channels=" << param->channels << " wshape=" << wshape; } if (!dshape_ncw[1].as() && !wshape[1].as()) { - CHECK(reporter->AssertEQ(dshape_ncw[1], wshape[1])); + ICHECK(reporter->AssertEQ(dshape_ncw[1], wshape[1])); } channels = wshape[0]; dilated_ksize = 1 + (wshape[2] - 1) * param->dilation[0]; @@ -139,7 +139,7 @@ bool Conv2DRel(const Array& types, int num_inputs, const Attrs& attrs, static const Layout kOIHW("OIHW"); const AttrType* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); @@ -191,8 +191,8 @@ bool Conv2DRel(const Array& types, int num_inputs, const Attrs& attrs, IndexExpr channels, dilated_ksize_y, dilated_ksize_x; // infer weight if the kernel_size and channels are defined if (param->kernel_size.defined() && param->channels.defined()) { - CHECK_EQ(param->kernel_size.size(), 2); - CHECK_EQ(param->dilation.size(), 2); + ICHECK_EQ(param->kernel_size.size(), 2); + ICHECK_EQ(param->dilation.size(), 2); Array wshape; if (is_depthwise) { @@ -291,7 +291,7 @@ bool Conv2DRel(const Array& types, int 
num_inputs, const Attrs& attrs, template bool Conv3DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr) return false; @@ -299,23 +299,23 @@ bool Conv3DRel(const Array& types, int num_inputs, const Attrs& attrs, static const Layout kOIDHW("OIDHW"); const AttrType* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCDHW); - CHECK(trans_in_layout.defined()) + ICHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCDHW." << " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIDHW); - CHECK(trans_kernel_layout.defined()) + ICHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIDHW." << " But got " << kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCDHW); - CHECK(trans_out_layout.defined()) + ICHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCDHW." << " But got " << out_layout; @@ -324,8 +324,8 @@ bool Conv3DRel(const Array& types, int num_inputs, const Attrs& attrs, IndexExpr channels, dilated_ksize_z, dilated_ksize_y, dilated_ksize_x; // infer weight if the kernel_size and channels are defined if (param->kernel_size.defined() && param->channels.defined()) { - CHECK_EQ(param->kernel_size.size(), 3); - CHECK_EQ(param->dilation.size(), 3); + ICHECK_EQ(param->kernel_size.size(), 3); + ICHECK_EQ(param->dilation.size(), 3); Array wshape; tvm::tir::ExprDeepEqual expr_equal; @@ -355,23 +355,23 @@ bool Conv3DRel(const Array& types, int num_inputs, const Attrs& attrs, if (weight == nullptr) return false; auto wshape = trans_kernel_layout.ForwardShape(weight->shape); if (param->kernel_size.defined()) { - CHECK_EQ(param->kernel_size.size(), 3); + ICHECK_EQ(param->kernel_size.size(), 3); // check the size - CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && - reporter->AssertEQ(param->kernel_size[1], wshape[3]) && - reporter->AssertEQ(param->kernel_size[2], wshape[4])) + ICHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && + reporter->AssertEQ(param->kernel_size[1], wshape[3]) && + reporter->AssertEQ(param->kernel_size[2], wshape[4])) << "Conv3D: shape of weight is inconsistent with kernel_size, " << " kernel_size=" << param->kernel_size << " wshape=" << wshape; } if (param->channels.defined()) { - CHECK(reporter->AssertEQ(param->channels, wshape[0])) + ICHECK(reporter->AssertEQ(param->channels, wshape[0])) << "Conv3D: shape of weight is inconsistent with channels, " << " channels=" << param->channels << " wshape=" << wshape; } if (!dshape_ncdhw[1].as() && !wshape[1].as()) { - CHECK(reporter->AssertEQ(indexdiv(dshape_ncdhw[1], param->groups), wshape[1])); + ICHECK(reporter->AssertEQ(indexdiv(dshape_ncdhw[1], param->groups), wshape[1])); } channels = wshape[0]; dilated_ksize_z = 1 + (wshape[2] - 1) * param->dilation[0]; @@ -413,14 +413,14 @@ bool Conv3DRel(const Array& types, int num_inputs, const Attrs& attrs, // Winograd convolution shape relations inline bool Conv2DWinogradWeightTransformRel(const Array& types, int 
num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; const ConvWinogradWeightTransformAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); - CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; + ICHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; std::vector oshape{ param->tile_size + data->shape[2] - 1, @@ -458,16 +458,16 @@ inline bool Conv2DWinogradWeightTransformRel(const Array& types, int num_i // inline bool Conv2DGemmWeightTransformRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* weight = types[0].as(); if (weight == nullptr) return false; const ConvGemmWeightTransformAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); int n = param->tile_rows; int k = param->tile_cols; - CHECK_EQ(weight->shape.size(), 4) << "Only support HWIO kernel layout"; + ICHECK_EQ(weight->shape.size(), 4) << "Only support HWIO kernel layout"; const auto K = weight->shape[0] * weight->shape[1] * weight->shape[2]; const auto N = weight->shape[3]; @@ -494,14 +494,14 @@ inline bool Conv2DGemmWeightTransformRel(const Array& types, int num_input inline bool Conv3DWinogradWeightTransformRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; const ConvWinogradWeightTransformAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); - CHECK_EQ(data->shape.size(), 5) << "Only support NCDHW normal kernel layout"; + ICHECK_EQ(data->shape.size(), 5) << "Only support NCDHW normal kernel layout"; // Shape of packed weights depends on whether depth is being transformed or not. 
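For readers not steeped in the Winograd relations being touched here: the 2D weight-transform relation above derives its output shape from alpha = tile_size + kernel - 1, the side of the transformed tile. A hedged sketch of that arithmetic (the helper and the [CO, CI, KH, KW] kernel ordering are assumptions drawn from the surrounding hunks, not TVM API):

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Packed-weight shape per the Conv2D Winograd weight-transform relation:
    // a kernel [CO, CI, KH, KW] becomes [tile + KH - 1, tile + KW - 1, CO, CI].
    std::array<int64_t, 4> WinogradWeightShape(const std::array<int64_t, 4>& kernel,
                                               int64_t tile_size) {
      const int64_t co = kernel[0], ci = kernel[1], kh = kernel[2], kw = kernel[3];
      return {tile_size + kh - 1, tile_size + kw - 1, co, ci};
    }

    int main() {
      // F(2x2, 3x3): a 3x3 kernel with tile size 2 transforms into 4x4 tiles.
      auto s = WinogradWeightShape({64, 32, 3, 3}, 2);
      assert(s[0] == 4 && s[1] == 4 && s[2] == 64 && s[3] == 32);
      return 0;
    }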
Array oshape({0, 0, 0, data->shape[0], data->shape[1]}); @@ -524,7 +524,7 @@ inline bool Conv3DWinogradWeightTransformRel(const Array& types, int num_i inline bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { return false; @@ -532,9 +532,9 @@ inline bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, int const Conv2DWinogradNNPACKWeightTransformAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); - CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; + ICHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; std::vector oshape{ data->shape[0], @@ -554,30 +554,30 @@ inline bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, int template bool Conv2DWinogradRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNCHW("NCHW"); static const Layout kOIHW("OIHW"); const AttrType* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(trans_in_layout.defined()) + ICHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCHW." << " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); - CHECK(trans_kernel_layout.defined()) + ICHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIHW." << " But got " << kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); - CHECK(trans_out_layout.defined()) + ICHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCHW." 
<< " But got " << out_layout; @@ -585,11 +585,11 @@ bool Conv2DWinogradRel(const Array& types, int num_inputs, const Attrs& at IndexExpr channels, dilated_ksize_y, dilated_ksize_x; - CHECK(param->kernel_size.defined() && param->channels.defined()) + ICHECK(param->kernel_size.defined() && param->channels.defined()) << "The kernel size and channels of a Conv must be set or inferred by previous pass"; - CHECK_EQ(param->kernel_size.size(), 2); - CHECK_EQ(param->dilation.size(), 2); + ICHECK_EQ(param->kernel_size.size(), 2); + ICHECK_EQ(param->dilation.size(), 2); channels = param->channels; dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; @@ -631,30 +631,30 @@ bool Conv2DWinogradRel(const Array& types, int num_inputs, const Attrs& at template bool Conv2DGemmRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNHWC("NHWC"); static const Layout kHWIO("HWIO"); const AttrType* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNHWC); - CHECK(trans_in_layout.defined()) + ICHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NHWC." << " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kHWIO); - CHECK(trans_kernel_layout.defined()) + ICHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from HWIO." << " But got " << kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNHWC); - CHECK(trans_out_layout.defined()) + ICHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NHWC." << " But got " << out_layout; @@ -662,11 +662,11 @@ bool Conv2DGemmRel(const Array& types, int num_inputs, const Attrs& attrs, IndexExpr channels, dilated_ksize_y, dilated_ksize_x; - CHECK(param->kernel_size.defined() && param->channels.defined()) + ICHECK(param->kernel_size.defined() && param->channels.defined()) << "The kernel size and channels of a Conv must be set or inferred by previous pass"; - CHECK_EQ(param->kernel_size.size(), 2); - CHECK_EQ(param->dilation.size(), 2); + ICHECK_EQ(param->kernel_size.size(), 2); + ICHECK_EQ(param->dilation.size(), 2); channels = param->channels; dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; @@ -703,30 +703,30 @@ bool Conv2DGemmRel(const Array& types, int num_inputs, const Attrs& attrs, template bool Conv3DWinogradRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNCDHW("NCDHW"); static const Layout kOIDHW("OIDHW"); const AttrType* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCDHW); - CHECK(trans_in_layout.defined()) + ICHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCDHW." 
<< " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIDHW); - CHECK(trans_kernel_layout.defined()) + ICHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIDHW." << " But got " << kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCDHW); - CHECK(trans_out_layout.defined()) + ICHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCDHW." << " But got " << out_layout; @@ -734,11 +734,11 @@ bool Conv3DWinogradRel(const Array& types, int num_inputs, const Attrs& at IndexExpr channels, dilated_ksize_d, dilated_ksize_y, dilated_ksize_x; - CHECK(param->kernel_size.defined() && param->channels.defined()) + ICHECK(param->kernel_size.defined() && param->channels.defined()) << "The kernel size and channels of a Conv must be set or inferred by previous pass"; - CHECK_EQ(param->kernel_size.size(), 3); - CHECK_EQ(param->dilation.size(), 3); + ICHECK_EQ(param->kernel_size.size(), 3); + ICHECK_EQ(param->dilation.size(), 3); channels = param->channels; dilated_ksize_d = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; @@ -787,7 +787,7 @@ bool Conv3DWinogradRel(const Array& types, int num_inputs, const Attrs& at template bool Conv1DTransposeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr) return false; @@ -796,23 +796,23 @@ bool Conv1DTransposeRel(const Array& types, int num_inputs, const Attrs& a static const Layout kOIW("OIW"); const Conv1DTransposeAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCW); - CHECK(trans_in_layout.defined()) + ICHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCW." << " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIW); - CHECK(trans_kernel_layout.defined()) + ICHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIW." << " But got " << kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCW); - CHECK(trans_out_layout.defined()) + ICHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCW." 
<< " But got " << out_layout; @@ -822,8 +822,8 @@ bool Conv1DTransposeRel(const Array& types, int num_inputs, const Attrs& a // infer weight if the kernel_size and channels are defined if (param->kernel_size.defined() && param->channels.defined()) { - CHECK_EQ(param->kernel_size.size(), 1); - CHECK_EQ(param->dilation.size(), 1); + ICHECK_EQ(param->kernel_size.size(), 1); + ICHECK_EQ(param->dilation.size(), 1); Array wshape( {dshape_ncw[1], indexdiv(param->channels, param->groups), param->kernel_size[0]}); @@ -839,19 +839,19 @@ bool Conv1DTransposeRel(const Array& types, int num_inputs, const Attrs& a if (weight == nullptr) return false; auto wshape = trans_kernel_layout.ForwardShape(weight->shape); if (param->kernel_size.defined()) { - CHECK_EQ(param->kernel_size.size(), 1); + ICHECK_EQ(param->kernel_size.size(), 1); // check the size - CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2])) + ICHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2])) << "Conv1D: shape of weight is inconsistent with kernel_size, " << " kernel_size=" << param->kernel_size << " wshape=" << Array(wshape); } if (param->channels.defined()) { - CHECK(reporter->AssertEQ(param->channels, wshape[1])) + ICHECK(reporter->AssertEQ(param->channels, wshape[1])) << "Conv1D: shape of weight is inconsistent with channels, " << " channels=" << param->channels << " wshape=" << Array(wshape); } if (!dshape_ncw[1].as() && !wshape[0].as()) { - CHECK(reporter->AssertEQ(indexdiv(dshape_ncw[1], param->groups), wshape[0])); + ICHECK(reporter->AssertEQ(indexdiv(dshape_ncw[1], param->groups), wshape[0])); } channels = wshape[1]; dilated_ksize_x = 1 + (wshape[2] - 1) * param->dilation[0]; @@ -879,7 +879,7 @@ bool Conv1DTransposeRel(const Array& types, int num_inputs, const Attrs& a template bool Conv3DTransposeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr) return false; @@ -888,23 +888,23 @@ bool Conv3DTransposeRel(const Array& types, int num_inputs, const Attrs& a static const Layout kOIDHW("OIDHW"); const Conv3DTransposeAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCDHW); - CHECK(trans_in_layout.defined()) + ICHECK(trans_in_layout.defined()) << "Conv3d_transpose only support input layouts that are convertible from NCDHW." << " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIDHW); - CHECK(trans_kernel_layout.defined()) + ICHECK(trans_kernel_layout.defined()) << "Conv3d_transpose only support kernel layouts that are convertible from OIDHW." << " But got " << kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCDHW); - CHECK(trans_out_layout.defined()) + ICHECK(trans_out_layout.defined()) << "Conv3d_transpose only support output layouts that are convertible from NCDHW." 
<< " But got " << out_layout; @@ -914,8 +914,8 @@ bool Conv3DTransposeRel(const Array& types, int num_inputs, const Attrs& a // infer weight if the kernel_size and channels are defined if (param->kernel_size.defined() && param->channels.defined()) { - CHECK_EQ(param->kernel_size.size(), 3); - CHECK_EQ(param->dilation.size(), 3); + ICHECK_EQ(param->kernel_size.size(), 3); + ICHECK_EQ(param->dilation.size(), 3); Array wshape({dshape_ncdhw[1], indexdiv(param->channels, param->groups), param->kernel_size[0], param->kernel_size[1], param->kernel_size[2]}); @@ -933,21 +933,21 @@ bool Conv3DTransposeRel(const Array& types, int num_inputs, const Attrs& a if (weight == nullptr) return false; auto wshape = trans_kernel_layout.ForwardShape(weight->shape); if (param->kernel_size.defined()) { - CHECK_EQ(param->kernel_size.size(), 3); + ICHECK_EQ(param->kernel_size.size(), 3); // check the size - CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && - reporter->AssertEQ(param->kernel_size[1], wshape[3]) && - reporter->AssertEQ(param->kernel_size[2], wshape[4])) + ICHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && + reporter->AssertEQ(param->kernel_size[1], wshape[3]) && + reporter->AssertEQ(param->kernel_size[2], wshape[4])) << "Conv3D: shape of weight is inconsistent with kernel_size, " << " kernel_size=" << param->kernel_size << " wshape=" << Array(wshape); } if (param->channels.defined()) { - CHECK(reporter->AssertEQ(param->channels, wshape[1])) + ICHECK(reporter->AssertEQ(param->channels, wshape[1])) << "Conv3D: shape of weight is inconsistent with channels, " << " channels=" << param->channels << " wshape=" << Array(wshape); } if (!dshape_ncdhw[1].as() && !wshape[0].as()) { - CHECK(reporter->AssertEQ(indexdiv(dshape_ncdhw[1], param->groups), wshape[0])); + ICHECK(reporter->AssertEQ(indexdiv(dshape_ncdhw[1], param->groups), wshape[0])); } channels = wshape[1]; dilated_ksize_d = 1 + (wshape[2] - 1) * param->dilation[0]; @@ -991,7 +991,7 @@ bool Conv3DTransposeRel(const Array& types, int num_inputs, const Attrs& a template bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr) return false; @@ -1000,23 +1000,23 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a static const Layout kOIHW("OIHW"); const Conv2DTransposeAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(trans_in_layout.defined()) + ICHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCHW." << " But got " << in_layout; const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kOIHW); - CHECK(trans_kernel_layout.defined()) + ICHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIHW." << " But got " << kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNCHW); - CHECK(trans_out_layout.defined()) + ICHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCHW." 
<< " But got " << out_layout; @@ -1026,8 +1026,8 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a // infer weight if the kernel_size and channels are defined if (param->kernel_size.defined() && param->channels.defined()) { - CHECK_EQ(param->kernel_size.size(), 2); - CHECK_EQ(param->dilation.size(), 2); + ICHECK_EQ(param->kernel_size.size(), 2); + ICHECK_EQ(param->dilation.size(), 2); Array wshape({dshape_nchw[1], indexdiv(param->channels, param->groups), param->kernel_size[0], param->kernel_size[1]}); @@ -1044,20 +1044,20 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a if (weight == nullptr) return false; auto wshape = trans_kernel_layout.ForwardShape(weight->shape); if (param->kernel_size.defined()) { - CHECK_EQ(param->kernel_size.size(), 2); + ICHECK_EQ(param->kernel_size.size(), 2); // check the size - CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && - reporter->AssertEQ(param->kernel_size[1], wshape[3])) + ICHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && + reporter->AssertEQ(param->kernel_size[1], wshape[3])) << "Conv2D: shape of weight is inconsistent with kernel_size, " << " kernel_size=" << param->kernel_size << " wshape=" << Array(wshape); } if (param->channels.defined()) { - CHECK(reporter->AssertEQ(param->channels, wshape[1])) + ICHECK(reporter->AssertEQ(param->channels, wshape[1])) << "Conv2D: shape of weight is inconsistent with channels, " << " channels=" << param->channels << " wshape=" << Array(wshape); } if (!dshape_nchw[1].as() && !wshape[0].as()) { - CHECK(reporter->AssertEQ(indexdiv(dshape_nchw[1], param->groups), wshape[0])); + ICHECK(reporter->AssertEQ(indexdiv(dshape_nchw[1], param->groups), wshape[0])); } channels = wshape[1]; dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0]; @@ -1093,21 +1093,21 @@ bool Conv2DTransposeRel(const Array& types, int num_inputs, const Attrs& a template bool DeformableConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const auto* data = types[0].as(); const auto* weight = types[2].as(); - CHECK(data); + ICHECK(data); auto* param = attrs.as(); - CHECK_EQ(param->data_layout, "NCHW") << "data layout not supported."; - CHECK_EQ(param->kernel_layout, "OIHW") << "kernel_layout not supported."; + ICHECK_EQ(param->data_layout, "NCHW") << "data layout not supported."; + ICHECK_EQ(param->kernel_layout, "OIHW") << "kernel_layout not supported."; IndexExpr channels, dilated_ksize_y, dilated_ksize_x, ksize_y, ksize_x; // infer weight shape if kernel_size and channels are defiend if (param->kernel_size.defined() && param->channels.defined()) { - CHECK_EQ(param->kernel_size.size(), 2); - CHECK_EQ(param->dilation.size(), 2); + ICHECK_EQ(param->kernel_size.size(), 2); + ICHECK_EQ(param->dilation.size(), 2); Array wshape({param->channels, indexdiv(data->shape[1], param->groups), param->kernel_size[0], param->kernel_size[1]}); channels = param->channels; @@ -1122,20 +1122,20 @@ bool DeformableConv2DRel(const Array& types, int num_inputs, const Attrs& if (weight == nullptr) return false; auto wshape = weight->shape; if (param->kernel_size.defined()) { - CHECK_EQ(param->kernel_size.size(), 2); + ICHECK_EQ(param->kernel_size.size(), 2); // check the size - CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && - reporter->AssertEQ(param->kernel_size[1], wshape[3])) + ICHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && + 
reporter->AssertEQ(param->kernel_size[1], wshape[3])) << "DeformableConv2D: shape of weight is inconsistent with kernel_size, " << " kernel_size=" << param->kernel_size << " wshape=" << wshape; } if (param->channels.defined()) { - CHECK(reporter->AssertEQ(param->channels, wshape[0])) + ICHECK(reporter->AssertEQ(param->channels, wshape[0])) << "DeformableConv2D: shape of weight is inconsistent with channels, " << " channels=" << param->channels << " wshape=" << wshape; } if (!data->shape[1].as() && !wshape[1].as()) { - CHECK(reporter->AssertEQ(indexdiv(data->shape[1], param->groups), wshape[1])); + ICHECK(reporter->AssertEQ(indexdiv(data->shape[1], param->groups), wshape[1])); } channels = wshape[0]; ksize_y = wshape[2]; diff --git a/src/relay/op/nn/correlation.cc b/src/relay/op/nn/correlation.cc index 5970cc75b2a9..0c2f481e10cb 100644 --- a/src/relay/op/nn/correlation.cc +++ b/src/relay/op/nn/correlation.cc @@ -64,14 +64,14 @@ Expr MakeCorrelation(Expr data1, Expr data2, int kernel_size, int max_displaceme bool CorrelationRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data1 = types[0].as(); const auto* data2 = types[1].as(); if (data1 == nullptr || data2 == nullptr) return false; const CorrelationAttrs* param = attrs.as(); - CHECK(param != nullptr); - CHECK_EQ(param->layout, "NCHW") << "layout not supported."; + ICHECK(param != nullptr); + ICHECK_EQ(param->layout, "NCHW") << "layout not supported."; IndexExpr pad_h, pad_w; GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); IndexExpr padded_height = data1->shape[2] + pad_h; diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 58dfab27a933..ea25c1a9c0f9 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -50,17 +50,17 @@ TVM_REGISTER_NODE_TYPE(BiasAddAttrs); bool BiasAddRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; const BiasAddAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); int axis = param->axis; if (axis < 0) { axis = data->shape.size() + axis; } - CHECK_LE(axis, static_cast(data->shape.size())) + ICHECK_LE(axis, static_cast(data->shape.size())) << "axis " << param->axis << " is out of range"; // assign output type @@ -107,15 +107,15 @@ Expr MakeFIFOBuffer(Expr input, Expr buffer, int axis) { bool FIFOBufferRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* input = types[0].as(); const auto* buffer = types[1].as(); const FIFOBufferAttrs* param = attrs.as(); if (input == nullptr || buffer == nullptr) { return false; } - CHECK(param != nullptr); - CHECK_EQ(input->shape.size(), buffer->shape.size()); + ICHECK(param != nullptr); + ICHECK_EQ(input->shape.size(), buffer->shape.size()); const size_t buffer_axis = static_cast( param->axis < 0 ? 
static_cast(buffer->shape.size()) + param->axis : param->axis); @@ -221,14 +221,14 @@ TVM_REGISTER_NODE_TYPE(PReluAttrs); bool PReluRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; const PReluAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); - CHECK(param->axis < static_cast(data->shape.size())) + ICHECK(param->axis < static_cast(data->shape.size())) << "Wrong axis (" << param->axis << ")value."; // assign alpha type @@ -245,11 +245,11 @@ Array> PReluInferCorrectLayout(const Attrs& attrs, const Array& new_in_layouts, const Array& old_in_layouts, const Array& old_in_types) { - CHECK_EQ(old_in_layouts.size(), 2U); - CHECK_EQ(old_in_types.size(), 2U); + ICHECK_EQ(old_in_layouts.size(), 2U); + ICHECK_EQ(old_in_types.size(), 2U); Layout data_layout = old_in_layouts[0]; if (new_in_layouts.defined()) { - CHECK_EQ(new_in_layouts.size(), 2U); + ICHECK_EQ(new_in_layouts.size(), 2U); } return Array>{{data_layout, Layout("C")}, {data_layout}}; } @@ -335,8 +335,8 @@ RELAY_REGISTER_OP("nn.log_softmax") .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); - CHECK(param != nullptr); - CHECK(param->axis == -1 || param->axis == static_cast(inputs[0].ndim()) - 1) + ICHECK(param != nullptr); + ICHECK(param->axis == -1 || param->axis == static_cast(inputs[0].ndim()) - 1) << "log_softmax currently only works on last dimension"; return Array{topi::nn::log_softmax(inputs[0])}; }); @@ -344,7 +344,7 @@ RELAY_REGISTER_OP("nn.log_softmax") // relay.nn.batch_flatten bool BatchFlattenRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; if (data->shape.size() == 0) return false; @@ -499,7 +499,7 @@ TVM_REGISTER_NODE_TYPE(DropoutAttrs); bool DropoutRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; @@ -544,7 +544,7 @@ Array> BatchNormInferCorrectLayout(const Attrs& attrs, Array> old_in_shapes; for (auto old_in_t : old_in_types) { - CHECK(old_in_t.as()); + ICHECK(old_in_t.as()); old_in_shapes.push_back(old_in_t.as()->shape); } @@ -572,14 +572,14 @@ Array> BatchNormInferCorrectLayout(const Attrs& attrs, bool BatchNormRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 6); + ICHECK_EQ(types.size(), 6); const auto* data = types[0].as(); if (data == nullptr) return false; const BatchNormAttrs* param = attrs.as(); // axis of -1 means use the last dimension - CHECK(param->axis >= -1 && param->axis < (int)data->shape.size()); + ICHECK(param->axis >= -1 && param->axis < (int)data->shape.size()); int axis = (param->axis != -1) ? 
param->axis : data->shape.size() - 1; auto axis_size = data->shape[axis]; @@ -666,12 +666,12 @@ TVM_REGISTER_NODE_TYPE(InstanceNormAttrs); bool InstanceNormRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const auto* data = types[0].as(); if (data == nullptr) return false; const InstanceNormAttrs* param = attrs.as(); int axis = param->axis >= 0 ? param->axis : param->axis + data->shape.size(); - CHECK(axis >= 0 && axis < (int)data->shape.size()); + ICHECK(axis >= 0 && axis < (int)data->shape.size()); reporter->Assign(types[1], TensorType({data->shape[axis]}, data->dtype)); reporter->Assign(types[2], TensorType({data->shape[axis]}, data->dtype)); reporter->Assign(types[3], TensorType(data->shape, data->dtype)); @@ -733,12 +733,12 @@ TVM_REGISTER_NODE_TYPE(LayerNormAttrs); bool LayerNormRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const auto* data = types[0].as(); if (data == nullptr) return false; const LayerNormAttrs* param = attrs.as(); int axis = param->axis >= 0 ? param->axis : param->axis + data->shape.size(); - CHECK(axis >= 0 && axis < (int)data->shape.size()); + ICHECK(axis >= 0 && axis < (int)data->shape.size()); reporter->Assign(types[1], TensorType({data->shape[axis]}, data->dtype)); reporter->Assign(types[2], TensorType({data->shape[axis]}, data->dtype)); reporter->Assign(types[3], TensorType(data->shape, data->dtype)); @@ -778,12 +778,12 @@ TVM_REGISTER_NODE_TYPE(GroupNormAttrs); bool GroupNormRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const auto* data = types[0].as(); if (data == nullptr) return false; const GroupNormAttrs* param = attrs.as(); int axis = param->axis >= 0 ? param->axis : param->axis + data->shape.size(); - CHECK(axis >= 0 && axis < (int)data->shape.size()); + ICHECK(axis >= 0 && axis < (int)data->shape.size()); reporter->Assign(types[1], TensorType({data->shape[axis]}, data->dtype)); reporter->Assign(types[2], TensorType({data->shape[axis]}, data->dtype)); reporter->Assign(types[3], TensorType(data->shape, data->dtype)); @@ -847,11 +847,11 @@ If the input has size k on axis 1, then both gamma and beta have shape (k,). 
// relay.nn.batch_matmul bool BatchMatmulRel(const Array<Type>& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* x = types[0].as<TensorTypeNode>(); const auto* y = types[1].as<TensorTypeNode>(); if (x == nullptr || y == nullptr) return false; - CHECK(x->shape.size() == 3 && y->shape.size() == 3); + ICHECK(x->shape.size() == 3 && y->shape.size() == 3); bool is_dyn = false; Array<IndexExpr> oshape; for (size_t i = 0; i < 3; ++i) { @@ -867,11 +867,11 @@ bool BatchMatmulRel(const Array<Type>& types, int num_inputs, const Attrs& attrs } } if (!is_dyn) { - CHECK(reporter->AssertEQ(x->shape[0], y->shape[0]) || reporter->AssertEQ(x->shape[0], 1) || - reporter->AssertEQ(y->shape[0], 1)) + ICHECK(reporter->AssertEQ(x->shape[0], y->shape[0]) || reporter->AssertEQ(x->shape[0], 1) || + reporter->AssertEQ(y->shape[0], 1)) << "BatchDot: batch dimensions don't match, " << " x shape=" << x->shape << ", y shape=" << y->shape; - CHECK(reporter->AssertEQ(x->shape[2], y->shape[2])) + ICHECK(reporter->AssertEQ(x->shape[2], y->shape[2])) << "BatchDot: shapes of x and y is inconsistent, " << " x shape=" << x->shape << ", y shape=" << y->shape; @@ -913,19 +913,19 @@ are data in batch. // relay.nn.cross_entropy bool CrossEntropyRel(const Array<Type>& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* x = types[0].as<TensorTypeNode>(); const auto* y = types[1].as<TensorTypeNode>(); if (x == nullptr || y == nullptr) return false; - CHECK(x->shape.size() == 2 && y->shape.size() == 2) + ICHECK(x->shape.size() == 2 && y->shape.size() == 2) << "CrossEntropy: shapes of x and y is inconsistent, " << "x shape = " << x->shape << ", " << "y shape = " << y->shape; - CHECK(reporter->AssertEQ(x->shape[0], y->shape[0])) + ICHECK(reporter->AssertEQ(x->shape[0], y->shape[0])) << "CrossEntropy: shapes of x and y is inconsistent, " << "x shape = " << x->shape << ", " << "y shape = " << y->shape; - CHECK(reporter->AssertEQ(x->shape[1], y->shape[1])) + ICHECK(reporter->AssertEQ(x->shape[1], y->shape[1])) << "CrossEntropy: shapes of x and y is inconsistent, " << "x shape = " << x->shape << ", " << "y shape = " << y->shape; @@ -958,11 +958,11 @@ TVM_REGISTER_NODE_TYPE(DilateAttrs); bool DilateRel(const Array<Type>& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* x = types[0].as<TensorTypeNode>(); const DilateAttrs* param = attrs.as<DilateAttrs>(); if (x == nullptr) return false; - CHECK_EQ(x->shape.size(), param->strides.size()); + ICHECK_EQ(x->shape.size(), param->strides.size()); std::vector<IndexExpr> oshape; for (size_t i = 0; i < param->strides.size(); ++i) { @@ -1022,18 +1022,18 @@ TVM_REGISTER_NODE_TYPE(SubPixelAttrs); bool DepthToSpaceRel(const Array<Type>& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as<TensorTypeNode>(); if (data == nullptr) return false; static const Layout kNCHW("NCHW"); const SubPixelAttrs* param = attrs.as<SubPixelAttrs>(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const int block_size = param->block_size; const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(layout_converter.defined()) + ICHECK(layout_converter.defined()) << "DepthToSpace only support input layouts that are convertible from NCHW." 
<< " But got " << in_layout; @@ -1085,18 +1085,18 @@ RELAY_REGISTER_OP("nn.depth_to_space") bool SpaceToDepthRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNCHW("NCHW"); const SubPixelAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const int block_size = param->block_size; const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(layout_converter.defined()) + ICHECK(layout_converter.defined()) << "SpaceToDepth only support input layouts that are convertible from NCHW." << " But got " << in_layout; diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index e7f5a4b9d618..30ef3079e565 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -37,15 +37,15 @@ namespace relay { template bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr) return false; const AttrType* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); - CHECK(static_cast(data->shape.size()) != 0); + ICHECK(static_cast(data->shape.size()) != 0); Array oshape = data->shape; if (param->units.defined()) { @@ -62,9 +62,9 @@ bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, } else { if (weight == nullptr) return false; Array wshape = weight->shape; - CHECK(static_cast(weight->shape.size()) == 2); + ICHECK(static_cast(weight->shape.size()) == 2); if (!data->shape.back().as()) { - CHECK(reporter->AssertEQ(data->shape[data->shape.size() - 1], weight->shape[1])) + ICHECK(reporter->AssertEQ(data->shape[data->shape.size() - 1], weight->shape[1])) << "DenseRel: input dimension doesn't match," << " data shape=" << data->shape << ", weight shape=" << weight->shape; } diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc index 45447e155135..5b9988b101eb 100644 --- a/src/relay/op/nn/pad.cc +++ b/src/relay/op/nn/pad.cc @@ -55,8 +55,8 @@ Array> PadInferCorrectLayout(const Attrs& attrs, const Array> axis_pad_width; int index_counter = 0; - CHECK_EQ(new_in_layouts.size(), 1); - CHECK_EQ(old_in_layouts.size(), 1); + ICHECK_EQ(new_in_layouts.size(), 1); + ICHECK_EQ(old_in_layouts.size(), 1); for (auto iter_var : old_in_layouts[0]->axes) { const auto& old_layout_axis = LayoutAxis::Get(iter_var); axis_pad_width.emplace(old_layout_axis.name(), params->pad_width[index_counter]); @@ -75,7 +75,7 @@ Array> PadInferCorrectLayout(const Attrs& attrs, const Array> PadInferCorrectLayout(const Attrs& attrs, const Array> PadInferCorrectLayout(const Attrs& attrs, const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; const PadAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); // check that pad widths match lengths - CHECK(data->shape.size() == param->pad_width.size()) + ICHECK(data->shape.size() == param->pad_width.size()) << "There should be as many pad width pairs as shape dimensions " << "but the shape has " << data->shape.size() << " dimensions " << "and there are " << param->pad_width.size() << " pad width pairs."; @@ -130,19 +130,19 @@ bool 
PadRel(const Array& types, int num_inputs, const Attrs& attrs, // each pad width element should be a pair of positive integers std::vector oshape; for (size_t i = 0; i < param->pad_width.size(); i++) { - CHECK(param->pad_width[i].size() == 2) + ICHECK(param->pad_width[i].size() == 2) << "Each pad width element should be a pair but at index " << i << " there are " << param->pad_width[i].size() << " elements."; auto width1 = tir::as_const_int(param->pad_width[i][0]); auto width2 = tir::as_const_int(param->pad_width[i][1]); - CHECK(width1 != nullptr); - CHECK(width2 != nullptr); + ICHECK(width1 != nullptr); + ICHECK(width2 != nullptr); - CHECK(*width1 >= 0) << "Param width elements should be positive but first pad width at " - << "index " << i << " is " << *width1 << "."; - CHECK(*width2 >= 0) << "Param width elements should be positive but first pad width at " - << "index " << i << " is " << *width2 << "."; + ICHECK(*width1 >= 0) << "Param width elements should be positive but first pad width at " + << "index " << i << " is " << *width1 << "."; + ICHECK(*width2 >= 0) << "Param width elements should be positive but first pad width at " + << "index " << i << " is " << *width2 << "."; if (!data->shape[i].as()) { auto padding = tir::make_const(data->shape[i].dtype(), *width1 + *width2); @@ -159,10 +159,10 @@ bool PadRel(const Array& types, int num_inputs, const Attrs& attrs, Array PadCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); auto pad_width = param->pad_width; - CHECK(pad_width.size() == inputs[0].ndim() && pad_width[0].size() == 2) << "Illegal pad_width"; + ICHECK(pad_width.size() == inputs[0].ndim() && pad_width[0].size() == 2) << "Illegal pad_width"; Array pad_before; for (size_t i = 0; i < pad_width.size(); ++i) { pad_before.push_back(pad_width[i][0]); @@ -207,15 +207,15 @@ TVM_REGISTER_NODE_TYPE(MirrorPadAttrs); bool MirrorPadRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; const MirrorPadAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); // check that pad widths match lengths - CHECK(data->shape.size() == param->pad_width.size()) + ICHECK(data->shape.size() == param->pad_width.size()) << "There should be as many pad width pairs as shape dimensions " << "but the shape has " << data->shape.size() << " dimensions " << "and there are " << param->pad_width.size() << " pad width pairs."; @@ -223,19 +223,19 @@ bool MirrorPadRel(const Array& types, int num_inputs, const Attrs& attrs, // each pad width element should be a pair of positive integers std::vector oshape; for (size_t i = 0; i < param->pad_width.size(); i++) { - CHECK(param->pad_width[i].size() == 2) + ICHECK(param->pad_width[i].size() == 2) << "Each pad width element should be a pair but at index " << i << " there are " << param->pad_width[i].size() << " elements."; auto width1 = tir::as_const_int(param->pad_width[i][0]); auto width2 = tir::as_const_int(param->pad_width[i][1]); - CHECK(width1 != nullptr); - CHECK(width2 != nullptr); + ICHECK(width1 != nullptr); + ICHECK(width2 != nullptr); - CHECK(*width1 >= 0) << "Param width elements should be positive but first pad width at " - << "index " << i << " is " << *width1 << "."; - CHECK(*width2 >= 0) << "Param width elements should be positive but first pad width at " - << 
"index " << i << " is " << *width2 << "."; + ICHECK(*width1 >= 0) << "Param width elements should be positive but first pad width at " + << "index " << i << " is " << *width1 << "."; + ICHECK(*width2 >= 0) << "Param width elements should be positive but first pad width at " + << "index " << i << " is " << *width2 << "."; auto padding = tir::make_const(data->shape[i].dtype(), *width1 + *width2); oshape.push_back(data->shape[i] + padding); diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index cee7b6456ce6..4fb1745d65aa 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -50,7 +50,7 @@ Array > PoolInferCorrectLayout(const Attrs& attrs, if (new_in_layouts.defined()) { // Set the pool with the new layout. - CHECK_EQ(new_in_layouts.size(), 1); + ICHECK_EQ(new_in_layouts.size(), 1); params->layout = new_in_layouts[0].name(); } @@ -61,20 +61,20 @@ Array > PoolInferCorrectLayout(const Attrs& attrs, template bool Pool2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; const auto dshape = data->shape; - CHECK_GE(dshape.size(), 2U) + ICHECK_GE(dshape.size(), 2U) << "Pool2D only support input >= 2-D: input must have height and width"; const auto param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && - !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + ICHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". Pool2D layout must have H and W, which cannot be split"; const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); @@ -131,21 +131,21 @@ Array Pool2DCompute(const Attrs& attrs, const Array& inp const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); auto pool_size = param->pool_size; auto strides = param->strides; auto padding = param->padding; auto ceil_mode = param->ceil_mode; Layout layout(param->layout); - CHECK(tir::BijectiveLayout(layout, kNCHW).defined()) + ICHECK(tir::BijectiveLayout(layout, kNCHW).defined()) << "max_pool2d currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) << "max_pool2d does not support input split on height"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) << "max_pool2d does not support input split on width"; - CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U || inputs[0].ndim() == 6U) + ICHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U || inputs[0].ndim() == 6U) << "Pool2D only support 4-D input (e.g., NCHW)" << " or 5-D input (e.g. NCHWc on for vector instructions)" << " or 6-D input (e.g. 
NCHWnc for tensor accelerators)"; @@ -248,20 +248,20 @@ TVM_REGISTER_NODE_TYPE(GlobalPool2DAttrs); bool GlobalPool2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { return false; } const auto dshape = data->shape; - CHECK_GE(dshape.size(), 2U) + ICHECK_GE(dshape.size(), 2U) << "Pool2D only support input >= 2-D: input must have height and width"; const auto param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && - !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + ICHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". Pool2D layout must have H and W, which cannot be split"; const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); @@ -280,16 +280,16 @@ Array GlobalPool2DCompute(const Attrs& attrs, const Array(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout layout(param->layout); - CHECK(tir::BijectiveLayout(layout, kNCHW).defined()) + ICHECK(tir::BijectiveLayout(layout, kNCHW).defined()) << "global_avg_pool2d currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) << "global_avg_pool2d does not support input split on height"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) << "global_avg_pool2d does not support input split on width"; - CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) + ICHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) << "Pool2D only support 4-D input (e.g., NCHW)" << " or 5-D input (last dimension is a split of channel)"; return Array{topi::nn::global_pool(inputs[0], mode, layout.name())}; @@ -354,27 +354,27 @@ TVM_REGISTER_NODE_TYPE(AdaptivePool2DAttrs); bool AdaptivePool2DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { return false; } const auto dshape = data->shape; - CHECK_GE(dshape.size(), 2U) + ICHECK_GE(dshape.size(), 2U) << "Pool2D only support input >= 2-D: input must have height and width"; const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && - !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + ICHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". 
Pool2D layout must have H and W, which cannot be split"; const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); const auto widx = layout.IndexOf(LayoutAxis::Get('W')); Array oshape(dshape); auto output_size = param->output_size; - CHECK_LE(output_size.size(), 2U) << "output_size can have up to 2 elements."; + ICHECK_LE(output_size.size(), 2U) << "output_size can have up to 2 elements."; IndexExpr output_height, output_width; if (output_size.empty()) { output_height = dshape[hidx]; @@ -400,16 +400,16 @@ Array AdaptivePool2DCompute(const Attrs& attrs, const Array(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout layout(param->layout); - CHECK(tir::BijectiveLayout(layout, kNCHW).defined()) + ICHECK(tir::BijectiveLayout(layout, kNCHW).defined()) << "Adaptive pool2d currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) << "Adaptive pool2d does not support input split on height"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) << "Adaptive pool2d does not support input split on width"; - CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) + ICHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) << "Pool2D only support 4-D input (e.g., NCHW)" << " or 5-D input (last dimension is a split of channel)"; @@ -505,21 +505,21 @@ TVM_REGISTER_NODE_TYPE(AdaptivePool3DAttrs); bool AdaptivePool3DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { return false; } const auto dshape = data->shape; - CHECK_GE(dshape.size(), 3U) + ICHECK_GE(dshape.size(), 3U) << "Pool3D only support input >= 3-D: input must have depth, height and width"; const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains(LayoutAxis::Get('D')) && layout.Contains(LayoutAxis::Get('H')) && - layout.Contains(LayoutAxis::Get('W')) && !layout.Contains(LayoutAxis::Get('d')) && - !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + ICHECK(layout.Contains(LayoutAxis::Get('D')) && layout.Contains(LayoutAxis::Get('H')) && + layout.Contains(LayoutAxis::Get('W')) && !layout.Contains(LayoutAxis::Get('d')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". 
Pool3D layout must have D, H and W, which cannot be split"; @@ -528,7 +528,7 @@ bool AdaptivePool3DRel(const Array& types, int num_inputs, const Attrs& at const auto widx = layout.IndexOf(LayoutAxis::Get('W')); Array oshape(dshape); auto output_size = param->output_size; - CHECK_LE(output_size.size(), 3U) << "output_size can have up to 3 elements."; + ICHECK_LE(output_size.size(), 3U) << "output_size can have up to 3 elements."; IndexExpr output_depth, output_height, output_width; if (output_size.empty()) { output_depth = dshape[didx]; @@ -558,18 +558,18 @@ Array AdaptivePool3DCompute(const Attrs& attrs, const Array(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout layout(param->layout); - CHECK(tir::BijectiveLayout(layout, kNCDHW).defined()) + ICHECK(tir::BijectiveLayout(layout, kNCDHW).defined()) << "Adaptive pool3d currently only supports layouts that are convertible from NCDHW"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('d')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('d')), -1) << "Adaptive pool3d does not support input split on depth"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) << "Adaptive pool3d does not support input split on height"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) << "Adaptive pool3d does not support input split on width"; - CHECK(inputs[0].ndim() == 5U || inputs[0].ndim() == 6U) + ICHECK(inputs[0].ndim() == 5U || inputs[0].ndim() == 6U) << "Pool3D only support 5-D input (e.g., NCDHW)" << " or 6-D input (last dimension is a split of channel)"; @@ -666,7 +666,7 @@ RELAY_REGISTER_OP("nn.adaptive_avg_pool3d") bool Pool2DGradRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[1].as(); if (data == nullptr) return false; @@ -681,26 +681,26 @@ Array Pool2DGradCompute(const Attrs& attrs, const Array& const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); - CHECK(param != nullptr); - CHECK_EQ(inputs.size(), 2); + ICHECK(param != nullptr); + ICHECK_EQ(inputs.size(), 2); auto pool_size = param->pool_size; auto strides = param->strides; auto padding = param->padding; auto ceil_mode = param->ceil_mode; Layout layout(param->layout); - CHECK(tir::BijectiveLayout(layout, kNCHW).defined()) + ICHECK(tir::BijectiveLayout(layout, kNCHW).defined()) << "pool2d_grad currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) << "pool2d_grad does not support input split on height"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) << "pool2d_grad does not support input split on width"; - CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) + ICHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) << "Pool2DGrad only support 4-D output gradient (e.g., NCHW)" << " or 5-D output gradient (last dimension is a split of channel)"; - CHECK(inputs[1].ndim() == 4U || inputs[1].ndim() == 5U) + ICHECK(inputs[1].ndim() == 4U || inputs[1].ndim() == 5U) << "Pool2DGrad only support 4-D input (e.g., NCHW)" << " or 5-D input (last dimension is a split of channel)"; @@ -823,18 +823,18 @@ TVM_REGISTER_NODE_TYPE(AvgPool1DAttrs); template bool Pool1DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { 
- CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; const auto dshape = data->shape; - CHECK_GE(dshape.size(), 1U) << "Pool1D only support input >= 1-D: input must have width"; + ICHECK_GE(dshape.size(), 1U) << "Pool1D only support input >= 1-D: input must have width"; const auto param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains(LayoutAxis::Get('W')) && !layout.Contains(LayoutAxis::Get('w'))) + ICHECK(layout.Contains(LayoutAxis::Get('W')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". Pool1D layout must have W, which cannot be split"; const auto widx = layout.IndexOf(LayoutAxis::Get('W')); @@ -873,19 +873,19 @@ Array Pool1DCompute(const Attrs& attrs, const Array& inp const Type& out_type) { static const Layout kNCW("NCW"); const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); auto pool_size = param->pool_size; auto strides = param->strides; auto padding = param->padding; auto ceil_mode = param->ceil_mode; Layout layout(param->layout); - CHECK(tir::BijectiveLayout(layout, kNCW).defined()) + ICHECK(tir::BijectiveLayout(layout, kNCW).defined()) << "max_pool1d currently only supports layouts that are convertible from NCW"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) << "max_pool1d does not support input split on width"; - CHECK(inputs[0].ndim() == 3U || inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) + ICHECK(inputs[0].ndim() == 3U || inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) << "Pool1D only support 3-D input (e.g., NCW)" << " or 4-D input (e.g. NCWc on for vector instructions)" << " or 5-D input (e.g. NCWnc for tensor accelerators)"; @@ -982,21 +982,21 @@ TVM_REGISTER_NODE_TYPE(AvgPool3DAttrs); template bool Pool3DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; const auto dshape = data->shape; - CHECK_GE(dshape.size(), 3U) + ICHECK_GE(dshape.size(), 3U) << "Pool3D only support input >= 3-D: input must have depth, height and width"; const auto param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains(LayoutAxis::Get('D')) && layout.Contains(LayoutAxis::Get('H')) && - layout.Contains(LayoutAxis::Get('W')) && !layout.Contains(LayoutAxis::Get('d')) && - !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) + ICHECK(layout.Contains(LayoutAxis::Get('D')) && layout.Contains(LayoutAxis::Get('H')) && + layout.Contains(LayoutAxis::Get('W')) && !layout.Contains(LayoutAxis::Get('d')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". 
Pool3D layout must have D, H and W, which cannot be split"; @@ -1051,23 +1051,23 @@ Array Pool3DCompute(const Attrs& attrs, const Array& inp const Type& out_type) { static const Layout kNCDHW("NCDHW"); const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); auto pool_size = param->pool_size; auto strides = param->strides; auto padding = param->padding; auto ceil_mode = param->ceil_mode; Layout layout(param->layout); - CHECK(tir::BijectiveLayout(layout, kNCDHW).defined()) + ICHECK(tir::BijectiveLayout(layout, kNCDHW).defined()) << "max_pool3d currently only supports layouts that are convertible from NCDHW"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('d')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('d')), -1) << "max_pool3d does not support input split on depth"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) << "max_pool3d does not support input split on height"; - CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) << "max_pool3d does not support input split on width"; - CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U || inputs[0].ndim() == 6U) + ICHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U || inputs[0].ndim() == 6U) << "Pool3D only support 5-D input (e.g., NCDHW)" << " or 6-D input (e.g. NCDHWc on for vector instructions)" << " or 7-D input (e.g. NCDHWnc for tensor accelerators)"; diff --git a/src/relay/op/nn/sparse.cc b/src/relay/op/nn/sparse.cc index 3f51e1f8ab37..09dca09a82de 100644 --- a/src/relay/op/nn/sparse.cc +++ b/src/relay/op/nn/sparse.cc @@ -38,10 +38,10 @@ TVM_REGISTER_NODE_TYPE(SparseDenseAttrs); bool SparseDenseRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 5); + ICHECK_EQ(types.size(), 5); const auto* data = types[0].as(); const auto* weight_data = types[1].as(); - CHECK(weight_data->shape.size() == 1 || weight_data->shape.size() == 3); + ICHECK(weight_data->shape.size() == 1 || weight_data->shape.size() == 3); const auto* weight_indptr = types[3].as(); if (data == nullptr) return false; @@ -131,11 +131,11 @@ TVM_REGISTER_NODE_TYPE(SparseTransposeAttrs); bool SparseTransposeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const auto* sparse_data = types[0].as(); - CHECK_EQ(sparse_data->shape.size(), 1); + ICHECK_EQ(sparse_data->shape.size(), 1); const auto* sparse_indices = types[1].as(); - CHECK_EQ(sparse_indices->shape.size(), 1); + ICHECK_EQ(sparse_indices->shape.size(), 1); const auto* sparse_indptr = types[2].as(); std::vector output_types; diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc index bdf3090cefad..3b0139b16b1b 100644 --- a/src/relay/op/nn/upsampling.cc +++ b/src/relay/op/nn/upsampling.cc @@ -42,18 +42,18 @@ TVM_REGISTER_NODE_TYPE(UpSampling3DAttrs); bool UpSamplingRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNCHW("NCHW"); const UpSamplingAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCHW); - CHECK(layout_converter.defined()) + ICHECK(layout_converter.defined()) << "UpSampling 
only support input layouts that are convertible from NCHW." << " But got " << in_layout; @@ -110,18 +110,18 @@ RELAY_REGISTER_OP("nn.upsampling") // UpSampling3D bool UpSampling3DRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; static const Layout kNCDHW("NCDHW"); const UpSampling3DAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const Layout in_layout(param->layout); auto layout_converter = tir::BijectiveLayout(in_layout, kNCDHW); - CHECK(layout_converter.defined()) + ICHECK(layout_converter.defined()) << "UpSampling3D only support input layouts that are convertible from NCDHW." << " But got " << in_layout; diff --git a/src/relay/op/nn/upsampling.h b/src/relay/op/nn/upsampling.h index e4e3bc9b1929..4cd292e78cb6 100644 --- a/src/relay/op/nn/upsampling.h +++ b/src/relay/op/nn/upsampling.h @@ -43,7 +43,7 @@ Array > UpsamplingInferCorrectLayout(const Attrs& attrs, T* params = const_cast(attrs.as()); if (new_in_layouts.defined()) { - CHECK_EQ(new_in_layouts.size(), 1); + ICHECK_EQ(new_in_layouts.size(), 1); Layout raw_layout(params->layout); Layout input = new_in_layouts[0]; diff --git a/src/relay/op/op_common.h b/src/relay/op/op_common.h index d530345fc9e8..6c2c6b2cce69 100644 --- a/src/relay/op/op_common.h +++ b/src/relay/op/op_common.h @@ -151,7 +151,7 @@ inline void GetPaddingWidth(const Array& padding, IndexExpr* pad_w) { } else if (padding.size() == 2) { *pad_w = padding[0] + padding[1]; } else { - CHECK_EQ(padding.size(), 4) << " Expected padding size of 1 or 2, found " << padding.size(); + ICHECK_EQ(padding.size(), 4) << " Expected padding size of 1 or 2, found " << padding.size(); } } @@ -168,7 +168,7 @@ inline void GetPaddingHeightWidth(const Array& padding, IndexExpr* pa *pad_h = padding[0] + padding[2]; *pad_w = padding[1] + padding[3]; } else { - CHECK_EQ(padding.size(), 4) << " Padding size should be 1, 2 or 4, but got " << padding.size(); + ICHECK_EQ(padding.size(), 4) << " Padding size should be 1, 2 or 4, but got " << padding.size(); } } @@ -188,7 +188,7 @@ inline void GetPaddingDepthHeightWidth(const Array& padding, IndexExp *pad_h = padding[1] + padding[4]; *pad_w = padding[2] + padding[5]; } else { - CHECK_EQ(padding.size(), 6) << " Padding size should be 1, 3 or 6, but got " << padding.size(); + ICHECK_EQ(padding.size(), 6) << " Padding size should be 1, 3 or 6, but got " << padding.size(); } } diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc index df128ff05338..aafd4492fec4 100644 --- a/src/relay/op/tensor/binary.cc +++ b/src/relay/op/tensor/binary.cc @@ -34,7 +34,7 @@ namespace relay { #define RELAY_BINARY_COMPUTE(FTOPI) \ [](const Attrs& attrs, const Array& inputs, \ const Type& out_type) -> Array { \ - CHECK_EQ(inputs.size(), 2U); \ + ICHECK_EQ(inputs.size(), 2U); \ return {FTOPI(inputs[0], inputs[1])}; \ } diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc index 16f5f0116b60..afe45571f558 100644 --- a/src/relay/op/tensor/reduce.cc +++ b/src/relay/op/tensor/reduce.cc @@ -63,12 +63,12 @@ inline std::vector GetReduceAxes(const uint32_t indim, const Array= 0) << "Axis out of bounds in reduce operator."; - CHECK(axis < indim) << "Axis out of bounds in reduce operator."; + ICHECK(axis >= 0) << "Axis out of bounds in reduce operator."; + ICHECK(axis < indim) << "Axis out of bounds in reduce operator."; 
in_axes.push_back(axis); } - CHECK(in_axes[in_axes.size() - 1] < indim) + ICHECK(in_axes[in_axes.size() - 1] < indim) << "Reduction axis " << in_axes[in_axes.size() - 1] << " exceeds input dimensions " << indim; std::sort(in_axes.begin(), in_axes.end()); @@ -91,7 +91,7 @@ inline std::vector GetReduceAxes(const uint32_t indim, const Array GetExcludeAxes(size_t indim, const Array& inaxis) { - CHECK(inaxis.defined()) << "Cannot set exclude when axis=None"; + ICHECK(inaxis.defined()) << "Cannot set exclude when axis=None"; std::vector axis_flag(indim, true); for (auto i : inaxis) { int64_t axis = i->value; @@ -99,8 +99,8 @@ Array GetExcludeAxes(size_t indim, const Array& inaxis) { axis = axis + static_cast(indim); } // Check out of bounds error - CHECK_GE(axis, 0) << "Axis out of bounds in reduce operator."; - CHECK_LT(axis, static_cast(indim)) << "Axis out of bounds in reduce operator."; + ICHECK_GE(axis, 0) << "Axis out of bounds in reduce operator."; + ICHECK_LT(axis, static_cast(indim)) << "Axis out of bounds in reduce operator."; axis_flag[axis] = false; } @@ -125,7 +125,7 @@ Array> ReduceInferCorrectLayout(const Attrs& attrs, // Get the reduce axes. Array> old_in_shapes; for (auto old_in_t : old_in_types) { - CHECK(old_in_t.as()); + ICHECK(old_in_t.as()); old_in_shapes.push_back(old_in_t.as()->shape); } uint32_t indim = old_in_shapes[0].size(); @@ -135,8 +135,8 @@ Array> ReduceInferCorrectLayout(const Attrs& attrs, if (new_in_layouts.defined() && r_axes.size()) { // Adapt to new layout. The axis has to change. Record original reduce axes. Convert to the // modified layout axes. - CHECK_EQ(new_in_layouts.size(), 1); - CHECK_EQ(old_in_layouts.size(), 1); + ICHECK_EQ(new_in_layouts.size(), 1); + ICHECK_EQ(old_in_layouts.size(), 1); // 1) Collect the original axes std::unordered_set old_r_dims; @@ -166,7 +166,7 @@ Array> ReduceInferCorrectLayout(const Attrs& attrs, params->axis = new_r_axes; } else if (old_in_layouts.defined()) { // If the new layout is undefined, set the old layout as the inferred layout. 
- CHECK_EQ(old_in_layouts.size(), 1); + ICHECK_EQ(old_in_layouts.size(), 1); ret = old_in_layouts[0]; } @@ -177,7 +177,7 @@ template Array ReduceCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, F f) { const ReduceAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); if (inputs[0]->shape.size() == 0) { return {topi::identity(inputs[0])}; } @@ -221,8 +221,8 @@ inline std::vector ReduceShapeImpl(const std::vector& in_s } if (is_dynamic_input) { - CHECK(reporter->Assert(max_shape < - tir::make_const(DataType::Int(64), std::numeric_limits::max()))) + ICHECK(reporter->Assert( + max_shape < tir::make_const(DataType::Int(64), std::numeric_limits::max()))) << "The maximum possible index of reduced shape cannot be more than int32 max."; } @@ -259,14 +259,14 @@ inline std::vector ReduceShapeImpl(const std::vector& in_s */ bool ArgReduceRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; - CHECK(static_cast(data->shape.size()) != 0); + ICHECK(static_cast(data->shape.size()) != 0); std::vector in_shape(data->shape.begin(), data->shape.end()); const ReduceAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); // assign output type and shape auto oshape = ReduceShapeImpl(in_shape, param, reporter); @@ -283,13 +283,13 @@ bool ArgReduceRel(const Array& types, int num_inputs, const Attrs& attrs, */ bool ReduceRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; std::vector in_shape(data->shape.begin(), data->shape.end()); const ReduceAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); // assign output type and shape auto oshape = ReduceShapeImpl(in_shape, param, reporter); @@ -501,7 +501,7 @@ Array MeanCompute(const Attrs& attrs, const Array& input const Type& out_type) { IndexExpr count = tir::make_const(inputs[0]->dtype, 1); const ReduceAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); auto axes = param->axis; for (int64_t i : GetReduceAxes(inputs[0]->shape.size(), param->axis, param->exclude)) { count *= inputs[0]->shape[i]; @@ -537,19 +537,19 @@ Example:: bool VarianceRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; - CHECK(static_cast(data->shape.size()) != 0); + ICHECK(static_cast(data->shape.size()) != 0); const auto* mean = types[1].as(); if (mean == nullptr) return false; std::vector in_shape(data->shape.begin(), data->shape.end()); std::vector mean_shape(mean->shape.begin(), mean->shape.end()); - CHECK_EQ(in_shape.size(), mean_shape.size()); + ICHECK_EQ(in_shape.size(), mean_shape.size()); const VarianceAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); // assign output type and shape auto oshape = ReduceShapeImpl(in_shape, param, reporter); @@ -561,7 +561,7 @@ Array VarianceCompute(const Attrs& attrs, const Array& i const Type& out_type) { IndexExpr count = tir::make_const(inputs[0]->dtype, 1); const VarianceAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); auto axes = param->axis; bool unbiased = 
param->unbiased; auto data = inputs[0]; @@ -576,7 +576,7 @@ Array VarianceCompute(const Attrs& attrs, const Array& i auto sq_diff = topi::power(topi::subtract(data, mean), 2); if (param->exclude) { axes = GetExcludeAxes(sq_diff->shape.size(), param->axis); - CHECK_NE(axes.size(), 0); + ICHECK_NE(axes.size(), 0); } auto var = topi::divide(topi::sum(sq_diff, axes, param->keepdims, false), count); diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 20cd0a12ed98..4a832ec8d962 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -55,10 +55,10 @@ TVM_REGISTER_NODE_TYPE(CastAttrs); bool CastRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "cast: expect input type to be TensorType but get " << types[0]; return false; } @@ -70,7 +70,7 @@ bool CastRel(const Array& types, int num_inputs, const Attrs& attrs, Array CastCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const CastAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); DataType dtype = param->dtype; return {topi::cast(inputs[0], dtype)}; } @@ -100,16 +100,16 @@ RELAY_REGISTER_OP("cast") // relay.cast_like bool CastLikeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "cast: expect input type to be TensorType but get " << types[0]; return false; } const auto* dtype_like = types[1].as(); if (dtype_like == nullptr) { - CHECK(types[1].as()) + ICHECK(types[1].as()) << "cast: expect input type to be TensorType but get " << types[1]; return false; } @@ -144,7 +144,7 @@ RELAY_REGISTER_OP("cast_like") Array ReinterpretCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const CastAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); DataType dtype = param->dtype; return {topi::reinterpret(inputs[0], dtype)}; } @@ -178,10 +178,10 @@ TVM_REGISTER_NODE_TYPE(ExpandDimsAttrs); bool ExpandDimsRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, result] - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "expand_dims: expect input type to be TensorType but get " << types[0]; return false; } @@ -189,9 +189,9 @@ bool ExpandDimsRel(const Array& types, int num_inputs, const Attrs& attrs, const int ndim = static_cast(data->shape.size()); const int axis = param->axis; const int num_newaxis = param->num_newaxis; - CHECK(num_newaxis >= 0) << "expand_dims only accepts `num_newaxis >= 0`" - << ", but got num_newaxis = " << num_newaxis; - CHECK(-ndim - 1 <= axis && axis <= ndim) + ICHECK(num_newaxis >= 0) << "expand_dims only accepts `num_newaxis >= 0`" + << ", but got num_newaxis = " << num_newaxis; + ICHECK(-ndim - 1 <= axis && axis <= ndim) << "expand_dims only accepts `axis` in [-data.ndim - 1, data.ndim]" << ", but got axis = " << axis << ", and data.ndim = " << ndim; const int pivot = axis < 0 ? 
ndim + axis + 1 : axis; @@ -213,7 +213,7 @@ bool ExpandDimsRel(const Array& types, int num_inputs, const Attrs& attrs, Array ExpandDimsCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const ExpandDimsAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); return {topi::expand_dims(inputs[0], param->axis, param->num_newaxis)}; } @@ -247,7 +247,7 @@ TVM_REGISTER_NODE_TYPE(ConcatenateAttrs); Array ConcatenateCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const ConcatenateAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); return {topi::concatenate(inputs, param->axis)}; } @@ -282,10 +282,10 @@ TVM_REGISTER_NODE_TYPE(StackAttrs); bool StackRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types: [data, result] - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* tensor_tuple = types[0].as(); if (tensor_tuple == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "cast: expect input type to be TupleType but get " << types[0]; return false; } @@ -295,7 +295,7 @@ bool StackRel(const Array& types, int num_inputs, const Attrs& attrs, // Sanity check: axis int axis = param->axis; - CHECK(-(ndim + 1) <= axis && axis < ndim + 1) + ICHECK(-(ndim + 1) <= axis && axis < ndim + 1) << "stack only accepts `axis` in [-(ndim+1), ndim+1)" << ", but got axis = " << axis << ", and ndim = " << ndim; axis = axis < 0 ? ndim + axis + 1 : axis; @@ -306,8 +306,8 @@ bool StackRel(const Array& types, int num_inputs, const Attrs& attrs, const auto& e = Downcast(ele); int e_ndim = static_cast(e->shape.size()); const DataType& e_dtype = e->dtype; - CHECK_EQ(e_ndim, ndim) << "relay.stack requires all tensors have the same ndim"; - CHECK_EQ(e_dtype, dtype) << "relay.stack requires all tensors have the same dtype"; + ICHECK_EQ(e_ndim, ndim) << "relay.stack requires all tensors have the same ndim"; + ICHECK_EQ(e_dtype, dtype) << "relay.stack requires all tensors have the same dtype"; for (size_t j = 0; j < first->shape.size(); ++j) { if (j == static_cast(axis)) continue; if (first->shape[j].as() || e->shape[j].as() || @@ -337,7 +337,7 @@ bool StackRel(const Array& types, int num_inputs, const Attrs& attrs, Array StackCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const StackAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); return {topi::stack(inputs, param->axis)}; } @@ -372,10 +372,10 @@ TVM_REGISTER_NODE_TYPE(TransposeAttrs); bool TransposeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types: [data, result] - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "transpose: expect input type to be TensorType but get " << types[0]; return false; } @@ -383,7 +383,7 @@ bool TransposeRel(const Array& types, int num_inputs, const Attrs& attrs, const int ndim = data->shape.size(); const Array& axes = param->axes; // check dimension match - CHECK(!axes.defined() || static_cast(axes.size()) == ndim) + ICHECK(!axes.defined() || static_cast(axes.size()) == ndim) << "Dimension mismatch: axes has " << axes.size() << " elements" << ", but data.ndim = " << ndim; // construct int_axes @@ -399,12 +399,12 @@ bool TransposeRel(const Array& types, int num_inputs, const Attrs& attrs, for (const Integer& e : axes) { int64_t axis = e; // sanity check for axis 
and ndim - CHECK(-ndim <= axis && axis < ndim) + ICHECK(-ndim <= axis && axis < ndim) << "transpose only allows each `axis` in `axes` in range [-data.ndim, data.ndim)" << ", but got axis = " << axis << ", and data.ndim = " << ndim; axis = axis < 0 ? axis + ndim : axis; // sanity check for duplication - CHECK(!axis_used[axis]) << "Duplicate axes in transpose: " << axis; + ICHECK(!axis_used[axis]) << "Duplicate axes in transpose: " << axis; axis_used[axis] = 1; int_axes.push_back(static_cast(axis)); } @@ -421,7 +421,7 @@ bool TransposeRel(const Array& types, int num_inputs, const Attrs& attrs, Array TransposeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); return Array{topi::transpose(inputs[0], param->axes)}; } @@ -480,13 +480,13 @@ Array infer_newshape(const Array& data_shape, const Attrs& ++src_idx; } else if (svalue == 0) { // keep same - CHECK_LT(src_idx, ishape.size()); + ICHECK_LT(src_idx, ishape.size()); used_input_dims.insert(src_idx); used_output_dims.insert(oshape.size()); oshape.push_back(ishape[src_idx++]); } else if (svalue == -1) { // inference based on rest - CHECK_LT(infer_idx, 0) << "One and only one dim can be inferred"; + ICHECK_LT(infer_idx, 0) << "One and only one dim can be inferred"; infer_idx = i; oshape.push_back(1); ++src_idx; @@ -499,7 +499,7 @@ Array infer_newshape(const Array& data_shape, const Attrs& } } else if (svalue == -3) { // merge two dims from source - CHECK_LT(src_idx + 1, ishape.size()); + ICHECK_LT(src_idx + 1, ishape.size()); used_input_dims.insert(src_idx); IndexExpr d1 = ishape[src_idx++]; used_input_dims.insert(src_idx); @@ -513,14 +513,14 @@ Array infer_newshape(const Array& data_shape, const Attrs& } else if (svalue == -4) { // split the source dim s into two dims // read the left dim and then the right dim (either can be -1) - CHECK_LT(i + 2, newshape.size()); - CHECK_LT(src_idx, ishape.size()); + ICHECK_LT(i + 2, newshape.size()); + ICHECK_LT(src_idx, ishape.size()); used_input_dims.insert(src_idx); IndexExpr d0 = ishape[src_idx++]; Integer d1 = newshape[++i]; Integer d2 = newshape[++i]; if (d1->value == -1) { - CHECK_NE(d2->value, -1) << "Split dims cannot both be -1."; + ICHECK_NE(d2->value, -1) << "Split dims cannot both be -1."; used_output_dims.insert(oshape.size()); if (d0.as()) { oshape.push_back(Any()); @@ -584,10 +584,10 @@ bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { const auto* param = attrs.as(); // types: [data, result] - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "reshape: expect input type to be TensorType but get " << types[0]; return false; } @@ -627,7 +627,7 @@ bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, data_shape_sum *= Downcast(x)->value; } if (!found_dynamic) { - CHECK_EQ(oshape_sum, data_shape_sum) + ICHECK_EQ(oshape_sum, data_shape_sum) << "Input tensor shape and reshaped shape are not compatible"; } @@ -648,7 +648,7 @@ Array ReshapeCompute(const Attrs& attrs, const Array& in } const auto* out_ttype = out_type.as(); - CHECK(out_ttype != nullptr); + ICHECK(out_ttype != nullptr); Array newshape; bool newshape_has_any = false; for (auto val : out_ttype->shape) { @@ -745,7 +745,7 @@ Example:: */ bool ReshapeLikeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - 
CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) { return false; @@ -763,7 +763,7 @@ bool ReshapeLikeRel(const Array& types, int num_inputs, const Attrs& attrs } } if (is_static_shape) { - CHECK(reporter->AssertEQ(data->Size(), reshape_like->Size())) + ICHECK(reporter->AssertEQ(data->Size(), reshape_like->Size())) << "Reshape inputs size should be compatible."; } reporter->Assign(types[2], TensorType(reshape_like->shape, data->dtype)); @@ -795,7 +795,7 @@ the input array into an output array with the same shape as the second input arr // ArgWhere bool ArgWhereRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(num_inputs, 1); + ICHECK_EQ(num_inputs, 1); auto tt = types[0].as(); if (tt == nullptr) { @@ -832,8 +832,8 @@ TVM_REGISTER_NODE_TYPE(ScatterAttrs); // Scatter bool ScatterRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(num_inputs, 3); - CHECK_EQ(types.size(), 4); + ICHECK_EQ(num_inputs, 3); + ICHECK_EQ(types.size(), 4); auto data = types[0].as(); if (data == nullptr) { return false; @@ -846,9 +846,9 @@ bool ScatterRel(const Array& types, int num_inputs, const Attrs& attrs, if (updates == nullptr) { return false; } - CHECK(indices->dtype.is_int()) << "indices of take must be tensor of integer"; + ICHECK(indices->dtype.is_int()) << "indices of take must be tensor of integer"; const auto param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); reporter->Assign(types[3], TensorType(data->shape, data->dtype)); return true; } @@ -879,8 +879,8 @@ TVM_REGISTER_NODE_TYPE(ScatterAddAttrs); // Scatter Add bool ScatterAddRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(num_inputs, 3); - CHECK_EQ(types.size(), 4); + ICHECK_EQ(num_inputs, 3); + ICHECK_EQ(types.size(), 4); auto data = types[0].as(); if (data == nullptr) { return false; @@ -893,9 +893,9 @@ bool ScatterAddRel(const Array& types, int num_inputs, const Attrs& attrs, if (updates == nullptr) { return false; } - CHECK(indices->dtype.is_int()) << "indices of scatter_add must be tensor of integer"; + ICHECK(indices->dtype.is_int()) << "indices of scatter_add must be tensor of integer"; const auto param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); reporter->Assign(types[3], TensorType(data->shape, data->dtype)); return true; } @@ -926,7 +926,7 @@ TVM_REGISTER_NODE_TYPE(TakeAttrs); bool TakeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, indices, result] - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) { return false; @@ -935,9 +935,9 @@ bool TakeRel(const Array& types, int num_inputs, const Attrs& attrs, if (indices == nullptr) { return false; } - CHECK(indices->dtype.is_int()) << "indices of take must be tensor of integer"; + ICHECK(indices->dtype.is_int()) << "indices of take must be tensor of integer"; const auto param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); if (!param->axis.defined()) { std::vector oshape(indices->shape.begin(), indices->shape.end()); @@ -950,8 +950,8 @@ bool TakeRel(const Array& types, int num_inputs, const Attrs& attrs, const auto ndim_indices = static_cast(indices->shape.size()); int axis = static_cast(param->axis->value); if (axis < 0) axis += ndim_data; - CHECK_LE(axis, ndim_data) << "axis 
should be with in data shape" - << ", but got = " << axis; + ICHECK_LE(axis, ndim_data) << "axis should be within data shape" + << ", but got = " << axis; oshape.reserve(ndim_data - 1 + ndim_indices); for (int i = 0; i < axis; ++i) { @@ -971,7 +971,7 @@ bool TakeRel(const Array& types, int num_inputs, const Attrs& attrs, Array TakeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); if (!param->axis.defined()) { return Array{topi::take(inputs[0], inputs[1], param->mode)}; } else { @@ -1026,7 +1026,7 @@ TVM_REGISTER_NODE_TYPE(InitOpAttrs); bool FullRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const InitOpAttrs* param = attrs.as(); const auto* fill_value = types[0].as(); if (fill_value == nullptr) { @@ -1038,7 +1038,7 @@ bool FullRel(const Array& types, int num_inputs, const Attrs& attrs, out_dtype = fill_value->dtype; } - CHECK_EQ(fill_value->shape.size(), 0) + ICHECK_EQ(fill_value->shape.size(), 0) << "Fill value should be a scalar but has dimension " << fill_value->shape.size() << "."; std::vector oshape; @@ -1081,10 +1081,10 @@ RELAY_REGISTER_OP("full") bool InitOpRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types = [ret_type] - CHECK_EQ(types.size(), 1); + ICHECK_EQ(types.size(), 1); const InitOpAttrs* param = attrs.as(); - CHECK(param); + ICHECK(param); DataType out_dtype = param->dtype; std::vector oshape; @@ -1137,7 +1137,7 @@ RELAY_REGISTER_OP("ones") bool FullLikeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) { return false; } @@ -1147,7 +1147,7 @@ bool FullLikeRel(const Array& types, int num_inputs, const Attrs& attrs, return false; } - CHECK_EQ(fill_value->shape.size(), 0) + ICHECK_EQ(fill_value->shape.size(), 0) << "The fill value should be a scalar but here it has dimension " << fill_value->shape.size() << "."; @@ -1185,7 +1185,7 @@ TVM_REGISTER_NODE_TYPE(ArangeAttrs); bool ArangeRel(const Array& types, int num_inputs, const Attrs& raw_attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const ArangeAttrs* attrs = raw_attrs.as(); const ConstantNode *cstart, *cstop, *cstep; @@ -1199,8 +1199,8 @@ bool ArangeRel(const Array& types, int num_inputs, const Attrs& raw_attrs, double stop = ToScalar(cstop->data); double step = ToScalar(cstep->data); int32_t num_elem = static_cast(std::ceil((stop - start) / step)); - CHECK_GT(num_elem, 0) << "Invalid arange attributes (start, stop, step): " << attrs->start - << ", " << attrs->stop << ", " << attrs->step; + ICHECK_GT(num_elem, 0) << "Invalid arange attributes (start, stop, step): " << attrs->start + << ", " << attrs->stop << ", " << attrs->step; reporter->Assign(types[3], TensorType({num_elem}, attrs->dtype)); return true; } else { @@ -1225,7 +1225,7 @@ inline te::Tensor DynamicArange(const te::Tensor& start, const te::Tensor& stop, Array ArangeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const ArangeAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); te::Tensor start = inputs[0]; te::Tensor stop = inputs[1]; te::Tensor step = inputs[2]; @@ -1276,10 +1276,10 @@ TVM_REGISTER_NODE_TYPE(RepeatAttrs); bool RepeatRel(const Array&
types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, result] - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "repeat: expect input type to be TensorType but get " << types[0]; return false; } @@ -1287,9 +1287,9 @@ bool RepeatRel(const Array& types, int num_inputs, const Attrs& attrs, const int ndim = static_cast(data->shape.size()); const int repeats = param->repeats; const int axis = param->axis; - CHECK(repeats >= 1) << "repeat only accepts `repeats >= 1`" - << ", but got repeats = " << repeats; - CHECK(-ndim - 1 <= axis && axis <= ndim) + ICHECK(repeats >= 1) << "repeat only accepts `repeats >= 1`" + << ", but got repeats = " << repeats; + ICHECK(-ndim - 1 <= axis && axis <= ndim) << "repeat only accepts `axis` in [-data.ndim - 1, data.ndim]" << ", but got axis = " << axis << ", and data.ndim = " << ndim; const int pivot = axis < 0 ? ndim + axis : axis; @@ -1313,7 +1313,7 @@ bool RepeatRel(const Array& types, int num_inputs, const Attrs& attrs, Array RepeatCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const RepeatAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); return {topi::repeat(inputs[0], param->repeats, param->axis)}; } @@ -1347,7 +1347,7 @@ TVM_REGISTER_NODE_TYPE(MeshgridAttrs); bool MeshgridRel(const Array& types, int num_inputs, const Attrs& raw_attrs, const TypeReporter& reporter) { // types: [data, result] - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const MeshgridAttrs* attrs = raw_attrs.as(); const auto* tensor_tuple = types[0].as(); if (tensor_tuple == nullptr) { @@ -1403,7 +1403,7 @@ bool MeshgridRel(const Array& types, int num_inputs, const Attrs& raw_attr Array MeshgridCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const MeshgridAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); return {topi::meshgrid(inputs, param->indexing)}; } @@ -1486,10 +1486,10 @@ TVM_REGISTER_NODE_TYPE(TileAttrs); bool TileRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, result] - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "tile: expect input type to be TensorType but get " << types[0]; return false; } @@ -1497,12 +1497,12 @@ bool TileRel(const Array& types, int num_inputs, const Attrs& attrs, const size_t ndim = data->shape.size(); const Array& reps = param->reps; // check dimension match - CHECK(reps.defined()) << "repetition array is not defined. data.ndim = " << ndim; + ICHECK(reps.defined()) << "repetition array is not defined. data.ndim = " << ndim; const size_t rndim = reps.size(); for (size_t i = 0; i < rndim; ++i) { if (const tvm::tir::IntImmNode* val = reps[i].as()) { - CHECK_GT(val->value, 0) << "Tile reps value should always be larger than 0, but get: " - << val->value; + ICHECK_GT(val->value, 0) << "Tile reps value should always be larger than 0, but get: " + << val->value; } } size_t tndim = (ndim > rndim) ? 
ndim : rndim; @@ -1554,7 +1554,7 @@ bool TileRel(const Array& types, int num_inputs, const Attrs& attrs, Array TileCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const TileAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); return {topi::tile(inputs[0], param->reps)}; } @@ -1587,17 +1587,17 @@ TVM_REGISTER_NODE_TYPE(ReverseAttrs); bool ReverseRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, result] - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "reverse: expect input type to be TensorType but get " << types[0]; return false; } const auto* param = attrs.as(); const int ndim = static_cast(data->shape.size()); const int axis = param->axis; - CHECK(-ndim <= axis && axis < ndim) + ICHECK(-ndim <= axis && axis < ndim) << "reverse only accepts `axis` in [-data.ndim, data.ndim - 1]" << ", but got axis = " << axis << ", and data.ndim = " << ndim; reporter->Assign(types[1], types[0]); @@ -1607,7 +1607,7 @@ bool ReverseRel(const Array& types, int num_inputs, const Attrs& attrs, Array ReverseCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const ReverseAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); // pass empty seq_length tensor to reverse_sequence return {topi::reverse_sequence(inputs[0], te::Tensor(), param->axis)}; } @@ -1641,44 +1641,44 @@ TVM_REGISTER_NODE_TYPE(ReverseSequenceAttrs); bool ReverseSequenceRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [data, seq_lengths, result] - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) { - CHECK(types[0].as()) + ICHECK(types[0].as()) << "reverse_sequence: expect input type to be TensorType but get " << types[0]; return false; } const auto* seq_lengths = types[1].as(); if (seq_lengths == nullptr) { - CHECK(types[1].as()) + ICHECK(types[1].as()) << "reverse_sequence: expect input type to be TensorType but get " << types[1]; return false; } const int seq_lengths_dim = static_cast(seq_lengths->shape.size()); - CHECK(seq_lengths_dim == 1) << "For reverse_sequnece, seq_lengths must be a 1D vector"; - CHECK(seq_lengths->dtype.is_int()) + ICHECK(seq_lengths_dim == 1) << "For reverse_sequence, seq_lengths must be a 1D vector"; + ICHECK(seq_lengths->dtype.is_int()) << "For reverse_sequnece, seq_lengths must be tensor of integer"; const auto* param = attrs.as(); const int ndim = static_cast(data->shape.size()); int batch_axis = param->batch_axis; - CHECK(-ndim <= batch_axis && batch_axis < ndim) + ICHECK(-ndim <= batch_axis && batch_axis < ndim) << "reverse_sequence only accepts `batch_axis` in [-data.ndim, data.ndim - 1]" << ", but got batch_axis = " << batch_axis << ", and data.ndim = " << ndim; if (batch_axis < 0) { batch_axis = static_cast(data->shape.size()) + batch_axis; } - CHECK(reporter->Assert(seq_lengths->shape[0] == data->shape[batch_axis])) + ICHECK(reporter->Assert(seq_lengths->shape[0] == data->shape[batch_axis])) << "For reverse_sequnece seq_lengths size should match with dimension of batch axis" << ", but got dimension of batch_axis = " << data->shape[batch_axis] << ", and seq_length size = " << seq_lengths->shape[0]; const int seq_axis = param->seq_axis; - CHECK(-ndim <= seq_axis && seq_axis < ndim) + ICHECK(-ndim <=
seq_axis && seq_axis < ndim) << "reverse_sequnece only accepts `seq_axis` in [-data.ndim, data.ndim - 1]" << ", but got seq_axis = " << seq_axis << ", and data.ndim = " << ndim; @@ -1689,7 +1689,7 @@ bool ReverseSequenceRel(const Array& types, int num_inputs, const Attrs& a Array ReverseSequenceCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const ReverseSequenceAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); return {topi::reverse_sequence(inputs[0], inputs[1], param->seq_axis, param->batch_axis)}; } @@ -1728,7 +1728,7 @@ Input is first sliced along batch axis and then elements are reversed along seq // where operator bool WhereRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4U); + ICHECK_EQ(types.size(), 4U); const auto* condition = types[0].as(); const auto* x = types[1].as(); const auto* y = types[2].as(); @@ -1740,18 +1740,18 @@ bool WhereRel(const Array& types, int num_inputs, const Attrs& attrs, const auto& cond_shape = condition->shape; const auto& x_shape = x->shape; const auto& y_shape = y->shape; - CHECK(x_shape.size() == y_shape.size()) << "x and y must have the same size"; + ICHECK(x_shape.size() == y_shape.size()) << "x and y must have the same size"; if (cond_shape.size() != x_shape.size()) { - CHECK_EQ(cond_shape.size(), 1) << "Shape of condition " << condition->shape - << " must be either equal to x or has dimension of 1."; + ICHECK_EQ(cond_shape.size(), 1) << "Shape of condition " << condition->shape + << " must be either equal to x or has dimension of 1."; } for (size_t i = 0; i < x_shape.size(); i++) { - CHECK(reporter->AssertEQ(x_shape[i], y_shape[i])) + ICHECK(reporter->AssertEQ(x_shape[i], y_shape[i])) << "x and y must have the same shape: " << x_shape << " vs " << y_shape; if (i < cond_shape.size()) { - CHECK(reporter->AssertEQ(cond_shape[i], x_shape[i])) + ICHECK(reporter->AssertEQ(cond_shape[i], x_shape[i])) << "condition and x must have the same shape: " << cond_shape << " vs " << x_shape; } } @@ -1835,13 +1835,13 @@ TVM_REGISTER_GLOBAL("relay.op._make.squeeze").set_body_typed(MakeSqueeze); bool SqueezeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) { return false; } const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); std::vector result_shape; // if axes is None, squeeze all axes of dimension 1 if (!param->axis.defined()) { @@ -1850,7 +1850,7 @@ bool SqueezeRel(const Array& types, int num_inputs, const Attrs& attrs, LOG(FATAL) << "axis needs to be defined for dynamic input."; } const int64_t* axis_ptr = tir::as_const_int(e); - CHECK(axis_ptr != nullptr) << "the axes attribute must be concrete"; + ICHECK(axis_ptr != nullptr) << "the axes attribute must be concrete"; if (*axis_ptr != 1) { result_shape.push_back(e); } @@ -1866,8 +1866,8 @@ bool SqueezeRel(const Array& types, int num_inputs, const Attrs& attrs, if (axis_val < 0) { axis_val += static_cast(original_shape.size()); } - CHECK_GE(axis_val, 0); - CHECK_LT(axis_val, original_shape.size()); + ICHECK_GE(axis_val, 0); + ICHECK_LT(axis_val, original_shape.size()); original_shape.at(axis_val).second = false; } for (const auto& p : original_shape) { @@ -1875,7 +1875,7 @@ bool SqueezeRel(const Array& types, int num_inputs, const Attrs& attrs, result_shape.push_back(p.first); } else { if (const 
int64_t* axis_ptr = tir::as_const_int(p.first)) { - CHECK_EQ(*axis_ptr, 1) << "cannot squeeze axis with dimension not equal to 1"; + ICHECK_EQ(*axis_ptr, 1) << "cannot squeeze axis with dimension not equal to 1"; } } } @@ -1887,7 +1887,7 @@ bool SqueezeRel(const Array& types, int num_inputs, const Attrs& attrs, Array SqueezeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const SqueezeAttrs* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); return {topi::squeeze(inputs[0], param->axis)}; } @@ -1908,7 +1908,7 @@ RELAY_REGISTER_OP("squeeze") // CollapseSumLike: -> B where BroadCast(A, B) = A bool CollapseSumLikeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); reporter->Assign(types[2], types[1]); return BroadcastRel({types[0], types[1], types[0]}, 2, Attrs(), reporter); } @@ -1921,7 +1921,7 @@ Expr MakeCollapseSumLike(Expr data, Expr collapse_type) { Array CollapseSumLikeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* out_ttype = out_type.as(); - CHECK(out_ttype != nullptr); + ICHECK(out_ttype != nullptr); return {topi::collapse_sum(inputs[0], out_ttype->shape)}; } @@ -1941,14 +1941,14 @@ RELAY_REGISTER_OP("collapse_sum_like") // CollapseSumTo: -> B where Broadcast(A, B) = A bool CollapseSumToRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); const InitOpAttrs* param = attrs.as(); const auto* target_shape = types[1].as(); DataType out_dtype = types[0].as()->dtype; const IntImmNode* rank = target_shape->shape[0].as(); - CHECK(rank) << "Parameter must have static rank"; + ICHECK(rank) << "Parameter must have static rank"; std::vector oshape; if (param->shape) { @@ -1990,10 +1990,10 @@ RELAY_REGISTER_OP("collapse_sum_to") bool BroadCastToRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types = [data_type, ret_type], broadcast_to_type is in attrs bc static - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const InitOpAttrs* param = attrs.as(); - CHECK(param); + ICHECK(param); DataType out_dtype = types[0].as()->dtype; std::vector oshape; @@ -2035,7 +2035,7 @@ RELAY_REGISTER_OP("broadcast_to") // BroadCastToLike: -> B where BroadCast(A, B) = B bool BroadCastToLikeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + ICHECK_EQ(types.size(), 3); reporter->Assign(types[2], types[1]); return BroadcastRel({types[0], types[1], types[1]}, 2, Attrs(), reporter); } @@ -2048,7 +2048,7 @@ Expr MakeBroadCastToLike(Expr data, Expr broadcast_type) { Array BroadCastToLikeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* out_ttype = out_type.as(); - CHECK(out_ttype != nullptr); + ICHECK(out_ttype != nullptr); return {topi::broadcast_to(inputs[0], out_ttype->shape)}; } @@ -2068,7 +2068,7 @@ RELAY_REGISTER_OP("broadcast_to_like") // Adapter function to make int array. 
Array GetIntArray(Array arr) { for (size_t i = 0; i < arr.size(); ++i) { - CHECK(!arr[i].defined() || arr[i].as()) << "Expect an int array"; + ICHECK(!arr[i].defined() || arr[i].as()) << "Expect an int array"; } return Downcast>(arr); } @@ -2078,7 +2078,7 @@ TVM_REGISTER_NODE_TYPE(StridedSliceAttrs); bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); const StridedSliceAttrs* param = attrs.as(); if (param == nullptr) { return false; } @@ -2099,7 +2099,7 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr std::vector stride_vec(num_axis, 1); if (param->slice_mode == "end") { for (size_t i = 0; i < param->strides.value().size(); ++i) { - CHECK(param->strides.value()[i].defined()); + ICHECK(param->strides.value()[i].defined()); stride_vec[i] = param->strides.value()[i]->value; } } @@ -2163,14 +2163,14 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr int64_t slice_range, step; if (stride_v < 0) { if (end_v < -1) end_v = -1; - CHECK_LE(end_v, begin_v) << "strided_slice get empty slice at axis " << i; + ICHECK_LE(end_v, begin_v) << "strided_slice get empty slice at axis " << i; begin_v = std::min(dim_size - 1, begin_v); slice_range = begin_v - end_v; step = -stride_v; } else { if (begin_v < 0) begin_v = 0; - CHECK_GE(stride_v, 0); - CHECK_LE(begin_v, end_v) << "strided_slice get invalid slice at axis " << i; + ICHECK_GE(stride_v, 0); + ICHECK_LE(begin_v, end_v) << "strided_slice get invalid slice at axis " << i; end_v = std::min(dim_size, end_v); slice_range = end_v - begin_v; step = stride_v; @@ -2178,9 +2178,9 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr oshape[i] = tir::make_const(dshape[i].dtype(), (slice_range + step - 1) / step); } } else { - CHECK(param->begin) << "strided_slice recieved invalid begin " << param->begin; - CHECK(param->end) << "strided_slice recieved invalid end " << param->end; - CHECK(param->strides) << "strided_slice recieved invalid strides " << param->strides; + ICHECK(param->begin) << "strided_slice received invalid begin " << param->begin; + ICHECK(param->end) << "strided_slice received invalid end " << param->end; + ICHECK(param->strides) << "strided_slice received invalid strides " << param->strides; } reporter->Assign(types[1], TensorType(oshape, data->dtype)); return true; @@ -2192,37 +2192,37 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, const Array& old_in_types) { Array> old_in_shapes; for (auto old_in_t : old_in_types) { - CHECK(old_in_t.as()); + ICHECK(old_in_t.as()); old_in_shapes.push_back(old_in_t.as()->shape); } - CHECK(old_in_layouts.defined()); - CHECK_GE(old_in_layouts.size(), 1); - CHECK(old_in_shapes.defined()); - CHECK_GE(old_in_shapes.size(), 1); + ICHECK(old_in_layouts.defined()); + ICHECK_GE(old_in_layouts.size(), 1); + ICHECK(old_in_shapes.defined()); + ICHECK_GE(old_in_shapes.size(), 1); auto layout = old_in_layouts[0]; if (layout.defined() && new_in_layouts.defined()) { - CHECK_GE(new_in_layouts.size(), 1); + ICHECK_GE(new_in_layouts.size(), 1); auto new_layout = new_in_layouts[0]; auto shape = old_in_shapes[0]; // NOTE: Discard "const" qualifier here.
     auto* params = const_cast<StridedSliceAttrs*>(attrs.as<StridedSliceAttrs>());
-    CHECK(params != nullptr);
+    ICHECK(params != nullptr);
     Array<Integer> begin, end, strides;
     if (params->begin && params->end && params->strides) {
       for (Integer i : params->strides.value()) {
-        CHECK(i.defined());
+        ICHECK(i.defined());
         strides.push_back(params->slice_mode == "size" ? 1 : i->value);
       }

       for (Integer i : params->begin.value()) {
-        CHECK(i.defined());
+        ICHECK(i.defined());
         begin.push_back(i->value);
       }

       for (Integer i : params->end.value()) {
-        CHECK(i.defined());
+        ICHECK(i.defined());
         end.push_back(i->value);
       }
     }
@@ -2325,7 +2325,7 @@ Array<Array<Layout>> StridedSliceInferCorrectLayout(const Attrs& attrs,
 Array<te::Tensor> StridedSliceCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                       const Type& out_type) {
   const StridedSliceAttrs* param = attrs.as<StridedSliceAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   Array<Integer> begin, end, strides;
   begin = param->begin.value();
   end = param->end.value();
@@ -2333,7 +2333,7 @@ Array<te::Tensor> StridedSliceCompute(const Attrs& attrs, const Array<te::Tensor
     size_t src_tensor_dim = inputs[0]->shape.size();
-    CHECK(begin.size() == src_tensor_dim)
+    ICHECK(begin.size() == src_tensor_dim)
         << "for dynamic inputs, len(begin) must equal the input dimension";
     Array<IndexExpr> out_shape;
     for (size_t i = 0; i < src_tensor_dim; ++i) {
@@ -2416,7 +2416,7 @@ Examples::

 // strided_set
 bool StridedSetRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                    const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 6);
+  ICHECK_EQ(types.size(), 6);
   reporter->Assign(types[5], types[0]);
   return true;
 }
@@ -2460,23 +2460,23 @@ TVM_REGISTER_NODE_TYPE(SplitAttrs);
 bool SplitRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
               const TypeReporter& reporter) {
   // `types` contains: [data, result]
-  CHECK_EQ(types.size(), 2);
+  ICHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) return false;
-  CHECK_NE(data->shape.size(), 0) << "Input shape cannot be empty";
+  ICHECK_NE(data->shape.size(), 0) << "Input shape cannot be empty";
   const auto param = attrs.as<SplitAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   auto axis = param->axis;
   if (axis < 0) {
     axis += data->shape.size();
   }
-  CHECK_LT(axis, data->shape.size()) << "axis should be within the input dimension range.";
-  CHECK_GE(axis, 0) << "axis should be within the input dimension range.";
+  ICHECK_LT(axis, data->shape.size()) << "axis should be within the input dimension range.";
+  ICHECK_GE(axis, 0) << "axis should be within the input dimension range.";
   if (const IntImmNode* sections = param->indices_or_sections.as<IntImmNode>()) {
     if (!data->shape[axis].as<AnyNode>()) {
-      CHECK(reporter->Assert(indexmod(data->shape[axis], sections->value) ==
-                             tir::make_zero(DataType::Int(64))))
+      ICHECK(reporter->Assert(indexmod(data->shape[axis], sections->value) ==
+                              tir::make_zero(DataType::Int(64))))
           << "indices_or_sections need to be able to divide input.shape[axis]";
     }
     std::vector<Type> fields;
@@ -2496,7 +2496,7 @@ bool SplitRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     auto begin = IndexExpr(tir::make_zero(DataType::Int(32)));
     std::vector<Type> fields;
     for (unsigned int i = 0; i < indices.size(); ++i) {
-      CHECK(reporter->Assert(Downcast<IndexExpr>(indices[i]) > begin))
+      ICHECK(reporter->Assert(Downcast<IndexExpr>(indices[i]) > begin))
          << "indices_or_sections need to be a sorted ascending list";
       std::vector<IndexExpr> oshape(data->shape.begin(), data->shape.end());
       oshape[axis] = Downcast<IndexExpr>(indices[i]) - begin;
@@ -2505,7 +2505,7 @@ bool SplitRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
       fields.push_back(vec_type);
     }
     if (!data->shape[axis].as<AnyNode>()) {
-      CHECK(reporter->Assert(begin < data->shape[axis]))
+      ICHECK(reporter->Assert(begin < data->shape[axis]))
          << "The sum of sections must match the input.shape[axis]";
     }
     std::vector<IndexExpr> oshape(data->shape.begin(), data->shape.end());
@@ -2524,7 +2524,7 @@ bool SplitRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 Array<te::Tensor> SplitCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                const Type& out_type) {
   const auto param = attrs.as<SplitAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);

   if (const IntImmNode* sections = param->indices_or_sections.as<IntImmNode>()) {
     int64_t num_sections = sections->value;
@@ -2590,7 +2590,7 @@ TVM_REGISTER_NODE_TYPE(SliceLikeAttrs);
 */
 bool SliceLikeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                   const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) {
     return false;
   }
@@ -2602,7 +2602,7 @@ bool SliceLikeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   }

   const auto param = attrs.as<SliceLikeAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   const Array<IndexExpr>& dshape = data->shape;
   const Array<IndexExpr>& target_shape = target->shape;
@@ -2612,22 +2612,22 @@ bool SliceLikeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     for (size_t i = 0; i < dshape.size(); ++i) {
       if (i < target_shape.size()) {
         oshape[i] = target_shape[i];
-        CHECK(reporter->Assert(oshape[i] <= dshape[i]))
+        ICHECK(reporter->Assert(oshape[i] <= dshape[i]))
            << "End index of axis " << i << " exceeds input shape: " << oshape[i] << " vs "
            << dshape[i];
       }
     }
   } else {
-    CHECK(param->axes.size() != 0) << "Axes cannot be empty.";
+    ICHECK(param->axes.size() != 0) << "Axes cannot be empty.";
     for (Integer val : param->axes) {
       int axis = val->value;
       if (axis < 0) {
         axis += dshape.size();
       }
-      CHECK(axis < static_cast<int>(target_shape.size()))
+      ICHECK(axis < static_cast<int>(target_shape.size()))
          << "Axis " << axis << " exceeds dimension " << target_shape.size()
          << " of target_shape.";
       oshape[axis] = target_shape[axis];
-      CHECK(reporter->Assert(oshape[axis] <= dshape[axis]))
+      ICHECK(reporter->Assert(oshape[axis] <= dshape[axis]))
          << "End index of axis " << axis << " exceeds input shape: " << oshape[axis] << " vs "
          << dshape[axis];
     }
@@ -2647,7 +2647,7 @@ Expr MakeSliceLike(Expr data, Expr shape_like, Array<Integer> axes) {
 Array<te::Tensor> SliceLikeCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                    const Type& out_type) {
   const auto* param = attrs.as<SliceLikeAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   Array<IndexExpr> src_shape = inputs[0]->shape;
   Array<IndexExpr> target_shape = inputs[1]->shape;
   Array<IndexExpr> begin_idx, end_idx, strides;
@@ -2660,7 +2660,7 @@ Array<te::Tensor> SliceLikeCompute(const Attrs& attrs, const Array<te::Tensor>&
   for (size_t i = 0; i < src_shape.size(); ++i) {
     if (i < target_shape.size()) {
       end_idx.Set(i, target_shape[i]);
-      CHECK_LE(topi::GetConstInt(end_idx[i]), topi::GetConstInt(src_shape[i]))
+      ICHECK_LE(topi::GetConstInt(end_idx[i]), topi::GetConstInt(src_shape[i]))
          << "End index of axis " << i
          << " exceeds input shape: " << topi::GetConstInt(end_idx[i]) << " vs "
          << topi::GetConstInt(src_shape[i]);
@@ -2672,7 +2672,7 @@ Array<te::Tensor> SliceLikeCompute(const Attrs& attrs, const Array<te::Tensor>&
       axis = static_cast<int>(src_shape.size()) + axis;
     }
     end_idx.Set(axis, target_shape[axis]);
-    CHECK_LE(topi::GetConstInt(end_idx[axis]), topi::GetConstInt(src_shape[axis]))
+    ICHECK_LE(topi::GetConstInt(end_idx[axis]), topi::GetConstInt(src_shape[axis]))
        << "End index of axis " << axis
        << " exceeds input shape: " << topi::GetConstInt(end_idx[axis]) << " vs "
        << topi::GetConstInt(src_shape[axis]);
@@ -2702,7 +2702,7 @@ TVM_REGISTER_NODE_TYPE(LayoutTransformAttrs);
 Array<te::Tensor> LayoutTransformCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                          const Type& out_type) {
   const auto* param = attrs.as<LayoutTransformAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   return Array<te::Tensor>{topi::layout_transform(inputs[0], param->src_layout, param->dst_layout)};
 }
@@ -2710,7 +2710,7 @@ bool LayoutTransformRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                         const TypeReporter& reporter) {
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) {
-    CHECK(types[0].as<IncompleteTypeNode>())
+    ICHECK(types[0].as<IncompleteTypeNode>())
        << "LayoutTransform: expect input data type to be TensorType but get " << types[0];
     return false;
   }
@@ -2719,9 +2719,9 @@ bool LayoutTransformRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   Layout src_layout(params->src_layout);
   Layout dst_layout(params->dst_layout);

-  CHECK(src_layout.defined() && dst_layout.defined()) << "cannot convert from/to undefined layout";
+  ICHECK(src_layout.defined() && dst_layout.defined()) << "cannot convert from/to undefined layout";

   auto layout_converter = tir::BijectiveLayout(src_layout, dst_layout);
-  CHECK(layout_converter.defined())
+  ICHECK(layout_converter.defined())
      << "cannot convert from " << params->src_layout << " to " << params->dst_layout;

   const auto& out_shape = layout_converter.ForwardShape(data->shape);
@@ -2792,39 +2792,39 @@ TVM_REGISTER_NODE_TYPE(GatherAttrs);
 bool GatherRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                const TypeReporter& reporter) {
   // `types` contains: [data, indices, result]
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* indices = types[1].as<TensorTypeNode>();
   if (data == nullptr) {
-    CHECK(types[0].as<IncompleteTypeNode>())
+    ICHECK(types[0].as<IncompleteTypeNode>())
        << "Gather: expect input data type to be TensorType but get " << types[0];
     return false;
   }
   if (indices == nullptr) {
-    CHECK(types[1].as<IncompleteTypeNode>())
+    ICHECK(types[1].as<IncompleteTypeNode>())
        << "Gather: expect indices type to be TensorType but get " << types[1];
     return false;
   }
-  CHECK(indices->dtype.is_int()) << "indices of take must be tensor of integer";
+  ICHECK(indices->dtype.is_int()) << "indices of take must be tensor of integer";
   const auto param = attrs.as<GatherAttrs>();
-  CHECK(param != nullptr);
-  CHECK(param->axis.defined());
+  ICHECK(param != nullptr);
+  ICHECK(param->axis.defined());
   const auto ndim_data = data->shape.size();
   const auto ndim_indices = indices->shape.size();
   int axis = param->axis->value;
-  CHECK_EQ(ndim_data, ndim_indices);
-  CHECK_GE(axis, 0);
-  CHECK_LT(axis, ndim_data);
+  ICHECK_EQ(ndim_data, ndim_indices);
+  ICHECK_GE(axis, 0);
+  ICHECK_LT(axis, ndim_data);

   std::vector<IndexExpr> oshape;
   oshape.reserve(ndim_data);
   for (size_t i = 0; i < ndim_data; ++i) {
     if (i == (size_t)axis) {
       const int64_t* indice_shape_i = tir::as_const_int(indices->shape[i]);
-      CHECK_GE(*indice_shape_i, 1);
+      ICHECK_GE(*indice_shape_i, 1);
     } else {
-      CHECK(reporter->AssertEQ(indices->shape[i], data->shape[i]));
+      ICHECK(reporter->AssertEQ(indices->shape[i], data->shape[i]));
     }
     oshape.emplace_back(indices->shape[i]);
   }
@@ -2872,23 +2872,23 @@ which must just be not null. Output will have same shape as ``indices``.
 bool GatherNDRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                  const TypeReporter& reporter) {
   // `types` contains: [data, indices, result]
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* indices = types[1].as<TensorTypeNode>();
   if (data == nullptr) {
-    CHECK(types[0].as<IncompleteTypeNode>())
+    ICHECK(types[0].as<IncompleteTypeNode>())
        << "GatherND: expect input data type to be TensorType but get " << types[0];
     return false;
   }
   if (indices == nullptr) {
-    CHECK(types[1].as<IncompleteTypeNode>())
+    ICHECK(types[1].as<IncompleteTypeNode>())
        << "GatherND: expect indices type to be TensorType but get " << types[1];
     return false;
   }
   const size_t ndim = data->shape.size();
   const IntImmNode* mdim = indices->shape[0].as<IntImmNode>();
   const size_t kdim = indices->shape.size() - 1;
-  CHECK(size_t(mdim->value) <= ndim) << "GatherND: indices shape does satisfy.";
+  ICHECK(size_t(mdim->value) <= ndim) << "GatherND: indices shape does not satisfy.";

   Array<IndexExpr> oshape;
   for (size_t i = 1; i < kdim + 1; ++i) oshape.push_back(indices->shape[i]);
@@ -2931,14 +2931,14 @@ TVM_REGISTER_NODE_TYPE(SequenceMaskAttrs);
 bool SequenceMaskRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                      const TypeReporter& reporter) {
   // `types` contains: [data, valid_length, result]
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* valid_length = types[1].as<TensorTypeNode>();
-  CHECK(data);
-  CHECK(valid_length);
+  ICHECK(data);
+  ICHECK(valid_length);
   const auto param = attrs.as<SequenceMaskAttrs>();
   Array<IndexExpr> valid_length_shape;
-  CHECK(param->axis == 0 || param->axis == 1);
+  ICHECK(param->axis == 0 || param->axis == 1);
   valid_length_shape.push_back(data->shape[1 - param->axis]);
   reporter->Assign(types[1], TensorType(valid_length_shape, valid_length->dtype));
   reporter->Assign(types[2], types[0]);
@@ -2948,7 +2948,7 @@ bool SequenceMaskRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 Array<te::Tensor> SequenceMaskCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                       const Type& out_type) {
   const auto* param = attrs.as<SequenceMaskAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   return Array<te::Tensor>{
       topi::sequence_mask(inputs[0], inputs[1], param->mask_value, param->axis)};
 }
@@ -3028,12 +3028,12 @@ TVM_REGISTER_NODE_TYPE(OneHotAttrs);
 bool OneHotRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
               const TypeReporter& reporter) {
   // `types` contains: [indices, on_value, off_value, result]
-  CHECK_EQ(types.size(), 4);
+  ICHECK_EQ(types.size(), 4);
   const auto* indices = types[0].as<TensorTypeNode>();
-  CHECK(indices);
+  ICHECK(indices);

   const auto param = attrs.as<OneHotAttrs>();
-  CHECK_GT(param->depth, 0);
+  ICHECK_GT(param->depth, 0);

   Array<IndexExpr> oshape;
   int ndim = indices->shape.size() + 1;
@@ -3054,7 +3054,7 @@ bool OneHotRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 Array<te::Tensor> OneHotCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                 const Type& out_type) {
   const auto* param = attrs.as<OneHotAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   return Array<te::Tensor>{
       topi::one_hot(inputs[0], inputs[1](), inputs[2](), param->depth, param->axis, param->dtype)};
 }
@@ -3098,23 +3098,23 @@ RELAY_REGISTER_OP("one_hot")
 /* relay.unravel_index */
 bool UnRavelIndexRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                      const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);
   const auto* indices = types[0].as<TensorTypeNode>();
   if (indices == nullptr) {
-    CHECK(types[0].as<IncompleteTypeNode>())
+    ICHECK(types[0].as<IncompleteTypeNode>())
        << "unravel_index: expect input type to be TensorType but get " << types[0];
     return false;
   }
-  CHECK(indices->dtype.is_int()) << "indices of unravel_index must be tensor of integer";
+  ICHECK(indices->dtype.is_int()) << "indices of unravel_index must be tensor of integer";

   const auto* shape = types[1].as<TensorTypeNode>();
   if (shape == nullptr) {
-    CHECK(types[1].as<IncompleteTypeNode>())
+    ICHECK(types[1].as<IncompleteTypeNode>())
        << "unravel_index: expect input type to be TensorType but get " << types[1];
     return false;
   }
-  CHECK(indices->dtype.is_int()) << "shape of unravel_index must be tensor of integer";
+  ICHECK(shape->dtype.is_int()) << "shape of unravel_index must be tensor of integer";

   Array<IndexExpr> indices_shape;
   Array<IndexExpr> shape_shape;
@@ -3160,7 +3160,7 @@ TVM_REGISTER_NODE_TYPE(SparseToDenseAttrs);
 bool SparseToDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                       const TypeReporter& reporter) {
-  CHECK_EQ(num_inputs, 3);
+  ICHECK_EQ(num_inputs, 3);
   auto sparse_indices = types[0].as<TensorTypeNode>();
   auto sparse_values = types[1].as<TensorTypeNode>();
   auto default_value = types[2].as<TensorTypeNode>();
@@ -3169,17 +3169,17 @@ bool SparseToDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     return false;
   }

-  CHECK(sparse_indices->dtype.is_int()) << "sparse_indices must be tensor of integers";
+  ICHECK(sparse_indices->dtype.is_int()) << "sparse_indices must be tensor of integers";

-  CHECK_LE(sparse_indices->shape.size(), 3)
+  ICHECK_LE(sparse_indices->shape.size(), 3)
      << "sparse_indices must be a tensor of either 0D, 1D or 2D";

-  CHECK_LE(sparse_values->shape.size(), 2) << "sparse_values must be a tensor of either 0D, 1D";
+  ICHECK_LE(sparse_values->shape.size(), 2) << "sparse_values must be a tensor of either 0D, 1D";

-  CHECK_EQ(default_value->shape.size(), 0) << "default_value should be a scalar";
+  ICHECK_EQ(default_value->shape.size(), 0) << "default_value should be a scalar";

   const auto* param = attrs.as<SparseToDenseAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);

   Array<IndexExpr> oshape;
   for (auto i : param->output_shape) {
@@ -3191,9 +3191,9 @@ bool SparseToDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 Array<te::Tensor> SparseToDenseCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                        const Type& out_type) {
-  CHECK_EQ(inputs.size(), 3);
+  ICHECK_EQ(inputs.size(), 3);
   const auto* param = attrs.as<SparseToDenseAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   return {topi::sparse_to_dense(inputs[0], param->output_shape, inputs[1], inputs[2]())};
 }
@@ -3238,16 +3238,16 @@ TVM_REGISTER_NODE_TYPE(MatrixSetDiagAttrs);
 bool MatrixSetDiagRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                       const TypeReporter& reporter) {
   // `types` contains: [input, diagonal, result]
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);

   const auto* input = types[0].as<TensorTypeNode>();
-  CHECK(input);
+  ICHECK(input);

   const auto* diagonal = types[1].as<TensorTypeNode>();
-  CHECK(diagonal);
+  ICHECK(diagonal);

   const auto param = attrs.as<MatrixSetDiagAttrs>();
-  CHECK_GE(param->k2, param->k1);
+  ICHECK_GE(param->k2, param->k1);

   int d_ndims = diagonal->shape.size();
   int i_ndims = input->shape.size();
@@ -3276,7 +3276,7 @@ bool MatrixSetDiagRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 Array<te::Tensor> MatrixSetDiagCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                        const Type& out_type) {
   const auto* param = attrs.as<MatrixSetDiagAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   return Array<te::Tensor>{topi::matrix_set_diag(inputs[0], inputs[1], param->k1, param->k2,
                                                  param->super_diag_right_align,
                                                  param->sub_diag_right_align)};
@@ -3317,7 +3317,7 @@ RELAY_REGISTER_OP("matrix_set_diag")
 // adv_index
 bool AdvIndexRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                  const TypeReporter& reporter) {
-  CHECK_EQ(num_inputs, 1);
+  ICHECK_EQ(num_inputs, 1);
   auto inputs = types[0].as<TupleTypeNode>();
   auto data = inputs->fields[0].as<TensorTypeNode>();
@@ -3337,7 +3337,7 @@ bool AdvIndexRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     if (index_type == nullptr) {
       return false;
     }
-    CHECK(index_type->dtype.is_int()) << "indices must be tensor of integers";
+    ICHECK(index_type->dtype.is_int()) << "indices must be tensor of integers";

     int64_t flatten_len = 1;
     bool has_dyn_shape = false;
diff --git a/src/relay/op/tensor/transform.h b/src/relay/op/tensor/transform.h
index 0fe4734fe883..4173d57a84de 100644
--- a/src/relay/op/tensor/transform.h
+++ b/src/relay/op/tensor/transform.h
@@ -44,7 +44,7 @@ template <typename AttrType>
 bool ConcatenateRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                     const TypeReporter& reporter) {
   // types: [data, result]
-  CHECK_EQ(types.size(), 2);
+  ICHECK_EQ(types.size(), 2);
   /* If we receive a tuple we can continue, if we receive
    * anything but an incomplete type we should signal an
    * error.
@@ -131,9 +131,9 @@ static inline Array<Array<Layout>> ConcatenateLayout(const Attrs& attrs,
   ConcatenateAttrs* param = const_cast<ConcatenateAttrs*>(attrs.as<ConcatenateAttrs>());

   Array<Array<IndexExpr>> old_in_shapes;
-  CHECK_EQ(old_in_types.size(), 1);
+  ICHECK_EQ(old_in_types.size(), 1);
   for (auto old_in_tuple_t : old_in_types) {
-    CHECK(old_in_tuple_t.as<TupleTypeNode>());
+    ICHECK(old_in_tuple_t.as<TupleTypeNode>());
     for (auto old_in_t : old_in_tuple_t.as<TupleTypeNode>()->fields) {
       old_in_shapes.push_back(old_in_t.as<TensorTypeNode>()->shape);
     }
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index 59ef47f413fe..e17bdc0e0906 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -424,9 +424,9 @@ TVM_REGISTER_NODE_TYPE(ShapeOfAttrs);
 Array<te::Tensor> ShapeOfCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                  const Type& out_type) {
-  CHECK_EQ(inputs.size(), 1);
+  ICHECK_EQ(inputs.size(), 1);
   const auto* param = attrs.as<ShapeOfAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   return {topi::shape(inputs[0], param->dtype)};
 }
@@ -456,7 +456,7 @@ TVM_REGISTER_NODE_TYPE(NdarraySizeAttrs);
 bool NdarraySizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                     const TypeReporter& reporter) {
-  CHECK_EQ(num_inputs, 1);
+  ICHECK_EQ(num_inputs, 1);
   auto tt = types[0].as<TensorTypeNode>();

   if (tt == nullptr) {
@@ -464,16 +464,16 @@ bool NdarraySizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   }

   const auto* param = attrs.as<NdarraySizeAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   reporter->Assign(types[1], TensorType({}, param->dtype));
   return true;
 }

 Array<te::Tensor> NdarraySizeCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                      const Type& out_type) {
-  CHECK_EQ(inputs.size(), 1);
+  ICHECK_EQ(inputs.size(), 1);
   const auto* param = attrs.as<NdarraySizeAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   return Array<te::Tensor>{topi::ndarray_size(inputs[0], param->dtype)};
 }
diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc
index 0647ec9780f3..3dc33c5022e0 100644
--- a/src/relay/op/type_relations.cc
+++ b/src/relay/op/type_relations.cc
@@ -99,12 +99,12 @@ Type ConcreteBroadcast(const TensorType& t1, const TensorType& t2, DataType output_dtype) {
 bool BroadcastRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                   const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);
   // DLOG(INFO) << "In1:" << types[0] << ",In2:" << types[1]
   //            << ",Out:" << types[2] << std::endl;
   if (auto* t0 = types[0].as<TensorTypeNode>()) {
     if (auto* t1 = types[1].as<TensorTypeNode>()) {
-      CHECK_EQ(t0->dtype, t1->dtype);
+      ICHECK_EQ(t0->dtype, t1->dtype);
       reporter->Assign(
           types[2], ConcreteBroadcast(GetRef<TensorType>(t0), GetRef<TensorType>(t1), t0->dtype));
       return true;
@@ -115,12 +115,12 @@ bool BroadcastRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,

 bool BroadcastCompRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                       const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);
   // DLOG(INFO) << "In1:" << types[0] << ",In2:" << types[1]
   //            << ",Out:" << types[2] << std::endl;
   if (auto* t0 = types[0].as<TensorTypeNode>()) {
     if (auto* t1 = types[1].as<TensorTypeNode>()) {
-      CHECK_EQ(t0->dtype, t1->dtype);
+      ICHECK_EQ(t0->dtype, t1->dtype);
       reporter->Assign(types[2], ConcreteBroadcast(GetRef<TensorType>(t0), GetRef<TensorType>(t1),
                                                    DataType::Bool()));
       return true;
@@ -149,13 +149,13 @@ Array<IndexExpr> RankShape(const Array<IndexExpr>& shape) {
 bool ShapeOfRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                 const TypeReporter& reporter) {
-  CHECK_EQ(num_inputs, 1);
+  ICHECK_EQ(num_inputs, 1);
   auto tt = types[0].as<TensorTypeNode>();
   if (tt == nullptr) {
     return false;
   }
   const auto* param = attrs.as<ShapeOfAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   auto rank_shape = RankShape(tt->shape);
   reporter->Assign(types[1], TensorType(rank_shape, param->dtype));
   return true;
diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc
index b766facff050..17d0a4718298 100644
--- a/src/relay/op/vision/multibox_op.cc
+++ b/src/relay/op/vision/multibox_op.cc
@@ -32,12 +32,12 @@ TVM_REGISTER_NODE_TYPE(MultiBoxPriorAttrs);
 bool MultiboxPriorRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                       const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 2);
+  ICHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   const MultiBoxPriorAttrs* param = attrs.as<MultiBoxPriorAttrs>();
   const auto& dshape = data->shape;
-  CHECK_EQ(dshape.size(), 4) << "Input data should be 4D: "
-                                "[batch, channel, height, width]";
+  ICHECK_EQ(dshape.size(), 4) << "Input data should be 4D: "
+                                 "[batch, channel, height, width]";
   IndexExpr in_height = dshape[2];
   IndexExpr in_width = dshape[3];
   int num_sizes = static_cast<int>(param->sizes.size());
@@ -78,7 +78,7 @@ TVM_REGISTER_NODE_TYPE(MultiBoxTransformLocAttrs);
 bool MultiBoxTransformLocRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                              const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 4);
+  ICHECK_EQ(types.size(), 4);

   const auto* cls_prob = types[0].as<TensorTypeNode>();
   const auto* loc_pred = types[1].as<TensorTypeNode>();
@@ -92,17 +92,17 @@ bool MultiBoxTransformLocRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   const auto& loc_shape = loc_pred->shape;
   const auto& anchor_shape = anchor->shape;

-  CHECK_EQ(cls_shape.size(), 3U) << "The dimension of class probability should be 3, but received "
-                                 << cls_shape.size();
-  CHECK_EQ(loc_shape.size(), 2U)
+  ICHECK_EQ(cls_shape.size(), 3U) << "The dimension of class probability should be 3, but received "
+                                  << cls_shape.size();
+  ICHECK_EQ(loc_shape.size(), 2U)
      << "The dimension of location prediction should be 2, but received " << loc_shape.size();
-  CHECK_EQ(anchor_shape.size(), 3U)
+  ICHECK_EQ(anchor_shape.size(), 3U)
      << "The dimension of anchor should be 3, but received " << anchor_shape.size();

-  CHECK(reporter->AssertEQ(cls_shape[2], anchor_shape[1])) << "Number of anchors mismatch found";
-  CHECK(reporter->AssertEQ(cls_shape[2] * 4, loc_shape[1])) << "# anchors mismatch with # loc.";
-  CHECK(reporter->Assert(anchor_shape[1] > 0)) << "Number of anchors must > 0.";
-  CHECK(reporter->AssertEQ(anchor_shape[2], 4));
+  ICHECK(reporter->AssertEQ(cls_shape[2], anchor_shape[1])) << "Number of anchors mismatch found";
+  ICHECK(reporter->AssertEQ(cls_shape[2] * 4, loc_shape[1])) << "# anchors mismatch with # loc.";
+  ICHECK(reporter->Assert(anchor_shape[1] > 0)) << "Number of anchors must > 0.";
+  ICHECK(reporter->AssertEQ(anchor_shape[2], 4));
   std::vector<IndexExpr> oshape0({cls_shape[0], anchor_shape[1], 6});
   std::vector<IndexExpr> oshape1({cls_shape[0]});
diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc
index f9cdaf66e255..76fdf2829ed0 100644
--- a/src/relay/op/vision/nms.cc
+++ b/src/relay/op/vision/nms.cc
@@ -31,10 +31,10 @@ TVM_REGISTER_NODE_TYPE(GetValidCountsAttrs);
 bool GetValidCountRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                       const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 2);
+  ICHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto& dshape = data->shape;
-  CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D.";
+  ICHECK_EQ(dshape.size(), 3) << "Input data should be 3-D.";

   std::vector<IndexExpr> oshape({data->shape[0]});
   std::vector<IndexExpr> oshape_indices({data->shape[0], data->shape[1]});
@@ -73,14 +73,14 @@ TVM_REGISTER_NODE_TYPE(NonMaximumSuppressionAttrs);
 bool NMSRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
             const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 5);
+  ICHECK_EQ(types.size(), 5);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* valid_count = types[1].as<TensorTypeNode>();
   const NonMaximumSuppressionAttrs* param = attrs.as<NonMaximumSuppressionAttrs>();
   const auto& dshape = data->shape;
   const auto& vshape = valid_count->shape;
-  CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D.";
-  CHECK_EQ(vshape.size(), 1) << "Input valid count should be 1-D.";
+  ICHECK_EQ(dshape.size(), 3) << "Input data should be 3-D.";
+  ICHECK_EQ(vshape.size(), 1) << "Input valid count should be 1-D.";

   // assign output type
   if (param->return_indices) {
diff --git a/src/relay/op/vision/rcnn_op.cc b/src/relay/op/vision/rcnn_op.cc
index 6550815c6422..8be38d020480 100644
--- a/src/relay/op/vision/rcnn_op.cc
+++ b/src/relay/op/vision/rcnn_op.cc
@@ -35,23 +35,23 @@ TVM_REGISTER_NODE_TYPE(ROIAlignAttrs);
 bool ROIAlignRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                  const TypeReporter& reporter) {
   auto roi_align_attrs = attrs.as<ROIAlignAttrs>();
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* rois = types[1].as<TensorTypeNode>();
-  CHECK(data);
-  CHECK(rois);
+  ICHECK(data);
+  ICHECK(rois);
   const auto& dshape = data->shape;
   const auto& rshape = rois->shape;
-  CHECK(roi_align_attrs);
-  CHECK_EQ(dshape.size(), 4) << "Input data should be 4-D.";
-  CHECK_EQ(rshape.size(), 2) << "Input rois should be 2-D.";
+  ICHECK(roi_align_attrs);
+  ICHECK_EQ(dshape.size(), 4) << "Input data should be 4-D.";
+  ICHECK_EQ(rshape.size(), 2) << "Input rois should be 2-D.";
   // assign output type
   std::vector<IndexExpr> oshape;
   if (roi_align_attrs->layout == "NCHW") {
     oshape = {rshape[0], dshape[1], roi_align_attrs->pooled_size[0],
               roi_align_attrs->pooled_size[1]};
   } else {
-    CHECK_EQ(roi_align_attrs->layout, "NHWC") << "Unexpected ROI Align layout";
+    ICHECK_EQ(roi_align_attrs->layout, "NHWC") << "Unexpected ROI Align layout";
     oshape = {rshape[0], roi_align_attrs->pooled_size[0], roi_align_attrs->pooled_size[1],
               dshape[3]};
   }
@@ -111,15 +111,15 @@ TVM_REGISTER_NODE_TYPE(ROIPoolAttrs);
 bool ROIPoolRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                 const TypeReporter& reporter) {
   auto roi_pool_attrs = attrs.as<ROIPoolAttrs>();
-  CHECK_EQ(types.size(), 3);
+  ICHECK_EQ(types.size(), 3);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* rois = types[1].as<TensorTypeNode>();
   const auto& dshape = data->shape;
   const auto& rshape = rois->shape;
-  CHECK(roi_pool_attrs);
-  CHECK_EQ(dshape.size(), 4) << "Input data should be 4-D.";
-  CHECK_EQ(rshape.size(), 2) << "Input rois should be 2-D.";
-  CHECK_EQ(roi_pool_attrs->layout, "NCHW") << "ROI Pool only supports NCHW layout";
+  ICHECK(roi_pool_attrs);
+  ICHECK_EQ(dshape.size(), 4) << "Input data should be 4-D.";
+  ICHECK_EQ(rshape.size(), 2) << "Input rois should be 2-D.";
+  ICHECK_EQ(roi_pool_attrs->layout, "NCHW") << "ROI Pool only supports NCHW layout";
   // assign output type
   std::vector<IndexExpr> oshape(
       {rshape[0], dshape[1], roi_pool_attrs->pooled_size[0], roi_pool_attrs->pooled_size[1]});
@@ -160,7 +160,7 @@ TVM_REGISTER_NODE_TYPE(ProposalAttrs);
 bool ProposalRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                  const TypeReporter& reporter) {
   auto proposal_attrs = attrs.as<ProposalAttrs>();
-  CHECK_EQ(types.size(), 4);
+  ICHECK_EQ(types.size(), 4);
   const auto* cls_prob = types[0].as<TensorTypeNode>();
   const auto* bbox_pred = types[1].as<TensorTypeNode>();
   const auto* im_info = types[2].as<TensorTypeNode>();
@@ -169,13 +169,13 @@ bool ProposalRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     return false;
   }

-  CHECK_EQ(cls_prob->shape.size(), 4U)
+  ICHECK_EQ(cls_prob->shape.size(), 4U)
      << "The dimension of class probability should be 4, but received " << cls_prob->shape.size();
-  CHECK_EQ(bbox_pred->shape.size(), 4U)
+  ICHECK_EQ(bbox_pred->shape.size(), 4U)
      << "The dimension of box prediction should be 4, but received " << bbox_pred->shape.size();
-  CHECK_EQ(im_info->shape.size(), 2U)
+  ICHECK_EQ(im_info->shape.size(), 2U)
      << "The dimension of image info should be 2, but received " << im_info->shape.size();
-  CHECK(reporter->AssertEQ(im_info->shape[1], 3));
+  ICHECK(reporter->AssertEQ(im_info->shape[1], 3));

   auto batch = cls_prob->shape[0];
diff --git a/src/relay/op/vision/yolo.cc b/src/relay/op/vision/yolo.cc
index cfd81131be73..70d882061299 100644
--- a/src/relay/op/vision/yolo.cc
+++ b/src/relay/op/vision/yolo.cc
@@ -44,14 +44,14 @@ TVM_REGISTER_NODE_TYPE(YoloReorgAttrs);
 */
 bool YoloReorgRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                   const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 2);
+  ICHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) return false;

   const YoloReorgAttrs* param = attrs.as<YoloReorgAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);

-  CHECK(data->shape.size() == 4) << "Yolo reorg supports only 4 dimension.";
+  ICHECK(data->shape.size() == 4) << "Yolo reorg supports only 4 dimensions.";
   std::vector<IndexExpr> oshape(data->shape.begin(), data->shape.end());
   oshape[1] = oshape[1] * param->stride * param->stride;
   oshape[2] = indexdiv(oshape[2], param->stride);
@@ -80,7 +80,7 @@ Its function is mostly shape transform.")doc" TVM_ADD_FILELINE)
     .set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs, const Array<te::Tensor>& inputs,
                                              const Type& out_type) {
       const auto* params = attrs.as<YoloReorgAttrs>();
-      CHECK(params != nullptr);
+      ICHECK(params != nullptr);
       return Array<te::Tensor>{topi::vision::reorg(inputs[0], params->stride)};
     });
diff --git a/src/relay/op/vm/vm.cc b/src/relay/op/vm/vm.cc
index 424ed5f4bc98..8c1c9f3e9c59 100644
--- a/src/relay/op/vm/vm.cc
+++ b/src/relay/op/vm/vm.cc
@@ -69,12 +69,12 @@ TVM_REGISTER_GLOBAL("relay.op.vm.shape_func")
 bool ShapeFuncRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                   const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 4u);
+  ICHECK_EQ(types.size(), 4u);
   auto shape_func_attrs = attrs.as<ShapeFuncAttrs>();
-  CHECK(shape_func_attrs != nullptr) << "Internal compiler error";
+  ICHECK(shape_func_attrs != nullptr) << "Internal compiler error";

   auto func_type = types[0].as<FuncTypeNode>();
-  CHECK(func_type != nullptr);
+  ICHECK(func_type != nullptr);

   auto tuple = TupleType(func_type->arg_types);
   auto in_types = FlattenTupleType(tuple);
@@ -137,20 +137,20 @@ RELAY_REGISTER_OP("vm.shape_func")
 // vm.invoke_tvm_op
 bool InvokeTVMOpRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                     const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 4u);
+  ICHECK_EQ(types.size(), 4u);
   auto func_type = types[0].as<FuncTypeNode>();
-  CHECK(func_type != nullptr) << "input must be operator with known type";
+  ICHECK(func_type != nullptr) << "input must be operator with known type";
   auto input_type = types[1].as<TupleTypeNode>();
   auto output_type = types[2].as<TupleTypeNode>();
-  CHECK(input_type != nullptr)
+  ICHECK(input_type != nullptr)
      << "internal invariant violated: invoke_tvm_op inputs must be a tuple";
-  CHECK(output_type != nullptr)
+  ICHECK(output_type != nullptr)
      << "internal invariant violated: invoke_tvm_op outputs must be a tuple";
   Type ex_output;
   if (func_type->ret_type.as<TensorTypeNode>()) {
     ex_output = TupleType({func_type->ret_type});
   } else {
-    CHECK(func_type->ret_type.as<TupleTypeNode>()) << "should be tuple type";
+    ICHECK(func_type->ret_type.as<TupleTypeNode>()) << "should be tuple type";
     ex_output = func_type->ret_type;
   }
   auto ex_input = TupleType(func_type->arg_types);
@@ -188,11 +188,11 @@ TVM_REGISTER_NODE_TYPE(ReshapeTensorAttrs);
 bool ReshapeTensorRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                       const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 3u);
+  ICHECK_EQ(types.size(), 3u);
   auto reshape_attrs = attrs.as<ReshapeTensorAttrs>();
-  CHECK(reshape_attrs);
+  ICHECK(reshape_attrs);
   auto tt = types[0].as<TensorTypeNode>();
-  CHECK(tt) << "input must be tensor type";
+  ICHECK(tt) << "input must be tensor type";
   reporter->Assign(types[2], TensorType(reshape_attrs->newshape, tt->dtype));
   return true;
 }
diff --git a/src/relay/qnn/op/concatenate.cc b/src/relay/qnn/op/concatenate.cc
index 88d2ecc9b45b..7a716a1ec498 100644
--- a/src/relay/qnn/op/concatenate.cc
+++ b/src/relay/qnn/op/concatenate.cc
@@ -38,7 +38,7 @@ namespace qnn {
 bool QnnConcatenateRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                        const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 6);
+  ICHECK_EQ(types.size(), 6);

   // Check the scale and zero point types
   const auto* input_scales_tuple = types[1].as<TupleTypeNode>();
@@ -48,7 +48,7 @@ bool QnnConcatenateRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                        << PrettyPrint(types[1]));
   }
   for (const auto& input_scale : input_scales_tuple->fields) {
-    CHECK(IsScalarType(input_scale, DataType::Float(32)));  // input_scales[idx]
+    ICHECK(IsScalarType(input_scale, DataType::Float(32)));  // input_scales[idx]
   }

   const auto* input_zero_points_tuple = types[2].as<TupleTypeNode>();
@@ -58,11 +58,11 @@ bool QnnConcatenateRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                        << PrettyPrint(types[2]));
   }
   for (const auto& input_zero_point : input_zero_points_tuple->fields) {
-    CHECK(IsScalarType(input_zero_point, DataType::Int(32)));  // input_zero_points[idx]
+    ICHECK(IsScalarType(input_zero_point, DataType::Int(32)));  // input_zero_points[idx]
   }

-  CHECK(IsScalarType(types[3], DataType::Float(32)));  // output_scale
-  CHECK(IsScalarType(types[4], DataType::Int(32)));    // output_zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // output_scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // output_zero_point

   // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
   // Concatenate infer type function.
@@ -74,9 +74,9 @@ Array<Array<Layout>> QnnConcatenateLayout(const Attrs& attrs,
                                           const Array<Layout>& old_in_layouts,
                                           const Array<tvm::relay::Type>& old_in_types) {
   // Collect the layouts and types to reuse Relay Concatenate Infer Correct Layout.
-  CHECK_EQ(old_in_types.size(), 5);
+  ICHECK_EQ(old_in_types.size(), 5);
   auto input_tuple_type = old_in_types[0].as<TupleTypeNode>();
-  CHECK(input_tuple_type);
+  ICHECK(input_tuple_type);
   auto num_input_tensors = input_tuple_type->fields.size();

   Array<Layout> relay_new_in_layouts(nullptr);
@@ -126,19 +126,19 @@ Expr MakeQnnConcatenate(Expr data, Expr input_scales, Expr input_zero_points, Expr output_scale,
 Expr ConcatenateQnnCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
                                 const Array<tvm::relay::Type>& arg_types) {
   // Get the attrs.
-  CHECK_EQ(new_args.size(), 5);
+  ICHECK_EQ(new_args.size(), 5);
   auto& data = new_args[0];
   auto& input_scales = new_args[1];
   auto& input_zero_points = new_args[2];
   auto& output_scale = new_args[3];
   auto& output_zero_point = new_args[4];
   const auto* concatenate_attrs = attrs.as<ConcatenateAttrs>();
-  CHECK(concatenate_attrs != nullptr);
+  ICHECK(concatenate_attrs != nullptr);

   // Get the input dtype and shape.
-  CHECK_GE(arg_types.size(), 1);
+  ICHECK_GE(arg_types.size(), 1);
   auto tuple_type = arg_types[0].as<TupleTypeNode>();
-  CHECK(tuple_type != nullptr);
+  ICHECK(tuple_type != nullptr);

   // FIXME (anijain2305) - The lowering can be further optimized. Instead of inserting requantize in
   // the start, we can insert requantize at the end if and only if all the input tensors have same
@@ -156,13 +156,13 @@ Expr ConcatenateQnnCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
       tuple_exprs.push_back(TupleGetItem(call, i));
     }
   }
-  CHECK(!tuple_exprs.empty());
+  ICHECK(!tuple_exprs.empty());

   auto tuple_input_scales = input_scales.as<TupleNode>();
-  CHECK(tuple_input_scales != nullptr);
+  ICHECK(tuple_input_scales != nullptr);

   auto tuple_input_zero_points = input_zero_points.as<TupleNode>();
-  CHECK(tuple_input_zero_points != nullptr);
+  ICHECK(tuple_input_zero_points != nullptr);

   int idx = 0;
   Array<Expr> requantized_exprs;
diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc
index 73ee4561907d..a9f2f361f2b3 100644
--- a/src/relay/qnn/op/convolution.cc
+++ b/src/relay/qnn/op/convolution.cc
@@ -42,34 +42,34 @@ namespace qnn {
 bool QnnConv2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                   const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 7);
+  ICHECK_EQ(types.size(), 7);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* weight = types[1].as<TensorTypeNode>();
   if (data == nullptr || weight == nullptr) return false;
   const auto* param = attrs.as<Conv2DAttrs>();
-  CHECK(param != nullptr) << "Conv2DAttrs cannot be nullptr.";
-  CHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8))
+  ICHECK(param != nullptr) << "Conv2DAttrs cannot be nullptr.";
+  ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8))
      << "Expected qnn conv2d type(int8, uint8) for input but was " << data->dtype;
-  CHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
+  ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
      << "Expected qnn conv2d type(int8, uint8) for weight but was " << weight->dtype;
-  CHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32))
+  ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32))
      << "Expected qnn conv2d type(int32, int16) for output but was " << param->out_dtype;
-  CHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";
+  ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";

   // Check the types of scale and zero points.
-  CHECK(IsScalarType(types[2], DataType::Int(32)));    // input_zero_point
-  CHECK(IsScalarType(types[3], DataType::Int(32)));    // kernel_zero_point
-  CHECK(IsScalarType(types[4], DataType::Float(32)));  // input_scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // input_zero_point
+  ICHECK(IsScalarType(types[3], DataType::Int(32)));    // kernel_zero_point
+  ICHECK(IsScalarType(types[4], DataType::Float(32)));  // input_scale
   // Kernel scale can be a vector of length output_channels or a scalar.
   if (param->groups == 1) {
     size_t axis = param->kernel_layout.operator std::string().find('O');
-    CHECK(axis != std::string::npos) << "Kernel layout attribute is not defined";
+    ICHECK(axis != std::string::npos) << "Kernel layout attribute is not defined";
     AssignType(types[5], DataType::Float(32), weight->shape[axis], reporter);  // kernel scale
   } else {
     // Here, total number of output channels depend on depth multiplier.
     size_t o_axis = param->kernel_layout.operator std::string().find('O');
     size_t i_axis = param->kernel_layout.operator std::string().find('I');
-    CHECK(o_axis != std::string::npos || i_axis != std::string::npos)
+    ICHECK(o_axis != std::string::npos || i_axis != std::string::npos)
        << "Kernel layout attribute is not defined";
     AssignType(types[5], DataType::Float(32), weight->shape[i_axis] * weight->shape[o_axis],
                reporter);  // kernel scale
@@ -628,18 +628,18 @@ Expr Conv2DCombineTerms(const Expr& term1, const Expr& term2, const Expr& term3,
 */
 Expr QnnConv2DCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
                            const Array<tvm::relay::Type>& arg_types) {
-  CHECK_EQ(new_args.size(), 6);
+  ICHECK_EQ(new_args.size(), 6);
   Expr data = new_args[0];
   Expr weight = new_args[1];
   Expr input_zero_point = new_args[2];
   Expr kernel_zero_point = new_args[3];
   const auto* param = attrs.as<Conv2DAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);
   // Assertion checks for exisiing support.
-  CHECK(param->data_layout == "NCHW" || param->data_layout == "NHWC")
+  ICHECK(param->data_layout == "NCHW" || param->data_layout == "NHWC")
      << "qnn.conv2d supports only NCHW/NHWC input data layout.";
-  CHECK(param->kernel_layout == "OIHW" || param->kernel_layout == "HWIO" ||
-        param->kernel_layout == "HWOI")
+  ICHECK(param->kernel_layout == "OIHW" || param->kernel_layout == "HWIO" ||
+         param->kernel_layout == "HWOI")
      << "qnn.conv2d supports only OIHW/HWIO/HWOI kernel data layout.";

   int batch_size, in_channels, out_channels, kernel_h, kernel_w, channel_multiplier;
@@ -655,14 +655,14 @@ Expr QnnConv2DCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
   // traverse the elements in dilated manner. Currently, we do not have strided pool. So, in case of
   // dilated conv with non-zero kernel point, we fall back to simpler but slow lowering.
-  CHECK_EQ(param->dilation.size(), 2) << "qnn.conv2d only supports 2D dilation";
+  ICHECK_EQ(param->dilation.size(), 2) << "qnn.conv2d only supports 2D dilation";
   auto dilation_h = get_const_int(param->dilation[0]);
   auto dilation_w = get_const_int(param->dilation[1]);
   if ((kernel_zero_point_int != 0 && (dilation_h != 1 || dilation_w != 1)) ||
       (param->groups != 1 && !is_depthwise(param))) {
     return Conv2DFallBack(data, weight, input_zero_point, kernel_zero_point, param);
   } else if (is_depthwise(param)) {
-    CHECK_NE(channel_multiplier, -1);
+    ICHECK_NE(channel_multiplier, -1);
     auto padded_data = Conv2DPadInput(data, input_zero_point, param);
     auto term1 = Conv2DFirstTerm(padded_data, weight, param);
     auto term2 = DepthwiseConv2DSecondTerm(padded_data, kernel_zero_point, param, kernel_h,
diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc
index e1cbfaf98df1..62988c8cc52f 100644
--- a/src/relay/qnn/op/dense.cc
+++ b/src/relay/qnn/op/dense.cc
@@ -39,26 +39,26 @@ namespace qnn {
 bool QnnDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                  const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 7);
+  ICHECK_EQ(types.size(), 7);
   const auto* data = types[0].as<TensorTypeNode>();
   const auto* weight = types[1].as<TensorTypeNode>();
   if (data == nullptr || weight == nullptr) return false;
   const auto* param = attrs.as<DenseAttrs>();
-  CHECK(param != nullptr) << "DenseAttrs cannot be nullptr.";
-  CHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8))
+  ICHECK(param != nullptr) << "DenseAttrs cannot be nullptr.";
+  ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8))
      << "Expected quantized dense type(int8, uint8) for input but was " << data->dtype;
-  CHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
+  ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
      << "Expected quantized dense type(int8, uint8) for weight but was " << weight->dtype;
-  CHECK(param->out_dtype == DataType::Int(32))
+  ICHECK(param->out_dtype == DataType::Int(32))
      << "Expected quantized dense type(int32) for output but was " << param->out_dtype;

   // Check the types of scale and zero points.
-  CHECK(IsScalarType(types[2], DataType::Int(32)));    // input_zero_point
-  CHECK(IsScalarType(types[3], DataType::Int(32)));    // kernel_zero_point
-  CHECK(IsScalarType(types[4], DataType::Float(32)));  // input_scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // input_zero_point
+  ICHECK(IsScalarType(types[3], DataType::Int(32)));    // kernel_zero_point
+  ICHECK(IsScalarType(types[4], DataType::Float(32)));  // input_scale
   AssignType(types[5], DataType::Float(32), param->units, reporter);
-  CHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";
+  ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";

   // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
   // Dense infer type function.
@@ -133,7 +133,7 @@ Expr DenseFourthTerm(int input_zero_point_int, int kernel_zero_point_int, int reduction_dim_size) {
 */
 Expr QnnDenseCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
                           const Array<tvm::relay::Type>& arg_types) {
-  CHECK_EQ(new_args.size(), 6);
+  ICHECK_EQ(new_args.size(), 6);
   Expr quantized_data = new_args[0];
   Expr quantized_kernel = new_args[1];
   Expr input_zero_point = new_args[2];
diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc
index 0a81f3fe4fdb..2e7a28624e26 100644
--- a/src/relay/qnn/op/dequantize.cc
+++ b/src/relay/qnn/op/dequantize.cc
@@ -38,7 +38,7 @@ TVM_REGISTER_NODE_TYPE(DequantizeAttrs);
 bool DequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                    const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 4);
+  ICHECK_EQ(types.size(), 4);
   const auto* data = types[0].as<TensorTypeNode>();

   if (data == nullptr) {
@@ -46,17 +46,17 @@ bool DequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   }

   const auto input_dtype = data->dtype;
-  CHECK(input_dtype == DataType::Int(8) || input_dtype == DataType::UInt(8) ||
-        input_dtype == DataType::Int(32))
+  ICHECK(input_dtype == DataType::Int(8) || input_dtype == DataType::UInt(8) ||
+         input_dtype == DataType::Int(32))
      << "Input type should be one of the quantized types [unit8, int8, int32] but was "
      << input_dtype;

   const auto* dequantize_attrs = attrs.as<DequantizeAttrs>();
   int axis = dequantize_attrs->axis;
   axis = (axis == -1) ? data->shape.size() - 1 : axis;
-  CHECK_LT(axis, static_cast<int>(data->shape.size()))
+  ICHECK_LT(axis, static_cast<int>(data->shape.size()))
      << "axis " << dequantize_attrs->axis << " is out of range";
-  CHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range";
+  ICHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range";

   // Check and assign types for scale and zero points.
   AssignType(types[1], DataType::Float(32), data->shape[axis], reporter);  // scale
@@ -103,22 +103,22 @@ Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale,
 Expr DequantizeQnnCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
                                const Array<tvm::relay::Type>& types) {
-  CHECK_EQ(new_args.size(), 3);
+  ICHECK_EQ(new_args.size(), 3);
   auto& data = new_args[0];
   auto& input_scale = new_args[1];
   auto& input_zero_point = new_args[2];
-  CHECK_EQ(types.size(), 4);
+  ICHECK_EQ(types.size(), 4);

   // Get attrs.
   const auto* dequantize_attrs = attrs.as<DequantizeAttrs>();
-  CHECK(dequantize_attrs != nullptr);
+  ICHECK(dequantize_attrs != nullptr);

   // Find input shape.
-  CHECK_EQ(types.size(), 4);
+  ICHECK_EQ(types.size(), 4);
   auto in_type = types[0];
   auto in_tensor_type = in_type.as<TensorTypeNode>();
-  CHECK(in_tensor_type != nullptr) << "Type information missing."
-                                   << " Please run infer_type pass.";
+  ICHECK(in_tensor_type != nullptr) << "Type information missing."
+ << " Please run infer_type pass."; Array input_shape = in_tensor_type->shape; return DequantizeLower(data, input_scale, input_zero_point, input_shape, dequantize_attrs); diff --git a/src/relay/qnn/op/op_common.h b/src/relay/qnn/op/op_common.h index 3ca8f64ac9d9..330802c4c9b1 100644 --- a/src/relay/qnn/op/op_common.h +++ b/src/relay/qnn/op/op_common.h @@ -68,7 +68,7 @@ struct QnnBinaryOpArguments { Expr output_zero_point; explicit QnnBinaryOpArguments(const Array& new_args) { - CHECK_EQ(new_args.size(), kNumQnnBinaryOpInputs); + ICHECK_EQ(new_args.size(), kNumQnnBinaryOpInputs); int idx = 0; lhs = new_args[idx++]; rhs = new_args[idx++]; @@ -78,7 +78,7 @@ struct QnnBinaryOpArguments { rhs_zero_point = new_args[idx++]; output_scale = new_args[idx++]; output_zero_point = new_args[idx++]; - CHECK_EQ(idx, kNumQnnBinaryOpInputs); + ICHECK_EQ(idx, kNumQnnBinaryOpInputs); } }; @@ -92,9 +92,9 @@ struct QnnBinaryOpTensorType { Array shape; explicit QnnBinaryOpTensorType(const Array& arg_types, const int32_t arg_idx) { - CHECK_EQ(arg_types.size(), kNumQnnBinaryOpArgTypes); + ICHECK_EQ(arg_types.size(), kNumQnnBinaryOpArgTypes); auto tensor_type = arg_types[arg_idx].as(); - CHECK(tensor_type != nullptr); + ICHECK(tensor_type != nullptr); dtype = tensor_type->dtype; shape = tensor_type->shape; } @@ -168,15 +168,15 @@ inline Array > QnnBinaryBroadcastLayout(const Attrs& attrs, static inline bool QnnBroadcastRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), kNumQnnBinaryOpArgTypes); + ICHECK_EQ(types.size(), kNumQnnBinaryOpArgTypes); // Check the scale and zero point types - CHECK(IsScalarType(types[2], DataType::Float(32))); // lhs_scale - CHECK(IsScalarType(types[3], DataType::Int(32))); // lhs_zero_point - CHECK(IsScalarType(types[4], DataType::Float(32))); // rhs_scale - CHECK(IsScalarType(types[5], DataType::Int(32))); // rhs_zero_point - CHECK(IsScalarType(types[6], DataType::Float(32))); // output_scale - CHECK(IsScalarType(types[7], DataType::Int(32))); // output_zero_point + ICHECK(IsScalarType(types[2], DataType::Float(32))); // lhs_scale + ICHECK(IsScalarType(types[3], DataType::Int(32))); // lhs_zero_point + ICHECK(IsScalarType(types[4], DataType::Float(32))); // rhs_scale + ICHECK(IsScalarType(types[5], DataType::Int(32))); // rhs_zero_point + ICHECK(IsScalarType(types[6], DataType::Float(32))); // output_scale + ICHECK(IsScalarType(types[7], DataType::Int(32))); // output_zero_point // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay // BroadcastRel infer type function. diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc index 07847916fae7..0622c96f04a6 100644 --- a/src/relay/qnn/op/quantize.cc +++ b/src/relay/qnn/op/quantize.cc @@ -38,7 +38,7 @@ TVM_REGISTER_NODE_TYPE(QuantizeAttrs); bool QuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); const auto* data = types[0].as(); if (data == nullptr) { @@ -46,15 +46,15 @@ bool QuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, } const auto input_dtype = data->dtype; - CHECK(input_dtype == DataType::Float(32)) + ICHECK(input_dtype == DataType::Float(32)) << "Input type should be one of float32 but was " << input_dtype; const auto* quantize_attrs = attrs.as(); int axis = quantize_attrs->axis; axis = (axis == -1) ? 
data->shape.size() - 1 : axis; - CHECK_LT(axis, static_cast(data->shape.size())) + ICHECK_LT(axis, static_cast(data->shape.size())) << "axis " << quantize_attrs->axis << " is out of range"; - CHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range"; + ICHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range"; // Check and assign types for scale and zero points. AssignType(types[1], DataType::Float(32), data->shape[axis], reporter); // scale @@ -62,8 +62,8 @@ bool QuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const Array oshape = data->shape; const DataType out_dtype = quantize_attrs->out_dtype; - CHECK(out_dtype == DataType::Int(8) || out_dtype == DataType::UInt(8) || - out_dtype == DataType::Int(32)) + ICHECK(out_dtype == DataType::Int(8) || out_dtype == DataType::UInt(8) || + out_dtype == DataType::Int(32)) << "Output type should be one of [int8, unit8, int32] but was " << out_dtype; // assign output type reporter->Assign(types[3], TensorType(oshape, out_dtype)); @@ -113,19 +113,19 @@ Expr QuantizeLower(const Expr& input_tensor, const Expr& output_scale, Expr QuantizeQnnCanonicalize(const Attrs& attrs, const Array& new_args, const Array& types) { - CHECK_EQ(new_args.size(), 3); + ICHECK_EQ(new_args.size(), 3); auto& data = new_args[0]; auto& output_scale = new_args[1]; auto& output_zero_point = new_args[2]; const auto* quantize_attrs = attrs.as(); - CHECK(quantize_attrs != nullptr); + ICHECK(quantize_attrs != nullptr); // Find input shape. - CHECK_EQ(types.size(), 4); + ICHECK_EQ(types.size(), 4); auto in_type = types[0]; auto in_tensor_type = in_type.as(); - CHECK(in_tensor_type != nullptr) << "Type information missing." - << " Please run infer_type pass."; + ICHECK(in_tensor_type != nullptr) << "Type information missing." + << " Please run infer_type pass."; Array input_shape = in_tensor_type->shape; return QuantizeLower(data, output_scale, output_zero_point, input_shape, quantize_attrs); diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 3572a3980ced..8e9b31e6fc39 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -44,7 +44,7 @@ Array> RequantizeInferCorrectLayout(const Attrs& attrs, Array> old_in_shapes; for (auto old_in_t : old_in_types) { - CHECK(old_in_t.as()); + ICHECK(old_in_t.as()); old_in_shapes.push_back(old_in_t.as()->shape); } @@ -52,8 +52,8 @@ Array> RequantizeInferCorrectLayout(const Attrs& attrs, if (new_in_layouts.defined()) { // Adapt to new layout. The axis has to change. // Record original reduce axis. Convert to the modified layout axis. - CHECK_EQ(new_in_layouts.size(), 5); - CHECK_EQ(old_in_layouts.size(), 5); + ICHECK_EQ(new_in_layouts.size(), 5); + ICHECK_EQ(old_in_layouts.size(), 5); // 1) Get the axis. int axis = param->axis; @@ -90,7 +90,7 @@ Array> RequantizeInferCorrectLayout(const Attrs& attrs, param->axis = new_axis; } else if (old_in_layouts.defined()) { // If the new layout is undefined, set the old layout as the inferred layout. 
-    CHECK_EQ(old_in_layouts.size(), 5);
+    ICHECK_EQ(old_in_layouts.size(), 5);

     Layout old_layout = old_in_layouts[0];
@@ -214,32 +214,32 @@ Expr RequantizeLower(const Expr& input_tensor, const Expr& input_scale,
 */
 Expr RequantizeQnnCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
                                const Array<tvm::relay::Type>& types) {
-  CHECK_EQ(new_args.size(), 5);
+  ICHECK_EQ(new_args.size(), 5);
   auto& quantized_data = new_args[0];
   auto& input_scale = new_args[1];
   auto& input_zero_point = new_args[2];
   auto& output_scale = new_args[3];
   auto& output_zero_point = new_args[4];
   const auto* param = attrs.as<RequantizeAttrs>();
-  CHECK(param != nullptr);
+  ICHECK(param != nullptr);

   // Find input shape.
-  CHECK_EQ(types.size(), 6);
+  ICHECK_EQ(types.size(), 6);
   auto in_type = types[0];
   auto in_tensor_type = in_type.as<TensorTypeNode>();
-  CHECK(in_tensor_type != nullptr) << "Type information missing."
-                                   << " Please run infer_type pass.";
+  ICHECK(in_tensor_type != nullptr) << "Type information missing."
+                                    << " Please run infer_type pass.";
   Array<IndexExpr> input_shape = in_tensor_type->shape;

   // Find the output dtype.
   auto out_type = types[5];
   auto out_tensor_type = out_type.as<TensorTypeNode>();
-  CHECK(out_tensor_type != nullptr) << "Type information missing."
-                                    << " Please run infer_type pass.";
+  ICHECK(out_tensor_type != nullptr) << "Type information missing."
+                                     << " Please run infer_type pass.";
   auto out_dtype = out_tensor_type->dtype;

   // Check rounding validity.
-  CHECK(param->rounding == "UPWARD" || param->rounding == "TONEAREST")
+  ICHECK(param->rounding == "UPWARD" || param->rounding == "TONEAREST")
      << "QNN requantize supports two rounding modes - UPWARD and "
      << "TONEAREST";
   return RequantizeLower(quantized_data, input_scale, input_zero_point, output_scale,
@@ -256,7 +256,7 @@ Expr RequantizeQnnCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
 */
 bool RequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                    const TypeReporter& reporter) {
-  CHECK_EQ(types.size(), 6);
+  ICHECK_EQ(types.size(), 6);
   const auto* data = types[0].as<TensorTypeNode>();

   if (data == nullptr) {
@@ -264,29 +264,29 @@ bool RequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   }

   const auto in_dtype = data->dtype;
-  CHECK(in_dtype == DataType::Int(8) || in_dtype == DataType::UInt(8) ||
-        in_dtype == DataType::Int(32))
+  ICHECK(in_dtype == DataType::Int(8) || in_dtype == DataType::UInt(8) ||
+         in_dtype == DataType::Int(32))
      << "Input type should be one of [int8, uint8, int32] but was " << in_dtype;

   const RequantizeAttrs* requantize_attrs = attrs.as<RequantizeAttrs>();
   int axis = requantize_attrs->axis;
   axis = (axis == -1) ? data->shape.size() - 1 : axis;
-  CHECK_LT(axis, static_cast<int>(data->shape.size()))
+  ICHECK_LT(axis, static_cast<int>(data->shape.size()))
      << "axis " << requantize_attrs->axis << " is out of range";
-  CHECK_GE(axis, 0) << "axis " << requantize_attrs->axis << " is out of range";
+  ICHECK_GE(axis, 0) << "axis " << requantize_attrs->axis << " is out of range";

   // Check and assign types for scale and zero points.
   AssignType(types[1], DataType::Float(32), data->shape[axis], reporter);  // input_scale
   AssignType(types[2], DataType::Int(32), data->shape[axis], reporter);    // input_zero_pt
   // For now, requantize output tensor is limited to full tensor uniform quantization.
-  CHECK(IsScalarType(types[3], DataType::Float(32)));  // output_scale
-  CHECK(IsScalarType(types[4], DataType::Int(32)));    // output_zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // output_scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // output_zero_point

   const Array<IndexExpr> oshape = data->shape;
   // assign output type
   auto out_dtype = requantize_attrs->out_dtype;
-  CHECK(out_dtype == DataType::Int(8) || out_dtype == DataType::UInt(8) ||
-        out_dtype == DataType::Int(32))
+  ICHECK(out_dtype == DataType::Int(8) || out_dtype == DataType::UInt(8) ||
+         out_dtype == DataType::Int(32))
      << "Output type should be one of [int8, uint8, int32] but was " << out_dtype;
   reporter->Assign(types[5], TensorType(oshape, out_dtype));
   return true;
diff --git a/src/relay/qnn/utils.cc b/src/relay/qnn/utils.cc
index fc59b61cc6a5..982efa0a61c1 100644
--- a/src/relay/qnn/utils.cc
+++ b/src/relay/qnn/utils.cc
@@ -46,12 +46,12 @@ std::pair<int32_t, int32_t> GetFixedPointMultiplierShift(double double_multiplier,
   // multiplying the double value with 2^31 and then casting to int.
   significand_d = std::round(significand_d * (1ll << 31));
   auto significand_int64 = static_cast<int64_t>(significand_d);
-  CHECK_LE(significand_int64, (1ll << 31));
+  ICHECK_LE(significand_int64, (1ll << 31));
   if (significand_int64 == (1ll << 31)) {
     significand_int64 /= 2;
     ++exponent;
   }
-  CHECK_LE(significand_int64, std::numeric_limits<int32_t>::max());
+  ICHECK_LE(significand_int64, std::numeric_limits<int32_t>::max());
   significand = static_cast<int32_t>(significand_int64);
   return std::make_pair(significand, exponent);
 }
diff --git a/src/relay/qnn/utils.h b/src/relay/qnn/utils.h
index f8885c36d162..ab5c9a4fbbe2 100644
--- a/src/relay/qnn/utils.h
+++ b/src/relay/qnn/utils.h
@@ -41,16 +41,16 @@ namespace qnn {
 static inline Array<IndexExpr> get_shape(const Type& type) {
   auto input_tt = type.as<TensorTypeNode>();
-  CHECK(input_tt != nullptr) << "Type information missing."
-                             << " Please run infer_type pass.";
+  ICHECK(input_tt != nullptr) << "Type information missing."
+ << " Please run infer_type pass."; return input_tt->shape; } static inline int32_t GetQmin(const DataType& dtype) { - CHECK_LE(dtype.bits(), 32) << "QNN ops support int32 or lower precision"; + ICHECK_LE(dtype.bits(), 32) << "QNN ops support int32 or lower precision"; if (dtype.is_int() || dtype.is_uint()) { auto* min_value = tir::as_const_int(tvm::min_value(dtype)); - CHECK(min_value != nullptr); + ICHECK(min_value != nullptr); return static_cast(min_value[0]); } else { LOG(FATAL) << "Type not supported " << dtype; @@ -59,10 +59,10 @@ static inline int32_t GetQmin(const DataType& dtype) { } static inline int32_t GetQmax(const DataType& dtype) { - CHECK_LE(dtype.bits(), 32) << "QNN ops support int32 or lower precision"; + ICHECK_LE(dtype.bits(), 32) << "QNN ops support int32 or lower precision"; if (dtype.is_int() || dtype.is_uint()) { auto* max_value = tir::as_const_int(tvm::max_value(dtype)); - CHECK(max_value != nullptr); + ICHECK(max_value != nullptr); return static_cast(max_value[0]); } else { LOG(FATAL) << "Type not supported " << dtype; @@ -109,7 +109,7 @@ static inline Expr Requantize(const Expr& data, const Array& input_sh static inline int64_t get_const_int(const tvm::PrimExpr& x) { auto* value_ptr = tir::as_const_int(x); - CHECK(value_ptr) << "Expr is not a constant int"; + ICHECK(value_ptr) << "Expr is not a constant int"; return value_ptr[0]; } @@ -172,10 +172,10 @@ Expr FixedPointMultiplyPerChannel(Expr tensor, std::vector multiplier, */ static inline bool IsScalarType(const Type& expr_type, const DataType& dtype) { const auto* tensor_type = expr_type.as(); - CHECK(tensor_type) << "Only tensor type can be checked for scalar values. But got" - << AsText(expr_type, false); - CHECK_EQ(tensor_type->shape.size(), 0); - CHECK(tensor_type->dtype == dtype) << "Expected " << dtype << " but got " << tensor_type->dtype; + ICHECK(tensor_type) << "Only tensor type can be checked for scalar values. But got" + << AsText(expr_type, false); + ICHECK_EQ(tensor_type->shape.size(), 0); + ICHECK(tensor_type->dtype == dtype) << "Expected " << dtype << " but got " << tensor_type->dtype; return true; } @@ -190,9 +190,10 @@ static inline void AssignType(const Type& expr_type, const DataType& dtype, cons const TypeReporter& reporter) { // Scale/Zero_points can be either const scalar or a vector with C axis num elems. const auto* tensor_type = expr_type.as(); - CHECK(tensor_type) << "Can assign type to Tensor type only. But got " << AsText(expr_type, false); + ICHECK(tensor_type) << "Can assign type to Tensor type only. 
But got " + << AsText(expr_type, false); const auto tensor_dtype = tensor_type->dtype; - CHECK(tensor_dtype == dtype) << "Expected type is " << dtype << " but received " << tensor_dtype; + ICHECK(tensor_dtype == dtype) << "Expected type is " << dtype << " but received " << tensor_dtype; if (tensor_type->shape.size() != 0) { reporter->Assign(expr_type, TensorType({shape}, tensor_type->dtype)); } @@ -201,7 +202,7 @@ static inline void AssignType(const Type& expr_type, const DataType& dtype, cons static inline std::vector GetFloatVectorFromConstant(const Expr& expr) { const auto* n = expr.as(); std::vector vals; - CHECK(n) << "Expr must be a constant expr - " << AsText(expr, false); + ICHECK(n) << "Expr must be a constant expr - " << AsText(expr, false); int64_t num_elems = 1; auto shape = n->data.Shape(); for (size_t i = 0; i < shape.size(); i++) { diff --git a/src/relay/quantize/annotate.cc b/src/relay/quantize/annotate.cc index 8ae7df9e2941..3def616e9423 100644 --- a/src/relay/quantize/annotate.cc +++ b/src/relay/quantize/annotate.cc @@ -83,7 +83,7 @@ Pass QuantizeAnnotate() { std::function fmulti_ref = [](const Expr& e) { if (e->IsInstance()) { const auto* n = e.as(); - CHECK(n); + ICHECK(n); const PackedFunc* f = runtime::Registry::Get("relay.quantize.attach_simulated_quantize"); Expr ret = (*f)(n->expr, static_cast(kQInput)); return static_cast(QAnnotateExpr(ret, kQInput)); diff --git a/src/relay/quantize/calibrate.cc b/src/relay/quantize/calibrate.cc index ea42a198bf84..0ac445295496 100644 --- a/src/relay/quantize/calibrate.cc +++ b/src/relay/quantize/calibrate.cc @@ -71,7 +71,7 @@ static float ComputeEntropy(float* p, float* q, size_t size) { float q_sum = std::accumulate(q, q + size, 0.f); float ret = 0; for (size_t i = 0; i < size; i++) { - CHECK(p[i] > 0 && q[i] > 0); + ICHECK(p[i] > 0 && q[i] > 0); p[i] /= p_sum; q[i] /= q_sum; if (p[i] && q[i]) ret += p[i] * std::log(p[i] / q[i]); @@ -150,7 +150,7 @@ class StatsCollector : private ExprMutator { Expr Collect(const Expr& expr) { auto new_e = this->Mutate(expr); const FunctionNode* func = new_e.as(); - CHECK(func) << "Input shoule be Function"; + ICHECK(func) << "Input shoule be Function"; Expr new_body = Tuple(std::move(profile_data_)); return Function(FreeVars(new_body), new_body, NullValue(), func->type_params, func->attrs); @@ -163,7 +163,7 @@ class StatsCollector : private ExprMutator { Expr VisitExpr_(const CallNode* call) { Expr new_e = ExprMutator::VisitExpr_(call); const CallNode* new_call = new_e.as(); - CHECK(new_call); + ICHECK(new_call); if (new_call->op == simulated_quantize_op_) { auto attrs = new_call->attrs.as(); // rewrite the annotation @@ -178,7 +178,7 @@ class StatsCollector : private ExprMutator { // add non-const expressions to profile data if (attrs->kind != QAnnotateKind::kQWeight) { - CHECK(!quantize_input.as()); + ICHECK(!quantize_input.as()); profile_data_.push_back(identity_quantize); } return identity_quantize; diff --git a/src/relay/quantize/quantize.cc b/src/relay/quantize/quantize.cc index 64a02fff1dca..846367c9c8a9 100644 --- a/src/relay/quantize/quantize.cc +++ b/src/relay/quantize/quantize.cc @@ -39,9 +39,9 @@ TVM_REGISTER_NODE_TYPE(SimulatedQuantizeAttrs); bool SimulatedQuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 5); + ICHECK_EQ(types.size(), 5); const auto param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); const auto* data = types[0].as(); @@ -49,7 +49,7 @@ bool SimulatedQuantizeRel(const 
Array& types, int num_inputs, const Attrs& return false; } - CHECK_NE(data->shape.size(), 0) << "Input shape cannot be empty"; + ICHECK_NE(data->shape.size(), 0) << "Input shape cannot be empty"; reporter->Assign(types[1], TensorType({}, DataType::Float(32))); // dom_scale reporter->Assign(types[2], TensorType({}, DataType::Float(32))); // clip_min diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 4b598907e76e..c96a1b063e98 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -107,7 +107,7 @@ inline Expr MulAndDiv(Expr data, float s1, float s2, DataType dtype, float factor = s1 / s2; float shift_factor = std::log2(factor); - CHECK_GT(shift_factor, 0); + ICHECK_GT(shift_factor, 0); if (static_cast(shift_factor) == shift_factor) { return LeftShift(data, MakeConstantScalar(dtype, static_cast(shift_factor))); } else if (static_cast(factor) == factor) { @@ -129,7 +129,7 @@ Expr QuantizeRealize(const Call& ref_call, const Array& new_args, const Ob const QConfig& cfg = QConfig::Current(); // do not handle data type cast const auto param = ref_call->attrs.as(); - CHECK_EQ(param->rounding, "round"); + ICHECK_EQ(param->rounding, "round"); Expr dom_scale = new_args[1]; Expr clip_min = new_args[2]; @@ -153,7 +153,7 @@ Expr QuantizeRealize(const Call& ref_call, const Array& new_args, const Ob } float shift_nbit = std::log2(odom_scale_imm / idom_scale_imm); - CHECK_NE(shift_nbit, 0); + ICHECK_NE(shift_nbit, 0); if (static_cast(shift_nbit) == shift_nbit) { if (shift_nbit > 0) { // use right shift @@ -186,7 +186,7 @@ Expr QuantizeRealize(const Call& ref_call, const Array& new_args, const Ob } // quantize from real - CHECK(!new_args[0]->IsInstance()); + ICHECK(!new_args[0]->IsInstance()); Expr data = new_args[0]; Expr scaled_data = Multiply(data, MakeConstantScalar(DataType::Float(32), 1 / dom_scale_imm)); Expr round_data = Clip(Round(scaled_data), clip_min_imm, clip_max_imm); @@ -205,14 +205,14 @@ RELAY_REGISTER_OP("relay.op.annotation.simulated_quantize") Expr Conv2dRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const QConfig& cfg = QConfig::Current(); - CHECK_EQ(new_args.size(), 2); + ICHECK_EQ(new_args.size(), 2); if (!new_args[0]->IsInstance() && !new_args[1]->IsInstance()) { return Expr(nullptr); } const auto* lhs = new_args[0].as(); - CHECK(lhs); + ICHECK(lhs); const auto* rhs = new_args[1].as(); - CHECK(rhs); + ICHECK(rhs); Expr ldata = lhs->data; if (lhs->dtype != cfg->dtype_input) { @@ -236,7 +236,7 @@ RELAY_REGISTER_OP("nn.conv2d").set_attr("FQRealizeRewrite", Con Expr DenseRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const QConfig& cfg = QConfig::Current(); - CHECK_EQ(new_args.size(), 2); + ICHECK_EQ(new_args.size(), 2); if (!new_args[0]->IsInstance() || !new_args[1]->IsInstance()) { return Expr(nullptr); } @@ -265,7 +265,7 @@ RELAY_REGISTER_OP("nn.dense").set_attr("FQRealizeRewrite", Dens Expr MulRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const QConfig& cfg = QConfig::Current(); - CHECK_EQ(new_args.size(), 2); + ICHECK_EQ(new_args.size(), 2); if (new_args[0].as() && new_args[1].as()) { // execute the operation with activation data type. 
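The MulAndDiv and QuantizeRealize hunks above both lean on the same trick: when the ratio of two domain scales is an exact power of two, the floating-point rescale can be realized as an integer shift. A self-contained sketch of that test (the function name here is illustrative):

#include <cmath>

// True (with *shift set) when s1/s2 is an exact positive power of two; the
// caller can then replace a float rescale with data << *shift. This is the
// static_cast<int>(shift_factor) == shift_factor test seen in MulAndDiv.
bool RescaleIsPowerOfTwoShift(float s1, float s2, int* shift) {
  float factor = s1 / s2;
  float shift_factor = std::log2(factor);
  if (shift_factor <= 0) return false;
  if (static_cast<float>(static_cast<int>(shift_factor)) != shift_factor) return false;
  *shift = static_cast<int>(shift_factor);
  return true;
}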
const auto* lhs = new_args[0].as(); @@ -286,7 +286,7 @@ Expr MulRealize(const Call& ref_call, const Array& new_args, const ObjectR Expr dom_scale = FoldConstantOpt(mul); return QRealizeIntExpr(ret, dom_scale, dtype); } - CHECK(!new_args[0]->IsInstance() || !new_args[1]->IsInstance()); + ICHECK(!new_args[0]->IsInstance() || !new_args[1]->IsInstance()); return Expr(nullptr); } @@ -317,13 +317,13 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args Array ret; for (auto arg : args) { const auto* nptr = arg.as(); - CHECK(nptr); + ICHECK(nptr); nptrs.push_back(nptr); ret.push_back(nptr->data); } // unify the data type - CHECK_EQ(ref_args.size(), args.size()); + ICHECK_EQ(ref_args.size(), args.size()); DataType dtype; if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { @@ -357,7 +357,7 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args } Expr AddRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { - CHECK_EQ(new_args.size(), 2); + ICHECK_EQ(new_args.size(), 2); if (new_args[0].as() && new_args[1].as()) { DataType dtype; Expr dom_scale; @@ -366,14 +366,14 @@ Expr AddRealize(const Call& ref_call, const Array& new_args, const ObjectR return QRealizeIntExpr(ret, dom_scale, dtype); } - CHECK(!new_args[0]->IsInstance() && !new_args[1]->IsInstance()); + ICHECK(!new_args[0]->IsInstance() && !new_args[1]->IsInstance()); return Expr(nullptr); } RELAY_REGISTER_OP("add").set_attr("FQRealizeRewrite", AddRealize); Expr ClipRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { - CHECK_EQ(new_args.size(), 1); + ICHECK_EQ(new_args.size(), 1); if (const auto* n = new_args[0].as()) { const auto ref_attrs = ref_call->attrs.as(); auto attrs = make_object(); @@ -384,20 +384,20 @@ Expr ClipRealize(const Call& ref_call, const Array& new_args, const Object Expr ret = Call(ref_call->op, {n->data}, Attrs(attrs), ref_call->type_args); return QRealizeIntExpr(ret, n->dom_scale, n->dtype); } - CHECK(!new_args[0]->IsInstance()); + ICHECK(!new_args[0]->IsInstance()); return Expr(nullptr); } RELAY_REGISTER_OP("clip").set_attr("FQRealizeRewrite", ClipRealize); Expr ConcatenateRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { - CHECK_EQ(new_args.size(), 1); - CHECK_EQ(ref_call->args.size(), 1); + ICHECK_EQ(new_args.size(), 1); + ICHECK_EQ(ref_call->args.size(), 1); const auto* tuple = new_args[0].as(); const auto* ref_tuple = ref_call->args[0].as(); - CHECK(tuple); - CHECK(ref_tuple); + ICHECK(tuple); + ICHECK(ref_tuple); const Array& arr = tuple->fields; const Array& ref_arr = ref_tuple->fields; @@ -409,7 +409,7 @@ Expr ConcatenateRealize(const Call& ref_call, const Array& new_args, const return QRealizeIntExpr(ret, dom_scale, dtype); } else { for (auto arg : new_args) { - CHECK(!arg->IsInstance()); + ICHECK(!arg->IsInstance()); } return Expr(nullptr); } @@ -419,12 +419,12 @@ RELAY_REGISTER_OP("concatenate").set_attr("FQRealizeRewrite", C /* \brief forward the original operator */ Expr IdentityRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { - CHECK_EQ(new_args.size(), 1); + ICHECK_EQ(new_args.size(), 1); if (const auto* n = new_args[0].as()) { Expr ret = ForwardOp(ref_call, {n->data}); return QRealizeIntExpr(ret, n->dom_scale, n->dtype); } - CHECK(!new_args[0]->IsInstance()); + ICHECK(!new_args[0]->IsInstance()); return Expr(nullptr); } @@ -442,13 +442,13 @@ RELAY_REGISTER_OP("annotation.stop_fusion") Expr CastDtypeInputRealize(const Call& ref_call, const Array& new_args, const ObjectRef& 
ctx) { const QConfig& cfg = QConfig::Current(); - CHECK_EQ(new_args.size(), 1); + ICHECK_EQ(new_args.size(), 1); if (const auto* n = new_args[0].as()) { Expr data = Cast(n->data, cfg->dtype_input); Expr ret = ForwardOp(ref_call, {data}); return QRealizeIntExpr(ret, n->dom_scale, cfg->dtype_input); } - CHECK(!new_args[0]->IsInstance()); + ICHECK(!new_args[0]->IsInstance()); return Expr(nullptr); } @@ -457,7 +457,7 @@ RELAY_REGISTER_OP("nn.max_pool2d") Expr AvgPoolRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const QConfig& cfg = QConfig::Current(); - CHECK_EQ(new_args.size(), 1); + ICHECK_EQ(new_args.size(), 1); if (const auto* n = new_args[0].as()) { Expr data = n->data; if (n->dtype != cfg->dtype_activation) { @@ -466,7 +466,7 @@ Expr AvgPoolRealize(const Call& ref_call, const Array& new_args, const Obj Expr ret = ForwardOp(ref_call, {data}); return QRealizeIntExpr(ret, n->dom_scale, cfg->dtype_activation); } - CHECK(!new_args[0]->IsInstance()); + ICHECK(!new_args[0]->IsInstance()); return Expr(nullptr); } @@ -477,12 +477,12 @@ RELAY_REGISTER_OP("nn.global_avg_pool2d") Expr CastHintRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const auto param = ref_call->attrs.as(); - CHECK_EQ(new_args.size(), 1); + ICHECK_EQ(new_args.size(), 1); if (const auto* n = new_args[0].as()) { Expr ret = Cast(n->data, param->dtype); return QRealizeIntExpr(ret, n->dom_scale, param->dtype); } - CHECK(!new_args[0]->IsInstance()); + ICHECK(!new_args[0]->IsInstance()); return Expr(nullptr); } diff --git a/src/relay/transforms/alter_op_layout.cc b/src/relay/transforms/alter_op_layout.cc index 7c5ee019a437..924e61ad0d16 100644 --- a/src/relay/transforms/alter_op_layout.cc +++ b/src/relay/transforms/alter_op_layout.cc @@ -97,7 +97,7 @@ class AlterTransformMemorizer : public TransformMemorizer { } const CallNode* new_call = new_e.as(); - CHECK(new_call) << "Can only replace the original operator with another call node"; + ICHECK(new_call) << "Can only replace the original operator with another call node"; return GetRef(new_call); } diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc index b9d6cce762e5..7a083304515b 100644 --- a/src/relay/transforms/annotate_target.cc +++ b/src/relay/transforms/annotate_target.cc @@ -69,7 +69,7 @@ class AnnotateTargetRewriter : public ExprRewriter { if (call && call->op == CompilerBeginOp()) { // Argument is already compiler begin node meaning that this is not the first time // running this pass, so we simply remove it and will add a new one later. - CHECK_EQ(call->args.size(), 1U); + ICHECK_EQ(call->args.size(), 1U); const CallNode* end = call->args[0].as(); if (end->op == CompilerEndOp()) { arg_target = end->attrs.as()->compiler; @@ -137,13 +137,13 @@ class AnnotateTargetRewriter : public ExprRewriter { if (op_node && pre->op == CompilerBeginOp()) { // Bypass compiler begin due to lack of target information. It will be processed // when the following op handling arguments. - CHECK_EQ(pre->args.size(), 1U); + ICHECK_EQ(pre->args.size(), 1U); return post.as()->args[0]; } else if (op_node && pre->op == CompilerEndOp()) { // Override compiler end with the new target. 
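The annotate_target.cc hunks above manage the compiler_begin/compiler_end pair that brackets a region destined for an external codegen. Roughly, the pass wraps each supported expression as below; this is a hedged sketch using the relay C++ API with the attribute plumbing simplified, and the helper name is illustrative:

#include <tvm/relay/attrs/annotation.h>
#include <tvm/relay/expr.h>
#include <tvm/relay/op.h>

using namespace tvm;
using namespace tvm::relay;

// Sketch: wrap `expr` so downstream partitioning hands it to `target`.
// CompilerAttrs carries the target name, as in the hunks above.
Expr WrapWithAnnotations(const Expr& expr, const std::string& target) {
  auto attrs = tvm::runtime::make_object<CompilerAttrs>();
  attrs->compiler = target;
  Expr begin = Call(Op::Get("annotation.compiler_begin"), {expr}, Attrs(attrs), {});
  Expr end = Call(Op::Get("annotation.compiler_end"), {begin}, Attrs(attrs), {});
  return end;
}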
- CHECK_EQ(pre->args.size(), 1U); + ICHECK_EQ(pre->args.size(), 1U); auto input_expr = post.as()->args[0]; - CHECK(op_expr_to_target_.find(input_expr) != op_expr_to_target_.end()); + ICHECK(op_expr_to_target_.find(input_expr) != op_expr_to_target_.end()); return InsertAnnotation(input_expr, op_expr_to_target_[input_expr], make_end_op); } // Check prior to peeking first argument @@ -164,7 +164,7 @@ class AnnotateTargetRewriter : public ExprRewriter { // TVM operators: Check target specific op checking function and add to supported_targets // if it is supported. Op op = Downcast(pre->op); - CHECK(op.defined()); + ICHECK(op.defined()); for (const auto& target : this->targets_) { if (!Op::HasAttrMap("target." + std::string(target))) { continue; @@ -178,7 +178,7 @@ class AnnotateTargetRewriter : public ExprRewriter { // Composite function: Add the target of a composite function to supported_targets // if it is in the target list. Function func = Downcast(pre->op); - CHECK(func.defined()); + ICHECK(func.defined()); if (auto comp_name = func->GetAttr(attr::kComposite)) { std::string comp_name_str = comp_name.value(); diff --git a/src/relay/transforms/canonicalize_cast.cc b/src/relay/transforms/canonicalize_cast.cc index 510d098990e3..b0e96cc47514 100644 --- a/src/relay/transforms/canonicalize_cast.cc +++ b/src/relay/transforms/canonicalize_cast.cc @@ -106,13 +106,13 @@ class CastCanonicalizer : public ExprMutator { if (call->op == cast_op_) { auto attrs = call->attrs.as(); const auto* from_type = call->args[0]->type_as(); - CHECK(from_type); + ICHECK(from_type); if (from_type->dtype.bits() < attrs->dtype.bits()) { if (++ref_counter_[call] > 1) { const CallNode* new_call = new_expr.as(); - CHECK(new_call); - CHECK(new_call->op == cast_op_); + ICHECK(new_call); + ICHECK(new_call->op == cast_op_); return Call(new_call->op, new_call->args, new_call->attrs, new_call->type_args); } } diff --git a/src/relay/transforms/canonicalize_ops.cc b/src/relay/transforms/canonicalize_ops.cc index dfb30cae4693..cf14ddcb7c5b 100644 --- a/src/relay/transforms/canonicalize_ops.cc +++ b/src/relay/transforms/canonicalize_ops.cc @@ -41,7 +41,7 @@ class BiasAddSimplifier : public ExprRewriter { auto new_n = post; if (n->op == bias_add_op_) { Call call = Downcast(new_n); - CHECK_EQ(call->args.size(), 2); + ICHECK_EQ(call->args.size(), 2); const BiasAddAttrs* param = call->attrs.as(); auto ttype = n->args[0]->type_as(); diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index 20fa3e404f6a..20b206e0423c 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -62,8 +62,8 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { const Layout kOIHW("OIHW"); const auto* attrs_a = a->attrs.as(); const auto* attrs_b = b->attrs.as(); - CHECK(attrs_a); - CHECK(attrs_b); + ICHECK(attrs_a); + ICHECK(attrs_b); const auto* tweight_a = a->args[1]->type_as(); const auto* tweight_b = b->args[1]->type_as(); const auto shape_a = @@ -89,7 +89,7 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { const CallNode* group_root = branches[0][0]; const auto* attrs = group_root->attrs.as(); - CHECK(attrs); + ICHECK(attrs); const auto new_attrs = make_object(); new_attrs->strides = attrs->strides; new_attrs->padding = attrs->padding; @@ -105,7 +105,7 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { const std::string& layout = new_attrs->out_layout == "" ? 
new_attrs->data_layout : new_attrs->out_layout; channel_pos_ = layout.find('C'); - CHECK_NE(channel_pos_, std::string::npos); + ICHECK_NE(channel_pos_, std::string::npos); return Call(conv2d, {data, new_weight}, Attrs{new_attrs}, {}); } @@ -198,7 +198,7 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { } auto index = branches[0][0]->attrs.as()->kernel_layout.operator std::string().find('O'); - CHECK_NE(index, std::string::npos); + ICHECK_NE(index, std::string::npos); return std::make_tuple(MakeConcatenate(Tuple(weights), index), tir::make_const(DataType::Int(32), num_filters)); } diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index 74a6921c9409..6d4c8c000f31 100644 --- a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -61,8 +61,8 @@ class ParallelDenseToBatchCombiner : public ParallelOpBatchCombiner { StructuralEqual eq; const auto* attrs_a = a->attrs.as(); const auto* attrs_b = b->attrs.as(); - CHECK(attrs_a); - CHECK(attrs_b); + ICHECK(attrs_a); + ICHECK(attrs_b); const auto* weight_a = a->args[1]->type_as(); const auto* weight_b = b->args[1]->type_as(); @@ -89,7 +89,7 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { const auto* attrs_b = b->attrs.as(); const auto* weight_a = a->args[1]->type_as(); const auto* weight_b = b->args[1]->type_as(); - CHECK(attrs_a != nullptr && attrs_b != nullptr && weight_a != nullptr && weight_b != nullptr); + ICHECK(attrs_a != nullptr && attrs_b != nullptr && weight_a != nullptr && weight_b != nullptr); // output dims (weight->shape[0]) can be different return eq(attrs_a->out_dtype, attrs_b->out_dtype) && eq(weight_a->shape[1], weight_b->shape[1]); } @@ -102,7 +102,7 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { // concat all weights into one std::tie(new_weight, new_output_dims) = TransformWeight(branches); const auto* origin_attrs = branches[0][0]->attrs.as(); - CHECK(origin_attrs); + ICHECK(origin_attrs); const auto dense_attrs = make_object(); dense_attrs->units = new_output_dims; dense_attrs->out_dtype = origin_attrs->out_dtype; @@ -115,7 +115,7 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { auto tb = b->args[index]->type_as(); auto toutput_a = a->type_as(); auto toutput_b = b->type_as(); - CHECK(ta != nullptr && tb != nullptr && toutput_a != nullptr && toutput_b != nullptr); + ICHECK(ta != nullptr && tb != nullptr && toutput_a != nullptr && toutput_b != nullptr); if (!eq(ta->dtype, tb->dtype) || ta->shape.size() != tb->shape.size()) { return false; @@ -148,7 +148,7 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { auto parent = branch[depth]->args[parent_index]; auto& parent_shape = parent->type_as()->shape; auto out_dim = tir::as_const_int(parent_shape[parent_shape.size() - 1]); - CHECK(out_dim != nullptr); + ICHECK(out_dim != nullptr); auto arg = branch[depth]->args[i]; auto& arg_shape = arg->type_as()->shape; @@ -158,7 +158,7 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { arg = MakeExpandDims(arg, -1, 1); } else { auto arg_last_dim = tir::as_const_int(arg_shape[arg_shape.size() - 1]); - CHECK(arg_last_dim != nullptr); + ICHECK(arg_last_dim != nullptr); if (*out_dim > 1 && *arg_last_dim == 1) { repeat_last_dim = true; } @@ -182,7 +182,7 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { const CallNode* call = branch[depth]; auto& out_shape = call->type_as()->shape; auto out_dims = 
tir::as_const_int(out_shape[out_shape.size() - 1]); - CHECK(out_dims != nullptr); + ICHECK(out_dims != nullptr); Array begin; Array end; Array strides; diff --git a/src/relay/transforms/combine_parallel_op.cc b/src/relay/transforms/combine_parallel_op.cc index b23d01ff469b..1c9a58f49824 100644 --- a/src/relay/transforms/combine_parallel_op.cc +++ b/src/relay/transforms/combine_parallel_op.cc @@ -64,7 +64,7 @@ std::vector BranchGroupFinder::Find(const Expr& expr) { auto&& branch = CreateBranch(child); // add the branch to a group, or create a new group auto it = std::find_if(groups.begin() + ngroups, groups.end(), [&](const Group& group) { - CHECK(!group.empty() && !group[0].empty()); + ICHECK(!group.empty() && !group[0].empty()); return fare_compatible_ops_(child, group[0][0]); }); if (it != groups.end()) { @@ -141,7 +141,7 @@ void ParallelOpCombiner::CombineBranches(const Group& branches) { for (parent_index = 0; parent_index < branches[0][i]->args.size(); parent_index++) { if (branches[0][i]->args[parent_index].get() == branches[0][i - 1]) break; } - CHECK_NE(parent_index, branches[0][i]->args.size()); + ICHECK_NE(parent_index, branches[0][i]->args.size()); if (!CheckLevel(branches, i, parent_index)) break; combined = MakeCombinedCallFromFollowingOps(combined, branches, i, parent_index); } diff --git a/src/relay/transforms/convert_layout.cc b/src/relay/transforms/convert_layout.cc index 577fb068aab9..ba443f602c19 100644 --- a/src/relay/transforms/convert_layout.cc +++ b/src/relay/transforms/convert_layout.cc @@ -112,7 +112,7 @@ class ConvertTransformMemorizer : public TransformMemorizer { } const CallNode* new_call = new_e.as(); - CHECK(new_call) << "Can only replace the original operator with another call node"; + ICHECK(new_call) << "Can only replace the original operator with another call node"; return GetRef(new_call); } diff --git a/src/relay/transforms/convert_sparse_dense.cc b/src/relay/transforms/convert_sparse_dense.cc index 36aaa478eab6..5f4dbe642c3d 100644 --- a/src/relay/transforms/convert_sparse_dense.cc +++ b/src/relay/transforms/convert_sparse_dense.cc @@ -75,9 +75,9 @@ class DenseToSparseDenseMutator : public ExprRewriter { DenseToSparseDenseMutator(const Array& weight_name, const Array >& weight_shape) : dense_op_(Op::Get("nn.dense")), sparse_dense_op_(Op::Get("nn.sparse_dense")) { - CHECK_EQ(weight_name.size(), weight_shape.size()); + ICHECK_EQ(weight_name.size(), weight_shape.size()); for (size_t i = 0; i < weight_name.size(); ++i) { - CHECK(weight_name[i]->IsInstance()); + ICHECK(weight_name[i]->IsInstance()); std::string k = weight_name[i].as()->data; const auto& ws = weight_shape[i]; std::vector v(ws.size()); diff --git a/src/relay/transforms/de_duplicate.cc b/src/relay/transforms/de_duplicate.cc index 8c62fe6100c3..43b71f6f10cc 100644 --- a/src/relay/transforms/de_duplicate.cc +++ b/src/relay/transforms/de_duplicate.cc @@ -40,8 +40,8 @@ Expr DeDup(const Expr& e) { } Var Fresh(const Var& v) { - CHECK_EQ(rename_.count(v), 0); - CHECK_EQ(memo_.count(v), 0) << v.as(); + ICHECK_EQ(rename_.count(v), 0); + ICHECK_EQ(memo_.count(v), 0) << v.as(); Var ret = Var(v->name_hint(), VisitType(v->type_annotation)); rename_[v] = ret; return ret; @@ -94,10 +94,10 @@ Expr DeDup(const Expr& e) { std::unordered_map rename_; std::unordered_map type_rename_; }; - CHECK(WellFormed(e)) << AsText(e, false); + ICHECK(WellFormed(e)) << AsText(e, false); Expr ret = DeDupMutator().VisitExpr(e); - CHECK(WellFormed(ret)); - CHECK_EQ(FreeVars(e).size(), FreeVars(ret).size()); + 
ICHECK(WellFormed(ret)); + ICHECK_EQ(FreeVars(e).size(), FreeVars(ret).size()); return ret; } diff --git a/src/relay/transforms/dead_code.cc b/src/relay/transforms/dead_code.cc index f6c2272a3018..2e7c08a684dc 100644 --- a/src/relay/transforms/dead_code.cc +++ b/src/relay/transforms/dead_code.cc @@ -46,7 +46,7 @@ class FindDef : private ExprVisitor { VarMap expr_map_; void VisitExpr_(const LetNode* l) final { - CHECK_EQ(expr_map_.count(l->var), 0); + ICHECK_EQ(expr_map_.count(l->var), 0); expr_map_[l->var] = l->value; VisitExpr(l->value); VisitExpr(l->body); diff --git a/src/relay/transforms/defunctionalization.cc b/src/relay/transforms/defunctionalization.cc index 135d7fcee548..14a86bc8d080 100644 --- a/src/relay/transforms/defunctionalization.cc +++ b/src/relay/transforms/defunctionalization.cc @@ -103,12 +103,12 @@ class DefuncMutator : public ExprMutator { Expr VisitExpr_(const CallNode* call) { if (auto op = call->op.as()) { - CHECK_EQ(call->type_args.size(), op->checked_type().as()->type_params.size()) + ICHECK_EQ(call->type_args.size(), op->checked_type().as()->type_params.size()) << "all type args must be explicit"; auto op_type = InstFuncType(op->checked_type().as(), call->type_args); - CHECK_EQ(FreeTypeVars(op_type, mod).size(), 0) << "free type vars in instantiated"; - CHECK(!HasFuncType(op_type->ret_type)) << "returning functions not supported"; + ICHECK_EQ(FreeTypeVars(op_type, mod).size(), 0) << "free type vars in instantiated"; + ICHECK(!HasFuncType(op_type->ret_type)) << "returning functions not supported"; if (!IsHigherOrderFunc(op_type)) { // not higher order function @@ -152,7 +152,7 @@ class DefuncMutator : public ExprMutator { // var node will be encoded as datatype // so we need to use the `apply` helper method auto var_original_type = GetUnencodedType(op->type_annotation).as(); - CHECK(var_original_type) << "var original type not saved in var_save_type map"; + ICHECK(var_original_type) << "var original type not saved in var_save_type map"; auto op_type = InstFuncType(var_original_type, call->type_args); Array args = {GetRef(op)}; @@ -209,7 +209,7 @@ class DefuncMutator : public ExprMutator { */ void AddApplyCase(GlobalVar apply_gv, FuncType ft, Constructor c, const Expr& expr, const Array patterns) { - CHECK(c->inputs.size() == patterns.size()) + ICHECK(c->inputs.size() == patterns.size()) << "constructor function and pattern vars have different sizes"; if (!mod->ContainGlobalVar(apply_gv->name_hint)) { auto x = Var("x", TypeCall(c->belong_to, {})); @@ -229,7 +229,7 @@ class DefuncMutator : public ExprMutator { } else { auto f = Downcast(mod->Lookup(apply_gv)); auto body = f->body.as(); - CHECK(body) << "internal invariant broken; apply function body should be a match node"; + ICHECK(body) << "internal invariant broken; apply function body should be a match node"; auto clauses = body->clauses; auto x = f->params[0]; @@ -245,8 +245,8 @@ class DefuncMutator : public ExprMutator { Expr EncodeArg(const Expr& arg, const Type& type) { // we assume arg is either an identifier (var or globalvar) or a function - CHECK(type.as()) << "assume no nested functions"; - CHECK(arg.as() || arg.as() || arg.as()) + ICHECK(type.as()) << "assume no nested functions"; + ICHECK(arg.as() || arg.as() || arg.as()) << "assume all first-order-parameters are identifiers or functions"; if (arg.as()) { @@ -334,11 +334,11 @@ class DefuncMutator : public ExprMutator { */ FuncType GetUnencodedType(const Type& t) { auto tc = t.as(); - CHECK(tc) << "expected type call when getting original type from 
encoded type"; + ICHECK(tc) << "expected type call when getting original type from encoded type"; auto gv = tc->func.as(); - CHECK(gv) << "expected global type var in encoded type"; + ICHECK(gv) << "expected global type var in encoded type"; auto type = original_func_type_map[GetRef(gv)]; - CHECK(type.defined()) << "reverse mapping from encoded type to original type not found"; + ICHECK(type.defined()) << "reverse mapping from encoded type to original type not found"; return Downcast(type); } @@ -357,8 +357,8 @@ class DefuncMutator : public ExprMutator { * \brief specialize a function type */ FuncType InstFuncType(const FuncTypeNode* fty, const Array type_args) { - CHECK(fty) << "InstFuncType functype is null"; - CHECK_EQ(fty->type_params.size(), type_args.size()) + ICHECK(fty) << "InstFuncType functype is null"; + ICHECK_EQ(fty->type_params.size(), type_args.size()) << "size mismatch between function type params and type args"; auto map = tvm::Map(); for (size_t i = 0; i < type_args.size(); i++) { @@ -372,7 +372,7 @@ class DefuncMutator : public ExprMutator { * \brief specialize a function expression */ Function Specialize(const Function& f, const Array type_args) { - CHECK_EQ(f->type_params.size(), type_args.size()) + ICHECK_EQ(f->type_params.size(), type_args.size()) << "cannot specialize function with size mismatch between function type params and type " "args"; auto map = tvm::Map(); @@ -389,7 +389,7 @@ class DefuncMutator : public ExprMutator { * using the `apply` function for applications */ Function FirstifyVars(const Function& f) { - CHECK(f->type_params.size() == 0) << "firstify function has type params"; + ICHECK(f->type_params.size() == 0) << "firstify function has type params"; tvm::Map var_bind_map; Array params; @@ -403,7 +403,7 @@ class DefuncMutator : public ExprMutator { var_bind_map.Set(var, new_var); params.push_back(new_var); } else { - CHECK(!HasFuncType(var->type_annotation)) + ICHECK(!HasFuncType(var->type_annotation)) << "nested function type in parameter not supported yet"; params.push_back(var); } @@ -416,11 +416,11 @@ class DefuncMutator : public ExprMutator { Expr Defunctionalization(const Function& f, const IRModule& mod) { // f is the starting point of the program, all types MUST be known - CHECK(f->type_params.size() == 0) << "no polymorphism supported for defunctionalization"; + ICHECK(f->type_params.size() == 0) << "no polymorphism supported for defunctionalization"; for (const auto& p : f->params) { - CHECK(!HasFuncType(p->checked_type())) << "program cannot have func type parameters"; + ICHECK(!HasFuncType(p->checked_type())) << "program cannot have func type parameters"; } - CHECK(!HasFuncType(f->ret_type)) << "return type cannot contain function"; + ICHECK(!HasFuncType(f->ret_type)) << "return type cannot contain function"; return Downcast(DefuncMutator(mod).VisitExpr(f)); } diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index b3f22e00fda4..e744fb51e0a6 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -72,16 +72,16 @@ class ValidateAnnotation : private ExprVisitor { if (IsOnDeviceNode(call_node)) { int device_type = GetDeviceId(call_node); if (annotation_map_.count(call_node)) { - CHECK_EQ(annotation_map_.at(call_node), device_type) + ICHECK_EQ(annotation_map_.at(call_node), device_type) << "An expression node can only be annotated to one device."; } else { annotation_map_.insert({call_node, GetDeviceId(call_node)}); } - 
CHECK_EQ(call_node->args.size(), 1U); + ICHECK_EQ(call_node->args.size(), 1U); const auto* node = call_node->args[0].operator->(); if (annotation_map_.count(node)) { - CHECK_EQ(annotation_map_.at(node), device_type) + ICHECK_EQ(annotation_map_.at(node), device_type) << "An expression node can only be annotated to one device."; } else { annotation_map_.insert({node, GetDeviceId(call_node)}); @@ -103,7 +103,7 @@ class ValidateAnnotation : private ExprVisitor { * \return The device type. */ int GetDeviceId(const CallNode* call_node) { - CHECK(IsOnDeviceNode(call_node)) << "The input call node must be on_device node."; + ICHECK(IsOnDeviceNode(call_node)) << "The input call node must be on_device node."; const OnDeviceAttrs* on_device_attr = call_node->attrs.as(); return on_device_attr->device_type; } @@ -226,7 +226,7 @@ class RewriteAnnotation : public ExprMutator { const auto sit = annotation_map_.find(src_node); if (sit == annotation_map_.end()) { const auto dit = annotation_map_.find(dst); - CHECK(dit != annotation_map_.end()) + ICHECK(dit != annotation_map_.end()) << "Device copy op is not required when both src and dst ops are not " "annotated."; return CreateDeviceCopy(src, fallback_device_, dit->second); @@ -391,7 +391,7 @@ class DeviceInfo { // Skip annotation nodes. if (!IsOnDeviceNode(call)) { if (const auto* node = GetDeviceCopyNode(call)) { - CHECK(node->IsInstance()); + ICHECK(node->IsInstance()); const auto* call_node = static_cast(node); auto attrs = call_node->attrs.as(); @@ -496,7 +496,7 @@ Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { new_body.push_back(field); } } - CHECK_GT(new_body.size(), 0U); + ICHECK_GT(new_body.size(), 0U); if (new_body.size() == 1) { return Function(params, new_body[0], Type(nullptr), fn->type_params, fn->attrs); } else if (tuple->fields.size() == new_body.size()) { @@ -515,7 +515,7 @@ Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { new_fields.push_back(field); } } - CHECK_GT(new_fields.size(), 0U); + ICHECK_GT(new_fields.size(), 0U); if (tuple->fields.size() == new_fields.size()) { return new_fields.size() == 1 ? 
new_fields[0] : new_expr; } else { diff --git a/src/relay/transforms/dynamic_to_static.cc b/src/relay/transforms/dynamic_to_static.cc index 5caaea8c9ead..d16d6328301a 100644 --- a/src/relay/transforms/dynamic_to_static.cc +++ b/src/relay/transforms/dynamic_to_static.cc @@ -39,7 +39,7 @@ class DynamicToStaticMutator : public MixedModeMutator { {Op::Get("dyn.reshape"), [](const CallNode* call_node) { if (const ConstantNode* shape = call_node->args[1].as()) { - CHECK_EQ(shape->data->ndim, 1); + ICHECK_EQ(shape->data->ndim, 1); return MakeReshape(call_node->args[0], ToVector(shape->data)); } return Expr(nullptr); @@ -47,7 +47,7 @@ class DynamicToStaticMutator : public MixedModeMutator { {Op::Get("dyn.tile"), [](const CallNode* call_node) { if (const ConstantNode* reps = call_node->args[1].as()) { - CHECK_EQ(reps->data->ndim, 1); + ICHECK_EQ(reps->data->ndim, 1); return MakeTile(call_node->args[0], ToVector(reps->data)); } return Expr(nullptr); @@ -56,7 +56,7 @@ class DynamicToStaticMutator : public MixedModeMutator { [](const CallNode* call_node) { if (const ConstantNode* k = call_node->args[1].as()) { const TopKAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); return MakeTopK(call_node->args[0], static_cast(ToScalar(k->data, 0)), param->axis, param->ret_type, param->is_ascend, param->dtype); } @@ -65,7 +65,7 @@ class DynamicToStaticMutator : public MixedModeMutator { {Op::Get("dyn.broadcast_to"), [](const CallNode* call_node) { if (const ConstantNode* shape = call_node->args[1].as()) { - CHECK_EQ(shape->data->ndim, 1); + ICHECK_EQ(shape->data->ndim, 1); return MakeBroadCastTo(call_node->args[0], ToVector(shape->data)); } return Expr(nullptr); @@ -74,7 +74,7 @@ class DynamicToStaticMutator : public MixedModeMutator { [](const CallNode* call_node) { if (const ConstantNode* shape = call_node->args[0].as()) { const InitOpAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); return MakeZeros(ToVector(shape->data), param->dtype); } return Expr(nullptr); @@ -83,7 +83,7 @@ class DynamicToStaticMutator : public MixedModeMutator { [](const CallNode* call_node) { if (const ConstantNode* shape = call_node->args[0].as()) { const InitOpAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); return MakeOnes(ToVector(shape->data), param->dtype); } return Expr(nullptr); @@ -92,7 +92,7 @@ class DynamicToStaticMutator : public MixedModeMutator { [](const CallNode* call_node) { if (const ConstantNode* depth = call_node->args[3].as()) { const OneHotAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); return MakeOneHot(call_node->args[0], call_node->args[1], call_node->args[2], static_cast(ToScalar(depth->data, 0)), param->axis, param->dtype); @@ -103,7 +103,7 @@ class DynamicToStaticMutator : public MixedModeMutator { [](const CallNode* call_node) { if (const ConstantNode* size = call_node->args[1].as()) { const ResizeAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); auto size_int = ToVector(size->data); Array size_prim; for (size_t i = 0; i < size_int.size(); ++i) { @@ -117,9 +117,9 @@ class DynamicToStaticMutator : public MixedModeMutator { {Op::Get("dyn.full"), [](const CallNode* call_node) { if (const ConstantNode* shape = call_node->args[1].as()) { - CHECK_EQ(shape->data->ndim, 1); + ICHECK_EQ(shape->data->ndim, 1); const InitOpAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); return MakeFull(call_node->args[0], ToVector(shape->data), param->dtype); } return Expr(nullptr); @@ -129,10 +129,10 @@ 
class DynamicToStaticMutator : public MixedModeMutator { const ConstantNode* scale_h = call_node->args[1].as(); const ConstantNode* scale_w = call_node->args[2].as(); if (scale_h && scale_w) { - CHECK_EQ(scale_h->data->ndim, 0); - CHECK_EQ(scale_w->data->ndim, 0); + ICHECK_EQ(scale_h->data->ndim, 0); + ICHECK_EQ(scale_w->data->ndim, 0); const UpSamplingAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); return MakeUpSampling(call_node->args[0], ToScalar(scale_h->data), ToScalar(scale_w->data), param->layout, param->method, param->align_corners); @@ -145,11 +145,11 @@ class DynamicToStaticMutator : public MixedModeMutator { const ConstantNode* scale_h = call_node->args[2].as(); const ConstantNode* scale_w = call_node->args[3].as(); if (scale_d && scale_h && scale_w) { - CHECK_EQ(scale_d->data->ndim, 0); - CHECK_EQ(scale_h->data->ndim, 0); - CHECK_EQ(scale_w->data->ndim, 0); + ICHECK_EQ(scale_d->data->ndim, 0); + ICHECK_EQ(scale_h->data->ndim, 0); + ICHECK_EQ(scale_w->data->ndim, 0); const UpSampling3DAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); return MakeUpSampling3D(call_node->args[0], ToScalar(scale_d->data), ToScalar(scale_h->data), ToScalar(scale_w->data), @@ -163,11 +163,11 @@ class DynamicToStaticMutator : public MixedModeMutator { const ConstantNode* pad_width = call_node->args[1].as(); const ConstantNode* pad_fill = call_node->args[2].as(); if (pad_width && pad_fill) { - CHECK_EQ(pad_fill->data->ndim, 0); // pad_val is 1d - CHECK_EQ(pad_width->data->ndim, 2); // pad_width is 2d + ICHECK_EQ(pad_fill->data->ndim, 0); // pad_val is 1d + ICHECK_EQ(pad_width->data->ndim, 2); // pad_width is 2d const PadAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); return MakePad(call_node->args[0], ToMatrix(pad_width->data), ToScalar(pad_fill->data), param->pad_mode); } @@ -179,11 +179,11 @@ class DynamicToStaticMutator : public MixedModeMutator { const ConstantNode* end = call_node->args[2].as(); const ConstantNode* stride = call_node->args[3].as(); if (begin && end && stride) { - CHECK_EQ(begin->data->ndim, 1); - CHECK_EQ(end->data->ndim, 1); - CHECK_EQ(stride->data->ndim, 1); + ICHECK_EQ(begin->data->ndim, 1); + ICHECK_EQ(end->data->ndim, 1); + ICHECK_EQ(stride->data->ndim, 1); const StridedSliceAttrs* param = call_node->attrs.as(); - CHECK(param); + ICHECK(param); return MakeStridedSlice(call_node->args[0], ToVector(begin->data), ToVector(end->data), ToVector(stride->data), param->slice_mode); } diff --git a/src/relay/transforms/eliminate_common_subexpr.cc b/src/relay/transforms/eliminate_common_subexpr.cc index 720a97e9d19d..e9603575111d 100644 --- a/src/relay/transforms/eliminate_common_subexpr.cc +++ b/src/relay/transforms/eliminate_common_subexpr.cc @@ -45,7 +45,7 @@ class CommonSubexprEliminator : public MixedModeMutator { static auto op_stateful = Op::GetAttrMap("TOpIsStateful"); Expr new_expr = post; const CallNode* new_call = new_expr.as(); - CHECK(new_call); + ICHECK(new_call); const OpNode* op = new_call->op.as(); StructuralEqual attrs_equal; @@ -83,7 +83,7 @@ class CommonSubexprEliminator : public MixedModeMutator { Expr Rewrite_(const TupleGetItemNode* op, const Expr& post) final { Expr new_expr = post; const TupleGetItemNode* new_tuple_item = new_expr.as(); - CHECK(new_tuple_item); + ICHECK(new_tuple_item); if (fskip_ != nullptr && fskip_(new_expr)) { return new_expr; diff --git a/src/relay/transforms/eta_expand.cc b/src/relay/transforms/eta_expand.cc index 42718eec9179..4023c9dafef4 100644 --- 
a/src/relay/transforms/eta_expand.cc +++ b/src/relay/transforms/eta_expand.cc @@ -62,7 +62,7 @@ class EtaExpander : public ExprMutator { type_var_replacer_(TypeVarReplacer()), expand_constructor_(expand_constructor), expand_global_var_(expand_global_var) { - CHECK(expand_constructor || expand_global_var) << "must expand at least one language feature"; + ICHECK(expand_constructor || expand_global_var) << "must expand at least one language feature"; } IRModule Expand() { diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index 4a739ddba40f..48af31f9a11f 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -110,7 +110,7 @@ class ConstantFolder : public MixedModeMutator { bool inside_primitive = false; Expr VisitExpr_(const FunctionNode* op) final { if (op->HasNonzeroAttr(attr::kPrimitive)) { - CHECK_EQ(inside_primitive, false); + ICHECK_EQ(inside_primitive, false); inside_primitive = true; auto ret = ExprMutator::VisitExpr_(op); inside_primitive = false; @@ -253,7 +253,7 @@ class ConstantFolder : public MixedModeMutator { Expr EvaluateShapeOf(Expr expr, Array args, Attrs attrs) { Expr input = args[0]; const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); tvm::Array ishape; if (auto opt = GetConstantShape(input)) { @@ -271,7 +271,7 @@ class ConstantFolder : public MixedModeMutator { if (ishape.size() == 0) { value = runtime::NDArray::Empty({}, cdtype, ctx); } else { - CHECK_NE(ishape.size(), 0); + ICHECK_NE(ishape.size(), 0); std::vector cshape = {static_cast(ishape.size())}; value = runtime::NDArray::Empty(cshape, cdtype, ctx); int32_t* dims = static_cast(value->data); @@ -300,7 +300,7 @@ class ConstantFolder : public MixedModeMutator { Expr EvaluateNdarraySize(Expr expr, Array args, Attrs attrs) { Expr input = args[0]; const auto* param = attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); tvm::Array ishape; if (auto opt = GetConstantShape(input)) { diff --git a/src/relay/transforms/fold_scale_axis.cc b/src/relay/transforms/fold_scale_axis.cc index e4c924dad1e8..23be70c1e442 100644 --- a/src/relay/transforms/fold_scale_axis.cc +++ b/src/relay/transforms/fold_scale_axis.cc @@ -182,7 +182,7 @@ class ScaledExprNode : public TempExprNode { Expr scale = NullValue(); Expr Realize() const final { - CHECK(!axes.defined()) << "outstanding scale"; + ICHECK(!axes.defined()) << "outstanding scale"; return value; } @@ -268,7 +268,7 @@ class ForwardPrep : private ExprVisitor { auto f = fprep.get(call->op, nullptr); if (f != nullptr) { Array in_messages = f(GetRef(call), out_message); - CHECK_EQ(in_messages.size(), call->args.size()); + ICHECK_EQ(in_messages.size(), call->args.size()); for (size_t i = 0; i < call->args.size(); ++i) { this->Update(call->args[i], in_messages[i]); } @@ -400,8 +400,8 @@ Expr AddSubForwardRewrite(const Call& ref_call, const Array& new_args, auto rnode = make_object(); if (slhs != nullptr) { - CHECK(srhs == nullptr); - CHECK(MatchBroadcastToLeftAxes(tlhs, trhs, slhs->axes)); + ICHECK(srhs == nullptr); + ICHECK(MatchBroadcastToLeftAxes(tlhs, trhs, slhs->axes)); Expr scale = ReshapeOrExpandToMatchAxis(slhs->scale, tlhs->shape, slhs->axes); if (!scale.defined()) { return Expr(); @@ -411,8 +411,8 @@ Expr AddSubForwardRewrite(const Call& ref_call, const Array& new_args, rnode->scale = slhs->scale; rnode->axes = slhs->axes; } else { - CHECK(srhs != nullptr); - CHECK(MatchBroadcastToLeftAxes(trhs, tlhs, srhs->axes)); + ICHECK(srhs != nullptr); + 
ICHECK(MatchBroadcastToLeftAxes(trhs, tlhs, srhs->axes)); Expr scale = ReshapeOrExpandToMatchAxis(srhs->scale, trhs->shape, srhs->axes); if (!scale.defined()) { return Expr(); @@ -441,12 +441,12 @@ Expr MultiplyForwardRewrite(const Call& ref_call, const Array& new_args, const Message& message) { if (!message.defined()) return Expr(); const auto& expected_out_axes = message->axes; - CHECK(expected_out_axes.defined() && expected_out_axes.size()); + ICHECK(expected_out_axes.defined() && expected_out_axes.size()); // TODO(tvm-team) allow same axes accumulation // not as important because it is less common in nn. const auto* slhs = new_args[0].as(); const auto* srhs = new_args[1].as(); - CHECK(!slhs && !srhs); + ICHECK(!slhs && !srhs); const auto* tlhs = ref_call->args[0]->type_as(); const auto* trhs = ref_call->args[1]->type_as(); @@ -480,13 +480,13 @@ Array Conv2DForwardPrep(const Call& call, const Message& out_message) { // TODO(tvm-team) support general data layout // by transforming weight const auto* param = call->attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout data_layout(param->data_layout); Layout kernel_layout(param->kernel_layout); int c_big_axis = data_layout.IndexOf(LayoutAxis::Get('C')); int c_small_axis = data_layout.IndexOf(LayoutAxis::Get('c')); - CHECK_GE(c_big_axis, 0); + ICHECK_GE(c_big_axis, 0); Message none = NullValue(); // For now, we only support simple pattern (no folded weight/data) // More general layout can be supported under the current framework. @@ -520,11 +520,11 @@ Expr Conv2DForwardRewrite(const Call& ref_call, const Array& new_args, if (sdata == nullptr) return Expr(); if (sweight != nullptr) return Expr(); const auto* param = ref_call->attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout data_layout(param->data_layout); Layout kernel_layout(param->kernel_layout); int c_big_axis = data_layout.IndexOf(LayoutAxis::Get('C')); - CHECK_GE(c_big_axis, 0); + ICHECK_GE(c_big_axis, 0); int small_ko_axis = kernel_layout.IndexOf(LayoutAxis::Get('o')); int small_ki_axis = kernel_layout.IndexOf(LayoutAxis::Get('i')); int big_ki_axis = kernel_layout.IndexOf(LayoutAxis::Get('I')); @@ -532,11 +532,11 @@ Expr Conv2DForwardRewrite(const Call& ref_call, const Array& new_args, bool is_simple = (small_ko_axis < 0 && small_ki_axis < 0 && big_ki_axis >= 0); bool is_blocking = (small_ko_axis >= 0 && small_ki_axis >= 0 && big_ki_axis >= 0); - CHECK(is_simple || is_blocking); + ICHECK(is_simple || is_blocking); // Check it must be depthwise or full conv2d. bool is_depthwise_conv2d = IsDepthwiseConv2D(ref_call, param, kernel_layout); - CHECK(param->groups == 1 || is_depthwise_conv2d); + ICHECK(param->groups == 1 || is_depthwise_conv2d); Expr weight = new_args[1]; @@ -628,7 +628,7 @@ class BackwardPrep : private ExprVisitor { auto f = fprep.get(call->op, nullptr); if (f == nullptr) return; auto rit = ref_counter_.find(call); - CHECK(rit != ref_counter_.end()); + ICHECK(rit != ref_counter_.end()); // We only allow propagation of scale backward // if the expression is only referred by a single parent. 
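BackwardPrep above gates backward scale propagation on a use count: a scale may only flow backward through a node with exactly one consumer, otherwise the scale would have to be duplicated into every parent. The guard, extracted in isolation:

#include <unordered_map>

// One entry per expression node, filled by a prior traversal (as in
// BackwardPrep); propagation is legal only for single-consumer nodes.
bool CanFoldBackward(const std::unordered_map<const void*, int>& ref_counter,
                     const void* node) {
  auto it = ref_counter.find(node);
  return it != ref_counter.end() && it->second == 1;
}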
if (rit->second != 1) return; @@ -668,7 +668,7 @@ class BackwardTransformerNode : public Object, private ExprMutator { if (const CallNode* call_node = expr.as()) { return Transform(call_node, message, scale); } else { - CHECK(!message.defined()) << "outstanding scale"; + ICHECK(!message.defined()) << "outstanding scale"; return ExprMutator::VisitExpr(expr); } } @@ -738,7 +738,7 @@ Expr BackwardTransformerNode::Transform(const CallNode* call_node, Message messa memo_[call] = new_expr; return new_expr; } else { - CHECK(!message.defined()) << "outstanding scale"; + ICHECK(!message.defined()) << "outstanding scale"; return NormalCallTransform(call_node); } } @@ -807,13 +807,13 @@ Expr AddSubBackwardTransform(const Call& call, const Message& message, const Exp StructuralEqual equal; if (lhs_message.defined() && rhs_message.defined()) { - CHECK(equal(lhs_message->axes, rhs_message->axes)); - CHECK(equal(message->axes, lhs_message->axes)); + ICHECK(equal(lhs_message->axes, rhs_message->axes)); + ICHECK(equal(message->axes, lhs_message->axes)); Expr lhs = transformer->Transform(call->args[0], message, scale); Expr rhs = transformer->Transform(call->args[1], message, scale); return Call(call->op, {lhs, rhs}, call->attrs, call->type_args); } else if (lhs_message.defined()) { - CHECK(equal(message->axes, lhs_message->axes)); + ICHECK(equal(message->axes, lhs_message->axes)); Expr lhs = transformer->Transform(call->args[0], message, scale); Expr rhs = transformer->Transform(call->args[1], NullValue(), NullValue()); Expr rhs_scale = ReshapeOrExpandToMatchAxis(scale, tlhs->shape, message->axes); @@ -823,7 +823,7 @@ Expr AddSubBackwardTransform(const Call& call, const Message& message, const Exp rhs = Multiply(rhs, rhs_scale); return Call(call->op, {lhs, rhs}, call->attrs, call->type_args); } else if (rhs_message.defined()) { - CHECK(equal(message->axes, rhs_message->axes)); + ICHECK(equal(message->axes, rhs_message->axes)); Expr lhs = transformer->Transform(call->args[0], NullValue(), NullValue()); Expr rhs = transformer->Transform(call->args[1], message, scale); Expr lhs_scale = ReshapeOrExpandToMatchAxis(scale, trhs->shape, message->axes); @@ -852,13 +852,13 @@ RELAY_REGISTER_OP("subtract") // Multiply produces the scale-axis pair. Expr MultiplyBackwardTransform(const Call& call, const Message& message, const Expr& scale, const BackwardTransformer& transformer) { - CHECK(!message.defined()) << "outstanding scale"; + ICHECK(!message.defined()) << "outstanding scale"; const auto* tlhs = call->args[0]->type_as(); const auto* trhs = call->args[1]->type_as(); Message lhs_message = transformer->GetMessage(call->args[0]); Message rhs_message = transformer->GetMessage(call->args[1]); if (lhs_message.defined()) { - CHECK(lhs_message->axes.defined() && lhs_message->axes.size()); + ICHECK(lhs_message->axes.defined() && lhs_message->axes.size()); // NOTE we won't recursively call mutating on scale part. // since there won't be scale chance within scale part. 
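MatchBroadcastToLeftAxes and ReshapeOrExpandToMatchAxis, used by the multiply transform above, reduce to a shape computation: a length-C scale vector must be reshaped so it broadcasts against the data layout. A minimal sketch of that shape (the helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

// Shape for a per-channel scale broadcastable against an ndim-rank tensor,
// e.g. {1, C, 1, 1} when axis == 1 for an NCHW data tensor.
std::vector<int64_t> ScaleShapeForAxis(size_t ndim, size_t axis, int64_t channels) {
  std::vector<int64_t> shape(ndim, 1);
  shape[axis] = channels;
  return shape;
}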
Expr rhs = call->args[1]; @@ -867,7 +867,7 @@ Expr MultiplyBackwardTransform(const Call& call, const Message& message, const E return transformer->Transform(call->args[0], lhs_message, rhs); } } else if (rhs_message.defined()) { - CHECK(rhs_message->axes.defined() && rhs_message->axes.size()); + ICHECK(rhs_message->axes.defined() && rhs_message->axes.size()); Expr lhs = call->args[0]; if (MatchBroadcastToLeftAxes(trhs, tlhs, rhs_message->axes, &lhs) && (!rhs_message->require_positive || IsAllPositiveConstant(lhs))) { @@ -884,13 +884,13 @@ RELAY_REGISTER_OP("multiply") // Conv2D send out requirement of axis folding. Message Conv2DBackwardPrep(const Call& call, const Array& in_messages) { const auto* param = call->attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout kernel_layout(param->kernel_layout); Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); int c_big_axis = out_layout.IndexOf(LayoutAxis::Get('C')); int c_small_axis = out_layout.IndexOf(LayoutAxis::Get('c')); - CHECK_GE(c_big_axis, 0); + ICHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) // More general layout can be supported under the current framework. // By using a unified layout transformation. @@ -921,11 +921,11 @@ Expr Conv2DBackwardTransform(const Call& call, const Message& message, const Exp return transformer->NormalCallTransform(call.operator->()); } const auto* param = call->attrs.as(); - CHECK(param != nullptr); + ICHECK(param != nullptr); Layout kernel_layout(param->kernel_layout); Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); int c_big_axis = out_layout.IndexOf(LayoutAxis::Get('C')); - CHECK_GE(c_big_axis, 0); + ICHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) // TODO(tvm-team) support general data layout int small_ko_axis = kernel_layout.IndexOf(LayoutAxis::Get('o')); @@ -934,10 +934,10 @@ Expr Conv2DBackwardTransform(const Call& call, const Message& message, const Exp int big_ko_axis = kernel_layout.IndexOf(LayoutAxis::Get('O')); // Check it must be depthwise or full conv2d. 
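The recurring invariant ICHECK(param->groups == 1 || is_depthwise_conv2d) in the conv2d transforms above admits exactly two cases: an ungrouped convolution, or a depthwise one in which every input channel is convolved independently. As a predicate, simplified from IsDepthwiseConv2D (which also consults the kernel layout):

// Depthwise here means groups equals the input channel count and the kernel
// sees one input channel per group.
bool IsDepthwiseLike(int groups, int in_channels, int kernel_in_channels) {
  return groups > 1 && groups == in_channels && kernel_in_channels == 1;
}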
bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, kernel_layout); - CHECK(param->groups == 1 || is_depthwise_conv2d); + ICHECK(param->groups == 1 || is_depthwise_conv2d); bool is_simple = (small_ko_axis < 0 && small_ki_axis < 0 && big_ki_axis >= 0); bool is_blocking = (small_ko_axis >= 0 && small_ki_axis >= 0 && big_ki_axis >= 0); - CHECK(is_simple || is_blocking); + ICHECK(is_simple || is_blocking); Expr data = transformer->Transform(call->args[0], NullValue(), NullValue()); Expr weight = transformer->Transform(call->args[1], NullValue(), NullValue()); diff --git a/src/relay/transforms/forward_rewrite.cc b/src/relay/transforms/forward_rewrite.cc index 58396256105b..be2d37477eb6 100644 --- a/src/relay/transforms/forward_rewrite.cc +++ b/src/relay/transforms/forward_rewrite.cc @@ -89,7 +89,7 @@ class ForwardRewriter : private MixedModeMutator { if (fmulti_ref_trigger_ != nullptr) { Expr ret = post; auto it = ref_counter_.find(expr.get()); - CHECK(it != ref_counter_.end()); + ICHECK(it != ref_counter_.end()); if (it->second > 1) { ret = fmulti_ref_trigger_(ret); } @@ -136,7 +136,7 @@ class ForwardRewriter : private MixedModeMutator { if (rewrite_func_) { frewrite = *rewrite_func_; } else { - CHECK(rewrite_map_); + ICHECK(rewrite_map_); frewrite = rewrite_map_->get(call_node->op, nullptr); } const auto* post_node = post.as(); diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index bc6335a539af..8023305f3f64 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -188,9 +188,9 @@ class IndexedForwardGraph::Creator : private ExprVisitor { void AddNode(const tvm::Object* key) { auto it = graph_.node_map.find(key); - CHECK(it != graph_.node_map.end()) << "Cannot find node " << GetRef(key); + ICHECK(it != graph_.node_map.end()) << "Cannot find node " << GetRef(key); IndexedForwardGraph::Node* node = it->second; - CHECK(node->ref == nullptr); + ICHECK(node->ref == nullptr); node->ref = key; node->index = graph_.post_dfs_order.size(); graph_.post_dfs_order.push_back(node); @@ -226,7 +226,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor { } void VisitExpr_(const CallNode* call) final { - CHECK(graph_.node_map.count(call)); + ICHECK(graph_.node_map.count(call)); Node* node = graph_.node_map.at(call); static auto fpattern = Op::GetAttrMap("TOpPattern"); // Now we set the pattern of this call. @@ -270,7 +270,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor { } void VisitExpr_(const TupleNode* op) final { - CHECK(graph_.node_map.count(op)); + ICHECK(graph_.node_map.count(op)); Node* tuple_node = graph_.node_map.at(op); tuple_node->pattern = kTuple; for (const Expr& field : op->fields) { @@ -286,7 +286,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor { void VisitExpr_(const TupleGetItemNode* op) final { auto tuple_type = op->tuple->checked_type().as(); - CHECK(tuple_type); + ICHECK(tuple_type); // When TVM lowers a fused function, it expects all arguments to be a Tensor or // a tuple containing only Tensors. But this tuple may contain a reference or // another tuple. 
To avoid modifying codegen logic, we do not allow fusing through this node @@ -302,7 +302,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor { if (has_non_tensor) { this->Update(op->tuple, nullptr, kOpaque); } else { - CHECK(graph_.node_map.count(op)); + ICHECK(graph_.node_map.count(op)); Node* node = graph_.node_map.at(op); node->pattern = kInjective; this->Update(op->tuple, node, kInjective); @@ -443,9 +443,9 @@ class DominatorTree { } auto get_node = [&](const IndexedForwardGraph::Edge& edge) { size_t oindex = edge.node->index; - CHECK_LT(oindex, nodes.size()); + ICHECK_LT(oindex, nodes.size()); Node* onode = nodes[oindex]; - CHECK(onode != nullptr); + ICHECK(onode != nullptr); return onode; }; Node* parent = get_node(link->value); @@ -563,7 +563,7 @@ class GraphPartitioner { if (visited_.count(src)) return true; visited_.insert(src); Group* gnode = groups_[src->index]; - CHECK(gnode != nullptr); + ICHECK(gnode != nullptr); gnode = gnode->FindRoot(); if (!fcond(gnode->pattern, src == sink)) return false; if (src == sink) return true; @@ -586,9 +586,9 @@ class GraphPartitioner { */ template bool CheckPath(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink, F fcond) { - CHECK(!src->extern_ref); + ICHECK(!src->extern_ref); visited_.clear(); - CHECK(src != sink); + ICHECK(src != sink); for (auto link = src->outputs.head; link != nullptr; link = link->next) { if (!CheckPath_(link->value.node, sink, fcond)) return false; } @@ -616,7 +616,7 @@ class GraphPartitioner { child->parent = parent; // update anchor ref and pattern if (child->anchor_ref != nullptr) { - CHECK(parent->anchor_ref == nullptr); + ICHECK(parent->anchor_ref == nullptr); parent->anchor_ref = child->anchor_ref; parent->pattern = CombinePattern(child->pattern, parent->pattern); } @@ -627,7 +627,7 @@ class GraphPartitioner { if (visited_.count(src)) return; visited_.insert(src); Group* gnode = groups_[src->index]; - CHECK(gnode != nullptr); + ICHECK(gnode != nullptr); // merge the current group to the parent if possible. 
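// ---- Editor's note (illustrative) -------------------------------------------
// The group-merging code above treats fusion groups as a union-find forest:
// gnode->FindRoot() resolves a group to its representative before predicates
// run. A sketch of one common FindRoot implementation, with path compression,
// using a stand-in Group type:
struct GroupSketch {
  GroupSketch* parent = nullptr;

  GroupSketch* FindRoot() {
    GroupSketch* root = this;
    while (root->parent != nullptr) root = root->parent;
    // Path compression: point every node on the walk directly at the root.
    for (GroupSketch* p = this; p != root;) {
      GroupSketch* next = p->parent;
      p->parent = root;
      p = next;
    }
    return root;
  }
};
// -----------------------------------------------------------------------------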
MergeFromTo(gnode, target); for (auto link = src->outputs.head; link != nullptr; link = link->next) { @@ -643,7 +643,7 @@ class GraphPartitioner { void CommitFuse(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink) { Group* target = groups_[sink->index]; visited_.clear(); - CHECK(src != sink); + ICHECK(src != sink); CommitFuse_(src, sink, target); } @@ -651,7 +651,7 @@ class GraphPartitioner { if (src == sink || visited_.count(src)) return 0; visited_.insert(src); Group* gnode = groups_[src->index]; - CHECK(gnode != nullptr); + ICHECK(gnode != nullptr); auto sum = gnode->num_nodes; for (auto link = src->outputs.head; link != nullptr; link = link->next) { sum += CountNodesUptoSink_(link->value.node, sink); @@ -669,7 +669,7 @@ class GraphPartitioner { IndexedForwardGraph::Node* dom_parent) { Group* target = groups_[dom_parent->index]; visited_.clear(); - CHECK(child != dom_parent); + ICHECK(child != dom_parent); return target->FindRoot()->num_nodes + CountNodesUptoSink_(child, dom_parent); } @@ -696,12 +696,12 @@ class GraphPartitioner { auto* graph_node = graph.post_dfs_order[nid]; auto* dom_node = post_dom_tree.nodes[nid]; Group* group_node = groups_[nid]; - CHECK(group_node != nullptr); + ICHECK(group_node != nullptr); // no actions for opaque nodes if (group_node->pattern == kOpaque) continue; // no actions needed if the current node have no dominator if (dom_node->parent == nullptr) continue; - CHECK(!graph_node->extern_ref); + ICHECK(!graph_node->extern_ref); size_t dom_parent_gindex = dom_node->parent->gnode->index; // refuse the fusion if too many ops are going to be fused together @@ -740,7 +740,7 @@ class GraphPartitioner { // Path for OutEWiseFusable: conv2d // Check if the dominator relation is elemwise. if (dom_node->parent != nullptr && dom_node->pattern == kElemWise) { - CHECK(dom_node->parent->gnode != nullptr); + ICHECK(dom_node->parent->gnode != nullptr); // The fuse can be executed if all the intermediate ops are still broadcast. auto fcond = [](OpPatternKind kind, bool is_sink) { return kind <= kBroadcast; }; if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) { @@ -778,7 +778,7 @@ class GraphPartitioner { } } else { // do nothing. - CHECK(group_node->pattern == kCommReduce); + ICHECK(group_node->pattern == kCommReduce); } } } @@ -805,7 +805,7 @@ class FuseMutator : private ExprMutator { auto graph = IndexedForwardGraph::Create(&arena_, body); auto groups = GraphPartitioner(&arena_, fuse_opt_level, max_fuse_depth).Partition(graph); for (size_t nid = 0; nid < graph.post_dfs_order.size(); ++nid) { - CHECK(graph.post_dfs_order[nid]->ref != nullptr); + ICHECK(graph.post_dfs_order[nid]->ref != nullptr); gmap_[graph.post_dfs_order[nid]->ref] = groups[nid]; } // The following line can be used for debug. @@ -863,7 +863,7 @@ class FuseMutator : private ExprMutator { // If it is a primitive op call // then we must have a group assignment for it already. 
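// ---- Editor's note (illustrative) -------------------------------------------
// The CheckPath/CommitFuse traversals above verify a pattern predicate along
// every path from a source node to its dominator sink before fusing. A
// self-contained sketch of that traversal; names are stand-ins, not TVM types:
#include <functional>
#include <unordered_set>
#include <vector>

struct NodeSketch {
  std::vector<NodeSketch*> outputs;
  int pattern = 0;  // stand-in for OpPatternKind
};

bool CheckPathSketch(NodeSketch* src, NodeSketch* sink,
                     const std::function<bool(int, bool)>& fcond,
                     std::unordered_set<NodeSketch*>* visited) {
  if (visited->count(src)) return true;
  visited->insert(src);
  if (!fcond(src->pattern, src == sink)) return false;  // predicate sees is_sink
  if (src == sink) return true;
  for (NodeSketch* next : src->outputs) {
    if (!CheckPathSketch(next, sink, fcond, visited)) return false;
  }
  return true;
}
// -----------------------------------------------------------------------------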
- CHECK(gmap_.count(call)); + ICHECK(gmap_.count(call)); if (call->op == stop_fusion_op) { return ExprMutator::VisitExpr(call->args[0]); } diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/gradient.cc index 1722c90069cb..9441f8af5d27 100644 --- a/src/relay/transforms/gradient.cc +++ b/src/relay/transforms/gradient.cc @@ -74,7 +74,7 @@ Expr FirstOrderGradient(const Expr& e, const Optional& mod); Type WithGradientType(const Type& t) { // TODO(@M.K.): stricter checking auto ty = t.as(); - CHECK(ty) << "input should be a function"; + ICHECK(ty) << "input should be a function"; return FuncType(ty->arg_types, TupleType({ty->ret_type, TupleType(ty->arg_types)}), {}, {}); } @@ -102,7 +102,7 @@ struct ADValueNode { template T& get() { auto ret = dynamic_cast(this); - CHECK(ret) << "cannot downcast"; + ICHECK(ret) << "cannot downcast"; return *ret; } }; @@ -183,7 +183,7 @@ struct FirstOrderReverseAD : ExprFunctor { ADValue VisitExpr_(const OpNode* op) final { Op op_ref = GetRef(op); - CHECK(rev_map.count(op_ref)) << op->name << " does not have reverse mode defined"; + ICHECK(rev_map.count(op_ref)) << op->name << " does not have reverse mode defined"; return std::make_shared( [this, op_ref](const Type& orig_type, const std::vector& args, const Attrs& attrs, const tvm::Array& type_args) { @@ -196,7 +196,7 @@ struct FirstOrderReverseAD : ExprFunctor { auto ret = std::make_shared(ll, orig); backprop_actions.push_back([this, args, orig, ret, op_ref](LetList* ll) { tvm::Array rev = rev_map[op_ref](orig, ret->reverse); - CHECK(args.size() == rev.size()); + ICHECK(args.size() == rev.size()); for (size_t i = 0; i < args.size(); ++i) { args[i]->get().reverse = ll->Push(Add(args[i]->get().reverse, rev[i])); @@ -271,7 +271,7 @@ struct FirstOrderReverseAD : ExprFunctor { return std::make_shared( [this, f](const Type& orig_type, const std::vector& args, const Attrs& attrs, const tvm::Array& type_args) { - CHECK_EQ(f->params.size(), args.size()); + ICHECK_EQ(f->params.size(), args.size()); for (size_t i = 0; i < f->params.size(); ++i) { env[f->params[i]] = args[i]; } @@ -305,8 +305,8 @@ Expr FirstOrderGradient(const Expr& re, const Optional& mod) { // order case. auto e = DeGlobal(mod, re); auto f = e.as(); - CHECK(f) << "FOWithGradient expects its argument to be a function: " << f; - CHECK(f->type_params.size() == 0) << "no polymorphism supported for now"; + ICHECK(f) << "FOWithGradient expects its argument to be a function: " << f; + ICHECK(f->type_params.size() == 0) << "no polymorphism supported for now"; // We will then build a sequence of lets which implement reverse mode. Expr body = LetList::With([&](LetList* ll) { @@ -364,7 +364,7 @@ Type ReverseType(const Type& t) { return ReverseADType()(t); } Expr LiftTensor(const std::function& f, const std::function& tf, const Type& forward_type, const Expr& e, LetList* ll) { - CHECK(IsAtomic(e)) << e; + ICHECK(IsAtomic(e)) << e; if (forward_type.as()) { auto ret = ll->Push(f(e)); ret->checked_type_ = tf(forward_type); @@ -390,8 +390,8 @@ Expr LiftTensor(const std::function& f, * by stitching the references in the AD values. 
*/ void TransferGrads(const Type& forward_type, const Expr& from, const Expr& to, LetList* ll) { - CHECK(IsAtomic(from)) << from; - CHECK(IsAtomic(to)) << to; + ICHECK(IsAtomic(from)) << from; + ICHECK(IsAtomic(to)) << to; if (forward_type.as()) { auto from_ref = TupleGetItem(from, 1); auto to_ref = TupleGetItem(to, 1); @@ -487,9 +487,9 @@ struct ReverseAD : ExprMutator { Expr VisitCheckpoint(const CallNode* call) { const OpNode* op_node = call->op.as(); - CHECK(op_node) << "expected op in call"; + ICHECK(op_node) << "expected op in call"; Op op_ref = GetRef(op_node); - CHECK(op_ref->name == "annotation.checkpoint") << "expected checkpoint annotation"; + ICHECK(op_ref->name == "annotation.checkpoint") << "expected checkpoint annotation"; auto x = call->args[0]; return LetList::With([&](LetList* ll) { auto x_var = ll->Push(Remap(x)); @@ -518,7 +518,7 @@ struct ReverseAD : ExprMutator { return VisitCheckpoint(call); } - CHECK(rev_map.count(op_ref)) << op_node->name << " does not have reverse mode defined"; + ICHECK(rev_map.count(op_ref)) << op_node->name << " does not have reverse mode defined"; return LetList::With([&](LetList* ll) { std::vector args; for (const auto& arg : call->args) { @@ -536,7 +536,7 @@ struct ReverseAD : ExprMutator { auto bpv = ll->Push(RefRead(bp)); Expr nbp_body = LetList::With([&](LetList* ll) { tvm::Array rev = rev_map[op_ref](orig, GetGrad(call->checked_type(), ret, ll)); - CHECK(args.size() == rev.size()); + ICHECK(args.size() == rev.size()); for (size_t i = 0; i < args.size(); ++i) { UpdateGrad(call->args[i]->checked_type(), args[i], rev[i], ll); } @@ -585,7 +585,7 @@ struct ReverseAD : ExprMutator { Expr VisitExpr_(const GlobalVarNode* op) final { // todo: concatenating string to add attribute seems like a brittle hack. // maybe get module indexed by a rose tree of string? 
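// ---- Editor's note (illustrative) -------------------------------------------
// ADValueNode::get() in the gradient.cc hunks above uses the checked-downcast
// idiom; the template argument brackets were lost in this rendering of the
// patch. The shape of that code, with an exception standing in for ICHECK:
#include <stdexcept>

struct ADValueSketch {
  virtual ~ADValueSketch() = default;

  template <typename T>
  T& get() {
    auto* ret = dynamic_cast<T*>(this);
    if (ret == nullptr) throw std::runtime_error("cannot downcast");
    return *ret;
  }
};
// -----------------------------------------------------------------------------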
- CHECK(mod.defined()); + ICHECK(mod.defined()); auto orig_gv = GetRef(op); if (ad_gvars->count(orig_gv) == 0) { GlobalVar gv(op->name_hint + "_grad"); @@ -653,12 +653,12 @@ Expr Gradient(const Expr& re, const Optional& mod) { } auto e = DeGlobal(mod, re); auto f = e.as(); - CHECK(f) << "input need to be a function"; - CHECK(f->type_params.size() == 0) << "no polymorphism supported for now"; + ICHECK(f) << "input need to be a function"; + ICHECK(f->type_params.size() == 0) << "no polymorphism supported for now"; for (const auto& p : f->params) { - CHECK(p->checked_type().as()) << "input parameters need to be tensor"; + ICHECK(p->checked_type().as()) << "input parameters need to be tensor"; } - CHECK(!MissingGrad(e)) << "input has operators with missing gradients"; + ICHECK(!MissingGrad(e)) << "input has operators with missing gradients"; Expr body = LetList::With([&](LetList* ll) { Var bp = ll->Push(BPEmpty(), bpt); Expr rev = ReverseAD(mod, bp, std::make_shared(), @@ -676,7 +676,7 @@ Expr Gradient(const Expr& re, const Optional& mod) { if (t.as()) { ll->Push(RefWrite(GetField(e, 1), OnesLike(GetField(e, 0)))); } else if (auto tt = t.as()) { - CHECK_GT(tt->fields.size(), 0); + ICHECK_GT(tt->fields.size(), 0); init_grad(ll->Push(GetField(e, 0)), tt->fields[0]); } else { LOG(FATAL) << "unhandled type " << t; diff --git a/src/relay/transforms/infer_layout_utils.h b/src/relay/transforms/infer_layout_utils.h index 3965b0a6a78b..7edb07ce71ce 100644 --- a/src/relay/transforms/infer_layout_utils.h +++ b/src/relay/transforms/infer_layout_utils.h @@ -108,7 +108,7 @@ inline Array> ElemwiseArbitraryLayout(const Attrs& attrs, Layout ret; if (new_in_layouts.defined()) { - CHECK_GE(new_in_layouts.size(), 1); + ICHECK_GE(new_in_layouts.size(), 1); ret = new_in_layouts[0]; } else { for (size_t i = 0; i < old_in_layouts.size(); ++i) { @@ -130,7 +130,7 @@ inline Array> BinaryBroadcastLayout(const Attrs& attrs, Array layouts; Array> old_in_shapes; for (auto old_in_t : old_in_types) { - CHECK(old_in_t.as()); + ICHECK(old_in_t.as()); old_in_shapes.push_back(old_in_t.as()->shape); } @@ -217,7 +217,7 @@ static inline std::tuple, Array, bool> InferCorrectLayouts if (finfer_layout.count(op)) { Array> inferred_layouts; inferred_layouts = finfer_layout[op](call->attrs, new_in_layouts, old_in_layouts, old_in_types); - CHECK_EQ(inferred_layouts.size(), 2) + ICHECK_EQ(inferred_layouts.size(), 2) << "FInferCorrectLayout should return an array with size of 2"; for (auto x : inferred_layouts) { for (auto y : x) { diff --git a/src/relay/transforms/inline.cc b/src/relay/transforms/inline.cc index c9a0de44e2d4..dae34674de77 100644 --- a/src/relay/transforms/inline.cc +++ b/src/relay/transforms/inline.cc @@ -114,16 +114,16 @@ class Inliner : ExprMutator { // Make a new Relay expression to replace the callee. Expr MakeNewExpr(const GlobalVar& global, const Array& args, const Expr& callee) { - CHECK(callee->IsInstance() || callee->IsInstance()); + ICHECK(callee->IsInstance() || callee->IsInstance()); auto base_func = call_graph_->GetGlobalFunction(global); const auto* fn = base_func.as(); - CHECK(fn) << "Expected to work on a Relay function."; + ICHECK(fn) << "Expected to work on a Relay function."; auto func = Function(fn->params, fn->body, fn->ret_type, fn->type_params, fn->attrs); // Inline the function body to the caller if this function uses default // compiler, i.e. no external codegen is needed. 
if (!func->GetAttr(attr::kCompiler).defined()) { - CHECK_EQ(func->params.size(), args.size()) + ICHECK_EQ(func->params.size(), args.size()) << "Mismatch found in the number of parameters and call args"; // Bind the parameters with call args. Map bind_map; @@ -137,7 +137,7 @@ class Inliner : ExprMutator { // its body when the global var returns FuncType. return ret_type->IsInstance() ? std::move(func) : func->body; } else { - CHECK(callee->IsInstance()); + ICHECK(callee->IsInstance()); return Bind(func->body, bind_map); } } else if (const auto* call_node = callee.as()) { @@ -189,7 +189,7 @@ IRModule Inline(const IRModule& module) { if (const auto* fn = base_func.as()) { auto func = GetRef(fn); if (func->HasNonzeroAttr(attr::kInline)) { - CHECK_EQ(cgn->GetRefCount(), 0U) + ICHECK_EQ(cgn->GetRefCount(), 0U) << cgn->GetNameHint() << " is marked as inline but not inlined."; cgn->CleanCallGraphEntries(); cg->RemoveGlobalVarFromModule(cgn, /*update_call_graph*/ true); diff --git a/src/relay/transforms/lazy_gradient_init.cc b/src/relay/transforms/lazy_gradient_init.cc index de9406ec309d..079b790e74c0 100644 --- a/src/relay/transforms/lazy_gradient_init.cc +++ b/src/relay/transforms/lazy_gradient_init.cc @@ -131,8 +131,8 @@ class LazyGradientInitializer : public ExprMutator, public TypeMutator { auto* f = e.as(); auto* transformed = this->Mutate(e).as(); - CHECK(f); - CHECK(transformed); + ICHECK(f); + ICHECK(transformed); if (e.same_as(GetRef(transformed))) { return GetRef(transformed); diff --git a/src/relay/transforms/legalize.cc b/src/relay/transforms/legalize.cc index 89f59f625a8d..7daa028bbcf3 100644 --- a/src/relay/transforms/legalize.cc +++ b/src/relay/transforms/legalize.cc @@ -73,7 +73,7 @@ class Legalizer : public ExprRewriter { if (legalized_value.defined()) { // Check that the returned Expr from legalize is CallNode. const CallNode* legalized_call_node = legalized_value.as(); - CHECK(legalized_call_node) + ICHECK(legalized_call_node) << "Can only replace the original operator with another call node"; return legalized_value; } diff --git a/src/relay/transforms/let_list.h b/src/relay/transforms/let_list.h index c925dc0922a4..c75f18f6831c 100644 --- a/src/relay/transforms/let_list.h +++ b/src/relay/transforms/let_list.h @@ -64,8 +64,8 @@ class LetList { * \return a Var that hold the inserted expr. */ Var Push(Var pv, Expr expr) { - CHECK(!used_); - CHECK(WellFormed(expr)); + ICHECK(!used_); + ICHECK(WellFormed(expr)); lets_.emplace_back(std::make_pair(pv, expr)); return pv; } @@ -98,7 +98,7 @@ class LetList { * \return the wrapped expr. */ Expr Get(const Expr& body) { - CHECK(!used_); + ICHECK(!used_); Expr ret = body; for (auto rit = lets_.rbegin(); rit != lets_.rend(); ++rit) { ret = Let(std::get<0>(*rit), std::get<1>(*rit), ret); diff --git a/src/relay/transforms/merge_compiler_regions.cc b/src/relay/transforms/merge_compiler_regions.cc index 17fd44707b02..c7049bb4ee25 100644 --- a/src/relay/transforms/merge_compiler_regions.cc +++ b/src/relay/transforms/merge_compiler_regions.cc @@ -64,14 +64,14 @@ class RegionMerger : public MixedModeVisitor { // Check the region target. auto compiler_attrs = call->attrs.as(); - CHECK_EQ(region->GetTarget(), compiler_attrs->compiler); + ICHECK_EQ(region->GetTarget(), compiler_attrs->compiler); // Visit the unmerged parent regions. for (const auto& arg : region->GetInputs()) { // Region inputs must be begin annotation, and the region of // the begin annotation's argument is the parent region. 
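// ---- Editor's note (illustrative) -------------------------------------------
// The let_list.h hunks above guard two invariants: Push may not run after the
// list has been consumed, and Get wraps the body in the recorded lets from the
// inside out. A string-based sketch of those rules, not TVM's actual class:
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>

struct LetListSketch {
  std::vector<std::pair<std::string, std::string>> lets;  // (var, expr)
  bool used = false;

  void Push(std::string var, std::string expr) {
    if (used) std::abort();  // mirrors ICHECK(!used_)
    lets.emplace_back(std::move(var), std::move(expr));
  }

  std::string Get(std::string body) {
    if (used) std::abort();
    used = true;  // this sketch consumes the list; TVM's semantics may differ
    for (auto it = lets.rbegin(); it != lets.rend(); ++it)
      body = "let " + it->first + " = " + it->second + " in " + body;
    return body;
  }
};
// -----------------------------------------------------------------------------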
auto begin = Downcast(arg); - CHECK_EQ(begin->op, CompilerBeginOp()); + ICHECK_EQ(begin->op, CompilerBeginOp()); auto parent_region = regions_->GetRegion(begin->args[0]); // Skip this region if it has been merged. @@ -86,7 +86,7 @@ class RegionMerger : public MixedModeVisitor { std::unordered_set mergeable_regions; for (const auto& arg : region->GetInputs()) { auto begin = Downcast(arg); - CHECK_EQ(begin->op, CompilerBeginOp()); + ICHECK_EQ(begin->op, CompilerBeginOp()); auto parent_region = regions_->GetRegion(begin->args[0]); if (parent_region.defined()) { mergeable_regions.insert(parent_region); diff --git a/src/relay/transforms/merge_composite.cc b/src/relay/transforms/merge_composite.cc index 7e7ad0e665a7..51f1387fd9ca 100644 --- a/src/relay/transforms/merge_composite.cc +++ b/src/relay/transforms/merge_composite.cc @@ -46,7 +46,7 @@ Function InferType(const Function& expr, const IRModule& m) { Expr MergeComposite(const Function& func, const Array& pattern_names, const Array& patterns, const std::vector& checks, const IRModule& m) { - CHECK_EQ(pattern_names.size(), patterns.size()); + ICHECK_EQ(pattern_names.size(), patterns.size()); Function merged_func = func; // merge the patterns one-by-one in order for (size_t i = 0; i < patterns.size(); i++) { diff --git a/src/relay/transforms/partial_eval.cc b/src/relay/transforms/partial_eval.cc index 276d093d6993..fa080a7ff22c 100644 --- a/src/relay/transforms/partial_eval.cc +++ b/src/relay/transforms/partial_eval.cc @@ -279,7 +279,7 @@ class FuelNode : public RelayNode { } /*! \brief return the new Fuel, and write (*progress | is progress made) to *progress. */ virtual Fuel Meet(const Fuel& f, bool* progress) const { - CHECK(progress); + ICHECK(progress); auto ret = Meet(f); *progress |= std::get<1>(ret); return std::get<0>(ret); @@ -295,8 +295,8 @@ struct FSeqNode : FuelNode { std::vector fuels; Fuel Meet(const Fuel& f, bool* progress) const final { auto x = f.as(); - CHECK(x); - CHECK_EQ(fuels.size(), x->fuels.size()); + ICHECK(x); + ICHECK_EQ(fuels.size(), x->fuels.size()); std::vector new_fuels; for (size_t i = 0; i < fuels.size(); ++i) { new_fuels.push_back(fuels[i]->Meet(x->fuels[i], progress)); @@ -320,7 +320,7 @@ struct FTimeNode : FuelNode { Time time; std::tuple Meet(const Fuel& f) const final { auto x = f.as(); - CHECK(x); + ICHECK(x); Time new_time = std::min(time, x->time); return std::make_tuple(MkFTime(new_time), new_time < time); } @@ -342,7 +342,7 @@ struct FTValueNode : FuelNode { size_t tvalue; std::tuple Meet(const Fuel& f) const final { auto x = f.as(); - CHECK(x); + ICHECK(x); size_t new_tvalue = std::min(tvalue, x->tvalue); return std::make_tuple(MkFTValue(new_tvalue), new_tvalue < tvalue); } @@ -401,9 +401,9 @@ class Environment { } void Insert(const Var& v, const PStatic& ps) { - CHECK(ps.defined()); - CHECK_GT(env_.size(), 0); - CHECK_EQ(env_.back().locals.count(v), 0); + ICHECK(ps.defined()); + ICHECK_GT(env_.size(), 0); + ICHECK_EQ(env_.back().locals.count(v), 0); env_.back().locals[v] = ps; } @@ -459,7 +459,7 @@ class Store { } void Insert(const SRefNode* r, const PStatic& ps) { - CHECK(r); + ICHECK(r); store_.back().store[r] = ps; } @@ -503,7 +503,7 @@ class Store { }; PStatic HasStatic(const Static& stat, const Expr& dynamic) { - CHECK(stat.defined()); + ICHECK(stat.defined()); return PStatic(make_object(stat, dynamic)); } @@ -579,8 +579,8 @@ Function AsFunc(const Expr& e) { if (e.as()) { return Downcast(e); } else if (const CallNode* c = e.as()) { - CHECK(c->op == with_funcid_op); - CHECK_EQ(c->args.size(), 
1); + ICHECK(c->op == with_funcid_op); + ICHECK_EQ(c->args.size(), 1); return AsFunc(c->args[0]); } else { LOG(FATAL) << "Unknown case"; @@ -595,20 +595,20 @@ class PartialEvaluator : public ExprFunctor PStatic VisitExpr(const Expr& e, LetList* ll) final { PStatic ret = ExprFunctor::VisitExpr(e, ll); - CHECK(IsAtomic(ret->dynamic)) << ret->dynamic; + ICHECK(IsAtomic(ret->dynamic)) << ret->dynamic; return ret; } PStatic VisitExpr(const Expr& e, LetList* ll, const Var& name) { if (const CallNode* c = e.as()) { if (c->op == with_funcid_op) { - CHECK_EQ(c->args.size(), 1); + ICHECK_EQ(c->args.size(), 1); return VisitExpr(c->args[0], ll, name); } } PStatic ret = e.as() ? VisitFunc(Downcast(e), ll, name) : VisitExpr(e, ll); - CHECK(IsAtomic(ret->dynamic)) << ret->dynamic; + ICHECK(IsAtomic(ret->dynamic)) << ret->dynamic; return ret; } @@ -639,7 +639,7 @@ class PartialEvaluator : public ExprFunctor PStatic VisitExpr_(const VarNode* op, LetList* ll) final { return env_.Lookup(GetRef(op)); } PStatic VisitGlobalVar(const GlobalVar& gv) { - CHECK(mod_.defined()); + ICHECK(mod_.defined()); if (gv_map_.count(gv) == 0) { BaseFunc base_func = mod_->Lookup(gv); if (auto* n = base_func.as()) { @@ -670,7 +670,7 @@ class PartialEvaluator : public ExprFunctor PStatic c = VisitExpr(op->cond, ll); if (c->pstatic.defined()) { NDArray cpu_array = Downcast(c->pstatic)->data.CopyTo(CPUContext()); - CHECK_EQ(DataType(cpu_array->dtype), DataType::Bool()); + ICHECK_EQ(DataType(cpu_array->dtype), DataType::Bool()); if (reinterpret_cast(cpu_array->data)[0]) { return VisitExpr(op->true_branch, ll); } else { @@ -719,7 +719,7 @@ class PartialEvaluator : public ExprFunctor PStatic VisitExpr_(const CallNode* op, LetList* ll) final { if (op->op == with_funcid_op) { - CHECK_EQ(op->args.size(), 1); + ICHECK_EQ(op->args.size(), 1); return VisitExpr(op->args[0], ll); } PStatic f = VisitExpr(op->op, ll); @@ -743,7 +743,7 @@ class PartialEvaluator : public ExprFunctor FuncId fid_; Fuel old_fuel; FuelFrame(PartialEvaluator* pe, FuncId fid, const Fuel& new_fuel) : pe_(pe), fid_(fid) { - CHECK_GT(pe_->fuel_map_.count(fid_), 0); + ICHECK_GT(pe_->fuel_map_.count(fid_), 0); old_fuel = pe_->fuel_map_[fid_]; pe_->fuel_map_[fid_] = new_fuel; } @@ -775,7 +775,7 @@ class PartialEvaluator : public ExprFunctor } Func VisitFuncStatic(const Function& func, const Expr& var) { - CHECK(IsAtomic(var)); + ICHECK(IsAtomic(var)); if (func->HasNonzeroAttr(attr::kPrimitive)) { return ConstEvaluateFunc(func); } @@ -788,8 +788,8 @@ class PartialEvaluator : public ExprFunctor return [=](const PStatic& self, const std::vector& pv, const Attrs& attrs, const tvm::Array& type_args, LetList* ll) { return env_.Extend([&]() { - CHECK_EQ(pv.size(), func->params.size()); - CHECK_GT(func_map_.count(func), 0); + ICHECK_EQ(pv.size(), func->params.size()); + ICHECK_GT(func_map_.count(func), 0); FuncId fid = func_map_.at(func); if (fuel_map_.count(fid) == 0) { fuel_map_.insert({fid, MkFTop()}); @@ -914,7 +914,7 @@ class PartialEvaluator : public ExprFunctor } Func ConstEvaluateFunc(const Expr& expr) { - CHECK_EQ(FreeVars(expr).size(), 0); + ICHECK_EQ(FreeVars(expr).size(), 0); return [=](const PStatic& self, const std::vector& pv, const Attrs& attrs, const tvm::Array& type_args, LetList* ll) { tvm::Array ns_args; @@ -1002,10 +1002,10 @@ class PartialEvaluator : public ExprFunctor MatchStatus VisitPattern_(const PatternConstructorNode* op, const PStatic& ps) final { if (ps->pstatic.defined()) { SConstructor scn = Downcast(ps->pstatic); - CHECK_NE(op->constructor->tag, -1); 
- CHECK_NE(scn->constructor->tag, -1); + ICHECK_NE(op->constructor->tag, -1); + ICHECK_NE(scn->constructor->tag, -1); if (op->constructor->tag == scn->constructor->tag) { - CHECK_EQ(op->patterns.size(), scn->fields.size()); + ICHECK_EQ(op->patterns.size(), scn->fields.size()); MatchStatus current_match_status = MatchStatus::Match; for (size_t i = 0; i < op->patterns.size(); ++i) { MatchStatus ms = VisitPattern(op->patterns[i], scn->fields[i]); @@ -1029,7 +1029,7 @@ class PartialEvaluator : public ExprFunctor MatchStatus VisitPattern_(const PatternTupleNode* op, const PStatic& ps) final { if (ps->pstatic.defined()) { STuple stn = Downcast(ps->pstatic); - CHECK_EQ(op->patterns.size(), stn->fields.size()); + ICHECK_EQ(op->patterns.size(), stn->fields.size()); MatchStatus current_match_status = MatchStatus::Match; for (size_t i = 0; i < op->patterns.size(); ++i) { MatchStatus ms = VisitPattern(op->patterns[i], stn->fields[i]); @@ -1055,7 +1055,7 @@ class PartialEvaluator : public ExprFunctor void VisitExpr_(const FunctionNode* op) final { Function f = GetRef(op); - CHECK_EQ(pe->func_map_.count(f), 0); + ICHECK_EQ(pe->func_map_.count(f), 0); pe->func_map_.insert({f, pe->func_map_.size()}); VisitExpr(f->body); } @@ -1072,13 +1072,13 @@ class PartialEvaluator : public ExprFunctor void VisitExpr_(const CallNode* op) final { if (op->op == with_funcid_op) { - CHECK_EQ(op->args.size(), 1); - CHECK(op->attrs.defined()); - CHECK(op->attrs.as()); + ICHECK_EQ(op->args.size(), 1); + ICHECK(op->attrs.defined()); + ICHECK(op->attrs.as()); Function f = AsFunc(op->args[0]); FuncId fid = op->attrs.as()->fid; if (pe->func_map_.count(f) != 0) { - CHECK_EQ(pe->func_map_.at(f), fid); + ICHECK_EQ(pe->func_map_.at(f), fid); } pe->func_map_.insert({f, fid}); } @@ -1087,7 +1087,7 @@ class PartialEvaluator : public ExprFunctor void VisitExpr_(const FunctionNode* op) final { Function f = GetRef(op); - CHECK_GT(pe->func_map_.count(f), 0); + ICHECK_GT(pe->func_map_.count(f), 0); ExprVisitor::VisitExpr_(op); } @@ -1104,7 +1104,7 @@ class PartialEvaluator : public ExprFunctor Expr VisitExpr_(const FunctionNode* op) final { Function f = GetRef(op); - CHECK_GT(pe->func_map_.count(f), 0); + ICHECK_GT(pe->func_map_.count(f), 0); return MkWithFuncId(ExprMutator::VisitExpr_(op), pe->func_map_.at(f)); } @@ -1163,7 +1163,7 @@ Expr StripWithFuncId(const Expr& e) { struct StripWithFuncIdMutator : ExprMutator, PatternMutator { Expr VisitExpr_(const CallNode* op) final { if (op->op == with_funcid_op) { - CHECK_EQ(op->args.size(), 1); + ICHECK_EQ(op->args.size(), 1); return VisitExpr(op->args[0]); } else { return ExprMutator::VisitExpr_(op); diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index 08d26d76ee2d..75bc46387cc6 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -130,7 +130,7 @@ class Partitioner : public MixedModeMutator { return post; } else if (call->op == CompilerBeginOp()) { // The annotation node is inserted on edge so it must have only one argument. - CHECK_EQ(call->args.size(), 1U); + ICHECK_EQ(call->args.size(), 1U); // Traverse the rest graph. 
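// ---- Editor's note (illustrative) -------------------------------------------
// The partial_eval pattern-matching hunks above fold per-field results into a
// single status: any NoMatch is decisive, otherwise any Unknown, otherwise
// Match. A sketch of that aggregation:
#include <vector>

enum class MatchStatusSketch { kMatch, kNoMatch, kUnknown };

MatchStatusSketch CombineMatches(const std::vector<MatchStatusSketch>& fields) {
  MatchStatusSketch current = MatchStatusSketch::kMatch;
  for (MatchStatusSketch ms : fields) {
    if (ms == MatchStatusSketch::kNoMatch) return MatchStatusSketch::kNoMatch;
    if (ms == MatchStatusSketch::kUnknown) current = MatchStatusSketch::kUnknown;
  }
  return current;
}
// -----------------------------------------------------------------------------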
Expr parent = call->args[0]; @@ -147,7 +147,7 @@ class Partitioner : public MixedModeMutator { AnnotatedRegion sg = GetRegion(GetRef(call)); int index = GetArgIdx(sg, GetRef(call)); - CHECK_NE(index, -1); + ICHECK_NE(index, -1); if (region_func_meta_[sg].region_func_in.count(parent)) { return region_func_meta_[sg].region_func_in[parent]; @@ -169,10 +169,10 @@ class Partitioner : public MixedModeMutator { return std::move(var); } } else { - CHECK_EQ(call->op, CompilerEndOp()); + ICHECK_EQ(call->op, CompilerEndOp()); // The annotation node is inserted on edge so it must have only one // argument. - CHECK_EQ(call->args.size(), 1U); + ICHECK_EQ(call->args.size(), 1U); AnnotatedRegion region = GetRegion(GetRef(call)); @@ -182,7 +182,7 @@ class Partitioner : public MixedModeMutator { // Traverse subgraph inputs. auto input = Downcast(post)->args[0]; - CHECK(region.defined()) << "Region not defined for " << GetRef(call); + ICHECK(region.defined()) << "Region not defined for " << GetRef(call); // functions are created for each annotated regions, // when their first output is encountered. // If multiple outputs are there, a tuple node is inserted at the end. @@ -194,7 +194,7 @@ class Partitioner : public MixedModeMutator { // Retrieve this particular output of function. Expr region_out_expr = Downcast(GetRef(call))->args[0]; - CHECK(region_func_meta_[region].region_func_out.count(region_out_expr)); + ICHECK(region_func_meta_[region].region_func_out.count(region_out_expr)); return region_func_meta_[region].region_func_out[region_out_expr]; } } @@ -325,7 +325,7 @@ class Partitioner : public MixedModeMutator { global_region_func = WithAttr(std::move(global_region_func), attr::kInline, tvm::Integer(1)); std::string fname = name; - CHECK(!module_->ContainGlobalVar(fname)) << "Global function " << fname << " already exists"; + ICHECK(!module_->ContainGlobalVar(fname)) << "Global function " << fname << " already exists"; // Create a global function and add it to the IRModule for the region. 
// This way we lift the functions that should be handled by external // codegen to the module scope and rely on the pass manager to prevent @@ -444,7 +444,7 @@ IRModule FlattenTupleOutputs(IRModule module) { if (call->op == CompilerEndOp()) { std::string target = call->attrs.as()->compiler; // Arguments of annotation ops should be 1 - CHECK_EQ(call->args.size(), 1U); + ICHECK_EQ(call->args.size(), 1U); auto annotated_op = Downcast(post)->args[0]; if (const auto* tn = annotated_op.as()) { Array new_fields; diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h index 82ffd8a17c1b..555391a27e4b 100644 --- a/src/relay/transforms/pattern_utils.h +++ b/src/relay/transforms/pattern_utils.h @@ -163,7 +163,7 @@ inline Expr ExpandBiasToMatchAxis(Expr bias, int target_ndim, const Arrayvalue - axes[i - 1]->value; - CHECK_GE(diff, 0L); + ICHECK_GE(diff, 0L); if (diff > 0) { auto attrs = make_object(); attrs->axis = i; @@ -199,7 +199,7 @@ inline int64_t GetConv2DSuperChannelsDim(const CallNode* call) { auto param = call->attrs.as(); auto tweight = call->args[1]->type_as(); auto index = param->kernel_layout.operator std::string().find('O'); - CHECK_NE(index, std::string::npos); + ICHECK_NE(index, std::string::npos); auto channels = tir::as_const_int(tweight->shape[index]); return *channels; } @@ -331,8 +331,8 @@ static inline Constant CheckConstantShape(const Array& shape) { auto* shape_data = static_cast(shape_array->data); for (size_t i = 0; i < shape.size(); ++i) { const auto& dim_val = shape[i].as(); - CHECK(dim_val) << "Do not support symbolic shape for " - "Array format. Pass shape as Expr instead."; + ICHECK(dim_val) << "Do not support symbolic shape for " + "Array format. Pass shape as Expr instead."; shape_data[i] = dim_val->value; } return Constant(shape_array); @@ -350,8 +350,8 @@ static inline Array CheckConstantShapeArrayInteger(const Array(); - CHECK(dim_val) << "Do not support symbolic shape for " - "Array format. Pass shape as Expr instead."; + ICHECK(dim_val) << "Do not support symbolic shape for " + "Array format. 
Pass shape as Expr instead."; constShape.push_back(dim_val->value); } @@ -423,7 +423,7 @@ static inline long double ToScalar(const runtime::NDArray& array, size_t i = 0) */ static inline Array ToVector(const runtime::NDArray& array) { size_t ndim = array.Shape().size(); - CHECK_EQ(ndim, 1) << "This function should only be used for 1D NDArrays"; + ICHECK_EQ(ndim, 1) << "This function should only be used for 1D NDArrays"; size_t len = array.Shape().front(); Array out; for (size_t i = 0; i < len; ++i) { @@ -440,7 +440,7 @@ static inline Array ToVector(const runtime::NDArray& array) { */ static inline Array> ToMatrix(const runtime::NDArray& array) { size_t ndim = array.Shape().size(); - CHECK_EQ(ndim, 2) << "This function should only used for 2D NDArrays"; + ICHECK_EQ(ndim, 2) << "This function should only used for 2D NDArrays"; size_t dim1 = array.Shape().at(0); size_t dim2 = array.Shape().at(1); @@ -494,8 +494,8 @@ inline Expr Log(Expr e) { template T GetScalarFromConstant(Expr expr) { const auto* n = expr.as(); - CHECK(n) << "Expr must be a constant expr - " << AsText(expr, false); - CHECK(n->is_scalar()); + ICHECK(n) << "Expr must be a constant expr - " << AsText(expr, false); + ICHECK(n->is_scalar()); return static_cast(n->data->data)[0]; } diff --git a/src/relay/transforms/simplify_fc_transpose.cc b/src/relay/transforms/simplify_fc_transpose.cc index 99ded0ba591d..b5090e7e6fe4 100644 --- a/src/relay/transforms/simplify_fc_transpose.cc +++ b/src/relay/transforms/simplify_fc_transpose.cc @@ -81,7 +81,7 @@ class FCTransposeMutator : public ExprRewriter { explicit FCTransposeMutator(const Array& target_weights) : dense_op_(Op::Get("nn.dense")), transpose_op_(Op::Get("transpose")) { for (size_t i = 0; i < target_weights.size(); ++i) { - CHECK(target_weights[i]->IsInstance()); + ICHECK(target_weights[i]->IsInstance()); std::string k = target_weights[i].as()->data; target_weights_.emplace(k); } @@ -96,7 +96,7 @@ class FCTransposeMutator : public ExprRewriter { const auto arg = weight->args[0]; if (arg.as()) { const auto& arg_node = arg.as(); - CHECK_GT(target_weights_.count(arg_node->name_hint()), 0); + ICHECK_GT(target_weights_.count(arg_node->name_hint()), 0); const auto& tt = arg_node->type_annotation.as(); auto wt_type = TensorType({tt->shape[1], tt->shape[0]}, tt->dtype); Var wt(arg_node->name_hint() + ".T", wt_type); diff --git a/src/relay/transforms/simplify_inference.cc b/src/relay/transforms/simplify_inference.cc index 7df71967d834..7e587664b4dc 100644 --- a/src/relay/transforms/simplify_inference.cc +++ b/src/relay/transforms/simplify_inference.cc @@ -34,7 +34,7 @@ namespace relay { Expr BatchNormToInferUnpack(const Attrs attrs, Expr data, Expr gamma, Expr beta, Expr moving_mean, Expr moving_var, Type tdata) { auto ttype = tdata.as(); - CHECK(ttype); + ICHECK(ttype); const auto param = attrs.as(); Expr epsilon = MakeConstantScalar(ttype->dtype, static_cast(param->epsilon)); Expr var_add_eps = Add(moving_var, epsilon); @@ -62,9 +62,9 @@ Expr BatchNormToInferUnpack(const Attrs attrs, Expr data, Expr gamma, Expr beta, Expr GroupNormToInferUnpack(const Attrs attrs, Expr data, Expr gamma, Expr beta, Type tdata) { auto ttype = tdata.as(); - CHECK(ttype); + ICHECK(ttype); const auto param = attrs.as(); - CHECK(param); + ICHECK(param); int ndim = ttype->shape.size(); int axis = (param->axis < 0) ? 
param->axis + ndim : param->axis; @@ -117,9 +117,9 @@ Expr GroupNormToInferUnpack(const Attrs attrs, Expr data, Expr gamma, Expr beta, Expr LayerNormToInferUnpack(const Attrs attrs, Expr data, Expr gamma, Expr beta, Type tdata) { auto ttype = tdata.as(); - CHECK(ttype); + ICHECK(ttype); const auto param = attrs.as(); - CHECK(param); + ICHECK(param); Expr epsilon = MakeConstantScalar(ttype->dtype, static_cast(param->epsilon)); Expr mean = Mean(data, {param->axis}, true, false); @@ -140,9 +140,9 @@ Expr LayerNormToInferUnpack(const Attrs attrs, Expr data, Expr gamma, Expr beta, Expr InstanceNormToInferUnpack(const Attrs attrs, Expr data, Expr gamma, Expr beta, Type tdata) { auto ttype = tdata.as(); - CHECK(ttype); + ICHECK(ttype); const auto param = attrs.as(); - CHECK(param); + ICHECK(param); int ndim = ttype->shape.size(); int axis = (param->axis < 0) ? param->axis + ndim : param->axis; @@ -168,7 +168,7 @@ Expr InstanceNormToInferUnpack(const Attrs attrs, Expr data, Expr gamma, Expr be Expr L2NormToInferUnpack(const Attrs attrs, Expr data) { const auto param = attrs.as(); - CHECK(param); + ICHECK(param); Expr epsilon = MakeConstantScalar(DataType::Float(32), static_cast(param->eps)); diff --git a/src/relay/transforms/to_a_normal_form.cc b/src/relay/transforms/to_a_normal_form.cc index e5d7b133c0c0..05844477cc5b 100644 --- a/src/relay/transforms/to_a_normal_form.cc +++ b/src/relay/transforms/to_a_normal_form.cc @@ -65,7 +65,7 @@ std::pair CalcScope(const DependencyGraph& dg) { auto iit = n->parents.head; Scope s; if (iit == nullptr) { - CHECK(!global_scope_used); + ICHECK(!global_scope_used); s = global_scope; global_scope_used = true; } else { @@ -90,7 +90,7 @@ std::pair CalcScope(const DependencyGraph& dg) { expr_scope.insert({n, s}); } } - CHECK(global_scope_used); + ICHECK(global_scope_used); return std::make_pair(expr_scope, lifted_exprs); } @@ -114,11 +114,11 @@ Scope Fill::GetSubScope(const Expr& e, size_t i) { DependencyGraph::Node* n = dg_.expr_node.at(e); auto h = n->children.head; while (i != 0) { - CHECK(h); + ICHECK(h); --i; h = h->next; } - CHECK(h); + ICHECK(h); return node_scope_->at(h->value); } @@ -130,7 +130,7 @@ Expr Fill::VisitExpr(const Expr& e, const Var& v) { } auto ret = memo.at(e); // if no include_set is specified, every expression should be atomic. 
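// ---- Editor's note (illustrative) -------------------------------------------
// CalcScope in the to_a_normal_form.cc hunks above places each expression in
// the least common ancestor of its parents' scopes, with a unique global scope
// for parentless nodes. A sketch of the LCA walk, assuming both scopes live in
// the same tree:
struct ScopeSketch {
  ScopeSketch* parent = nullptr;
  int depth = 0;
};

ScopeSketch* LowestCommonAncestor(ScopeSketch* a, ScopeSketch* b) {
  if (a == nullptr) return b;
  if (b == nullptr) return a;
  while (a->depth > b->depth) a = a->parent;  // equalize depths first
  while (b->depth > a->depth) b = b->parent;
  while (a != b) {
    a = a->parent;
    b = b->parent;
  }
  return a;
}
// -----------------------------------------------------------------------------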
- if (include_set_ == nullptr) CHECK(IsAtomic(ret)); + if (include_set_ == nullptr) ICHECK(IsAtomic(ret)); return ret; } @@ -258,12 +258,12 @@ IRModule ToANormalForm(const IRModule& m) { tvm::Map updates; auto funcs = m->functions; for (const auto& it : funcs) { - CHECK_EQ(FreeVars(it.second).size(), 0); + ICHECK_EQ(FreeVars(it.second).size(), 0); if (const auto* n = it.second.as()) { if (n->GetAttr(attr::kCompiler).defined()) continue; } Expr ret = TransformF([&](const Expr& e) { return transform::ToANormalForm(e); }, it.second); - CHECK_EQ(FreeVars(ret).size(), 0) + ICHECK_EQ(FreeVars(ret).size(), 0) << AsText(ret) << "should not has free vars: " << FreeVars(ret); updates.Set(it.first, Downcast(ret)); } diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index fcec4e80ce5b..1aab367cf22a 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -54,7 +54,7 @@ IRModule ToBasicBlockNormalForm(const IRModule& mod) { tvm::Map updates; auto funcs = mod->functions; for (const auto& it : funcs) { - CHECK_EQ(FreeVars(it.second).size(), 0) << "Expected no free variables"; + ICHECK_EQ(FreeVars(it.second).size(), 0) << "Expected no free variables"; if (const auto* n = it.second.as()) { if (n->GetAttr(attr::kCompiler).defined()) continue; } diff --git a/src/relay/transforms/to_cps.cc b/src/relay/transforms/to_cps.cc index 5ece50133172..b7f9cafbc7dc 100644 --- a/src/relay/transforms/to_cps.cc +++ b/src/relay/transforms/to_cps.cc @@ -134,7 +134,7 @@ Function ToCPS(const Function& f, const IRModule& m, CPSMap* cm, VarMap* vm, } Expr VisitExpr_(const FunctionNode* op, const MCont& k) final { - CHECK(!op->HasNonzeroAttr(attr::kPrimitive)) << "primitive func not supported yet."; + ICHECK(!op->HasNonzeroAttr(attr::kPrimitive)) << "primitive func not supported yet."; return k(ToCPS(GetRef(op), m, cm, vm, answer)); } @@ -309,14 +309,14 @@ Function ToCPS(const Function& f, const IRModule& m) { Function UnCPS(const Function& f) { CheckFeature(f, FeatureSet::All() - fGraph); - CHECK_GT(f->params.size(), 0); + ICHECK_GT(f->params.size(), 0); std::vector new_params; for (const auto& p : f->params) { new_params.push_back(Var(p->name_hint(), p->checked_type())); } auto cont_type = Downcast(new_params.back()->type_annotation); new_params.pop_back(); - CHECK_EQ(cont_type->arg_types.size(), 1); + ICHECK_EQ(cont_type->arg_types.size(), 1); auto new_ret_type = Type(cont_type->arg_types[0]); std::vector new_type_params; for (const auto& tp : f->type_params) { @@ -325,7 +325,7 @@ Function UnCPS(const Function& f) { auto answer_type = new_type_params.back(); new_type_params.pop_back(); // TODO(@M.K.): make alphaequal work on free term - // CHECK(tvm::StructuralEqual()(cont_type, Arrow(new_ret_type, answer_type))); + // ICHECK(tvm::StructuralEqual()(cont_type, Arrow(new_ret_type, answer_type))); auto x = Var("x", new_ret_type); auto cont = Function({x}, x, new_ret_type, {}, {}); tvm::Array args; diff --git a/src/relay/transforms/transform_layout.h b/src/relay/transforms/transform_layout.h index c250d3801b68..35fb176c6bca 100644 --- a/src/relay/transforms/transform_layout.h +++ b/src/relay/transforms/transform_layout.h @@ -138,9 +138,9 @@ class TransformMemorizer : public ObjectRef { } // 2) Insert layout transform on the transformed src. 
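// ---- Editor's note (illustrative) -------------------------------------------
// The guard just below requires both layouts to be defined and mutually
// convertible before a layout_transform is inserted. A toy convertibility test
// for simple (unblocked) layouts, under the assumption that convertibility
// means the same axis letters in a different order, e.g. NCHW vs NHWC; TVM's
// real test is the bijectivity check named in the diff:
#include <algorithm>
#include <string>

bool LayoutsConvertibleSketch(std::string src, std::string dst) {
  if (src.empty() || dst.empty()) return false;  // undefined layout
  std::sort(src.begin(), src.end());
  std::sort(dst.begin(), dst.end());
  return src == dst;  // same multiset of axes, so a permutation maps one to the other
}
// -----------------------------------------------------------------------------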
- CHECK(new_src_layout.defined() && dst_layout.defined()) + ICHECK(new_src_layout.defined() && dst_layout.defined()) << "Cannot insert layout transform because there are undefined layouts"; - CHECK(tir::BijectiveLayout(new_src_layout, dst_layout).defined()) + ICHECK(tir::BijectiveLayout(new_src_layout, dst_layout).defined()) << "Cannot insert layout transform because there are inconvertible layouts: " << new_src_layout << " v.s. " << dst_layout; return MakeLayoutTransform(input_expr, new_src_layout.name(), dst_layout.name()); @@ -299,7 +299,7 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj if (!success) { return Expr(nullptr); } - CHECK_EQ(old_in.size(), new_in.size()); + ICHECK_EQ(old_in.size(), new_in.size()); // if new_in == 'undef': new_in = old_in for (size_t i = 0; i < new_in.size(); ++i) { @@ -322,9 +322,9 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj return Expr(nullptr); } - CHECK_EQ(new_out.size(), old_out.size()) + ICHECK_EQ(new_out.size(), old_out.size()) << "The number of output nodes should keep the same during alter_op_layout"; - CHECK_EQ(new_in.size(), new_in2.size()) + ICHECK_EQ(new_in.size(), new_in2.size()) << "The number of input nodes should keep the same during alter_op_layout"; // if (new_in != new_in2): insert transform (new_in -> new_in2) @@ -344,7 +344,7 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj pt++; } } - CHECK_EQ(pt, inputs.size()); + ICHECK_EQ(pt, inputs.size()); // state[node] = (old_out, new_out) // (handle tuple output) @@ -362,7 +362,7 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj return Tuple(fields); } else { auto rnode = make_object>(); - CHECK_EQ(new_out.size(), 1); + ICHECK_EQ(new_out.size(), 1); rnode->value = Call(new_call->op, transformed_args, new_call->attrs); rnode->old_layout = old_out[0]; rnode->new_layout = new_out[0]; diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc index 105aed3614cd..cb3ba0030a5b 100644 --- a/src/relay/transforms/type_infer.cc +++ b/src/relay/transforms/type_infer.cc @@ -60,15 +60,15 @@ struct TupleGetItemAttrs : public tvm::AttrsNode { bool TupleGetItemRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); + ICHECK_EQ(types.size(), 2); if (types[0].as()) return false; const auto* data = types[0].as(); - CHECK(data != nullptr) << "TupleGetItem expect input type to be TupleType " - << " get " << types[0] << " instead"; + ICHECK(data != nullptr) << "TupleGetItem expect input type to be TupleType " + << " get " << types[0] << " instead"; const auto* param = attrs.as(); - CHECK(param != nullptr); - CHECK_GE(param->index, 0); - CHECK_LT(param->index, data->fields.size()); + ICHECK(param != nullptr); + ICHECK_GE(param->index, 0); + ICHECK_LT(param->index, data->fields.size()); reporter->Assign(types[1], data->fields[param->index]); return true; } @@ -149,7 +149,7 @@ class TypeInferencer : private ExprFunctor, return it->second.checked_type; } Type ret = this->VisitExpr(expr); - CHECK(ret.defined()); + ICHECK(ret.defined()); KindCheck(ret, mod_, this->diag_ctx); ResolvedTypeInfo& rti = type_map_[expr]; rti.checked_type = ret; @@ -202,8 +202,8 @@ class TypeInferencer : private ExprFunctor, } void VisitPattern_(const PatternConstructorNode* con, const Type& t) { - CHECK(mod_.defined()) << "Cannot do type inference without a environment:" - << con->constructor->name_hint; + ICHECK(mod_.defined()) << 
"Cannot do type inference without a environment:" + << con->constructor->name_hint; TypeData td = mod_->type_definitions.at(con->constructor->belong_to); auto pc = GetRef(con); @@ -264,7 +264,7 @@ class TypeInferencer : private ExprFunctor, if (!tt) { this->EmitFatal(Diagnostic::Error(pt->span) << "Expected a tuple type, got " << unified); } - CHECK(tup->patterns.size() == tt->fields.size()) << "not enough pattern"; + ICHECK(tup->patterns.size() == tt->fields.size()) << "not enough pattern"; for (size_t i = 0; i < tup->patterns.size(); ++i) { VisitPattern(tup->patterns[i], tt->fields[i]); } @@ -325,7 +325,7 @@ class TypeInferencer : private ExprFunctor, Type vtype = GetType(let->value); let_type = Unify(let_type, vtype, let->span); - CHECK(is_functional_literal || !type_map_.count(let->var)); + ICHECK(is_functional_literal || !type_map_.count(let->var)); // NOTE: no scoping is necessary because var are unique in program type_map_[let->var].checked_type = let_type; return GetType(let->body); @@ -368,7 +368,7 @@ class TypeInferencer : private ExprFunctor, // Build a subsitituion map up from the function type and type arguments. // Eventually allow the type vars to be passed in. - CHECK(fn_ty->type_params.size() == ty_args.size()) + ICHECK(fn_ty->type_params.size() == ty_args.size()) << "number of type parameters does not match expected"; for (size_t i = 0; i < ty_args.size(); ++i) { subst_map.Set(fn_ty->type_params[i], ty_args[i]); @@ -408,7 +408,7 @@ class TypeInferencer : private ExprFunctor, if (type_info == type_map_.end()) { type_map_.insert({expr, ResolvedTypeInfo(Type(), type_args)}); } else { - CHECK(!type_info->second.type_args.defined()); + ICHECK(!type_info->second.type_args.defined()); type_info->second.type_args = type_args; } } @@ -511,7 +511,7 @@ class TypeInferencer : private ExprFunctor, if (f->ret_type.defined()) { rtype = this->Unify(f->ret_type, rtype, GetRef(f)->span); } - CHECK(rtype.defined()); + ICHECK(rtype.defined()); auto ret = FuncType(arg_types, rtype, f->type_params, {}); return solver_.Resolve(ret); } @@ -532,7 +532,7 @@ class TypeInferencer : private ExprFunctor, } Type VisitExpr_(const ConstructorNode* c) final { - CHECK(mod_.defined()) << "Cannot do type inference without a environment:" << c->name_hint; + ICHECK(mod_.defined()) << "Cannot do type inference without a environment:" << c->name_hint; TypeData td = mod_->LookupTypeDef(c->belong_to); std::vector types; for (const auto& t : td->type_vars) { @@ -595,7 +595,7 @@ class TypeInferencer::Resolver : public ExprMutator, PatternMutator { template Expr AttachCheckedType(const T* op) { auto it = tmap_.find(GetRef(op)); - CHECK(it != tmap_.end()); + ICHECK(it != tmap_.end()); Type checked_type = solver_->Resolve(it->second.checked_type); if (checked_type.as() != nullptr) { @@ -664,7 +664,7 @@ class TypeInferencer::Resolver : public ExprMutator, PatternMutator { } if (need_update_fn) { auto* fn_type = checked_type.as(); - CHECK(fn_type != nullptr); + ICHECK(fn_type != nullptr); new_fn->ret_type = fn_type->ret_type; } return new_e; @@ -713,7 +713,7 @@ struct AllCheckTypePopulated : ExprVisitor { if (e.as()) { return; } - CHECK(e->checked_type_.defined()) << "Expression: " << e; + ICHECK(e->checked_type_.defined()) << "Expression: " << e; return ExprVisitor::VisitExpr(e); } }; @@ -788,7 +788,7 @@ Pass InferType() { } auto free_tvars = FreeTypeVars(updated_func, mod); - CHECK(free_tvars.size() == 0) + ICHECK(free_tvars.size() == 0) << "Found unbound type variables in " << updated_func << ": " << free_tvars; 
EnsureCheckedType(updated_func); updates.push_back({it.first, Downcast(updated_func)}); diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 9222cebe3a4d..4d10a72e2391 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -44,47 +44,47 @@ namespace runtime { std::string GetCustomTypeName(uint8_t type_code) { auto f = tvm::runtime::Registry::Get("runtime._datatype_get_type_name"); - CHECK(f) << "Function runtime._datatype_get_type_name not found"; + ICHECK(f) << "Function runtime._datatype_get_type_name not found"; return (*f)(type_code).operator std::string(); } uint8_t GetCustomTypeCode(const std::string& type_name) { auto f = tvm::runtime::Registry::Get("runtime._datatype_get_type_code"); - CHECK(f) << "Function runtime._datatype_get_type_code not found"; + ICHECK(f) << "Function runtime._datatype_get_type_code not found"; return (*f)(type_name).operator int(); } bool GetCustomTypeRegistered(uint8_t type_code) { auto f = tvm::runtime::Registry::Get("runtime._datatype_get_type_registered"); - CHECK(f) << "Function runtime._datatype_get_type_registered not found"; + ICHECK(f) << "Function runtime._datatype_get_type_registered not found"; return (*f)(type_code).operator bool(); } uint8_t ParseCustomDatatype(const std::string& s, const char** scan) { - CHECK(s.substr(0, 6) == "custom") << "Not a valid custom datatype string"; + ICHECK(s.substr(0, 6) == "custom") << "Not a valid custom datatype string"; auto tmp = s.c_str(); - CHECK(s.c_str() == tmp); + ICHECK(s.c_str() == tmp); *scan = s.c_str() + 6; - CHECK(s.c_str() == tmp); + ICHECK(s.c_str() == tmp); if (**scan != '[') LOG(FATAL) << "expected opening brace after 'custom' type in" << s; - CHECK(s.c_str() == tmp); + ICHECK(s.c_str() == tmp); *scan += 1; - CHECK(s.c_str() == tmp); + ICHECK(s.c_str() == tmp); size_t custom_name_len = 0; - CHECK(s.c_str() == tmp); + ICHECK(s.c_str() == tmp); while (*scan + custom_name_len <= s.c_str() + s.length() && *(*scan + custom_name_len) != ']') ++custom_name_len; - CHECK(s.c_str() == tmp); + ICHECK(s.c_str() == tmp); if (*(*scan + custom_name_len) != ']') LOG(FATAL) << "expected closing brace after 'custom' type in" << s; - CHECK(s.c_str() == tmp); + ICHECK(s.c_str() == tmp); *scan += custom_name_len + 1; - CHECK(s.c_str() == tmp); + ICHECK(s.c_str() == tmp); auto type_name = s.substr(7, custom_name_len); - CHECK(s.c_str() == tmp); + ICHECK(s.c_str() == tmp); return GetCustomTypeCode(type_name); } @@ -129,7 +129,7 @@ class DeviceAPIManager { std::string factory = "device_api." + name; auto* f = Registry::Get(factory); if (f == nullptr) { - CHECK(allow_missing) << "Device API " << name << " is not enabled."; + ICHECK(allow_missing) << "Device API " << name << " is not enabled."; return nullptr; } void* ptr = (*f)(); @@ -177,7 +177,7 @@ void DeviceAPI::SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, /*! * \brief Normalize error message * - * Parse them header generated by by LOG(FATAL) and CHECK + * Parse them header generated by by LOG(FATAL) and ICHECK * and reformat the message into the standard format. 
* * This function will also merge all the stack traces into @@ -452,7 +452,7 @@ int TVMFuncCall(TVMFunctionHandle func, TVMValue* args, int* arg_type_codes, int int TVMCFuncSetReturn(TVMRetValueHandle ret, TVMValue* value, int* type_code, int num_ret) { API_BEGIN(); - CHECK_EQ(num_ret, 1); + ICHECK_EQ(num_ret, 1); TVMRetValue* rv = static_cast(ret); *rv = TVMArgValue(value[0], type_code[0]); API_END(); diff --git a/src/runtime/container.cc b/src/runtime/container.cc index 253243271d93..916a912b3c5e 100644 --- a/src/runtime/container.cc +++ b/src/runtime/container.cc @@ -45,7 +45,7 @@ TVM_REGISTER_GLOBAL("runtime.GetADTFields").set_body([](TVMArgs args, TVMRetValu ObjectRef obj = args[0]; int idx = args[1]; const auto& adt = Downcast(obj); - CHECK_LT(idx, adt.size()); + ICHECK_LT(idx, adt.size()); *rv = adt[idx]; }); diff --git a/src/runtime/contrib/arm_compute_lib/acl_allocator.cc b/src/runtime/contrib/arm_compute_lib/acl_allocator.cc index 2feb5b03c88b..f9a67010e6e2 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_allocator.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_allocator.cc @@ -29,7 +29,7 @@ namespace runtime { namespace contrib { void* ACLAllocator::allocate(size_t size, size_t alignment) { - CHECK_GT(size, 0) << "Cannot allocate size less than or equal to zero"; + ICHECK_GT(size, 0) << "Cannot allocate size less than or equal to zero"; return this->device_api_->AllocWorkspace(this->ctx_, size, {}); } diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index e5f2c2d47281..09879bdc6e95 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -75,7 +75,7 @@ class ACLRuntime : public JSONRuntimeBase { * \param consts The constant params from compiled model. 
*/ void Init(const Array& consts) override { - CHECK_EQ(consts.size(), const_idx_.size()) + ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; SetupConstants(consts); BuildEngine(); @@ -222,7 +222,7 @@ class ACLRuntime : public JSONRuntimeBase { arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides); int groups = std::stoi(node.GetAttr>("groups")[0]); - CHECK(groups == 1) << "Arm Compute Library NEON convolution only supports group size of 1."; + ICHECK(groups == 1) << "Arm Compute Library NEON convolution only supports group size of 1."; arm_compute::ActivationLayerInfo act_info; if (node.HasAttr("activation_type")) { @@ -242,7 +242,7 @@ class ACLRuntime : public JSONRuntimeBase { size_t num_inputs = inputs.size(); bool has_bias; if (node.GetOpName() == "qnn.conv2d") { - CHECK(num_inputs >= 8U && num_inputs <= 9U) + ICHECK(num_inputs >= 8U && num_inputs <= 9U) << "Quantized convolution requires 9 inputs with a bias, 8 inputs without."; has_bias = num_inputs == 9; layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2])); @@ -253,7 +253,7 @@ class ACLRuntime : public JSONRuntimeBase { layer->outputs.push_back( MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias])); } else { - CHECK(num_inputs >= 2U && num_inputs <= 3U) + ICHECK(num_inputs >= 2U && num_inputs <= 3U) << "Convolution requires 3 inputs with a bias, 2 inputs without."; has_bias = num_inputs == 3; for (const auto& i : inputs) { @@ -286,7 +286,7 @@ class ACLRuntime : public JSONRuntimeBase { size_t num_inputs = inputs.size(); bool has_bias; if (node.GetOpName() == "qnn.dense") { - CHECK(num_inputs >= 8U && num_inputs <= 9U) + ICHECK(num_inputs >= 8U && num_inputs <= 9U) << "Quantized fully connected (dense) layer requires 9 inputs with a bias, 8 inputs " "without."; has_bias = num_inputs == 9; @@ -298,7 +298,7 @@ class ACLRuntime : public JSONRuntimeBase { layer->outputs.push_back( MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias])); } else { - CHECK(num_inputs >= 2U && num_inputs <= 3U) + ICHECK(num_inputs >= 2U && num_inputs <= 3U) << "Fully connected (dense) layer requires 3 inputs with a bias, 2 inputs without."; has_bias = num_inputs == 3; for (const auto& i : inputs) { diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc index 59c941df5195..0b6d27623a1a 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc @@ -35,7 +35,8 @@ namespace contrib { using JSONGraphNode = tvm::runtime::json::JSONGraphNode; void CheckACLError(const arm_compute::Status& status) { - CHECK(status.error_code() == arm_compute::ErrorCode::OK) << "ACL: " << status.error_description(); + ICHECK(status.error_code() == arm_compute::ErrorCode::OK) + << "ACL: " << status.error_description(); } arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data, @@ -65,7 +66,7 @@ arm_compute::TensorInfo MakeACLTensorInfo(const std::vector& shape, if (scale != nullptr && offset != nullptr) { std::vector scale_data = GetVectorFromDLTensor(scale); std::vector offset_data = GetVectorFromDLTensor(offset); - CHECK(scale_data.size() == 1 && offset_data.size() == 1) + ICHECK(scale_data.size() == 1 && offset_data.size() == 1) << "Currently only per-layer quantization is supported in the Arm Compute Library runtime."; arm_compute::QuantizationInfo qinfo(scale_data[0], 
offset_data[0]); info.set_quantization_info(qinfo); @@ -134,7 +135,7 @@ arm_compute::DataType MakeACLDataType(const DLDataType& data_type) { template std::vector GetVectorFromDLTensor(const DLTensor* tensor) { - CHECK(tensor) << "Cannot convert a nullptr"; + ICHECK(tensor) << "Cannot convert a nullptr"; int len = 1; for (int i = 0; i < tensor->ndim; i++) { len *= tensor->shape[i]; diff --git a/src/runtime/contrib/cblas/cblas.cc b/src/runtime/contrib/cblas/cblas.cc index 80d39f6efa9c..16496e06aae3 100644 --- a/src/runtime/contrib/cblas/cblas.cc +++ b/src/runtime/contrib/cblas/cblas.cc @@ -20,9 +20,9 @@ /*! * \file Use external cblas library call. */ -#include #include #include +#include extern "C" { #include @@ -125,7 +125,7 @@ struct CblasDgemmBatchIterativeOp { // matrix multiplication for row major TVM_REGISTER_GLOBAL("tvm.contrib.cblas.matmul").set_body([](TVMArgs args, TVMRetValue* ret) { DLTensor* A = args[0]; - CHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); if (TypeMatch(A->dtype, kDLFloat, 32)) CallGemm(args, ret, CblasSgemmOp()); @@ -135,7 +135,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cblas.matmul").set_body([](TVMArgs args, TVMRet TVM_REGISTER_GLOBAL("tvm.contrib.cblas.batch_matmul").set_body([](TVMArgs args, TVMRetValue* ret) { DLTensor* A = args[0]; - CHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); if (TypeMatch(A->dtype, kDLFloat, 32)) { CallBatchGemm(args, ret, CblasSgemmBatchOp()); } else { @@ -146,7 +146,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cblas.batch_matmul").set_body([](TVMArgs args, TVM_REGISTER_GLOBAL("tvm.contrib.cblas.batch_matmul_iterative") .set_body([](TVMArgs args, TVMRetValue* ret) { DLTensor* A = args[0]; - CHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); if (TypeMatch(A->dtype, kDLFloat, 32)) { CallBatchGemm(args, ret, CblasSgemmBatchIterativeOp()); } else { diff --git a/src/runtime/contrib/cblas/gemm_common.h b/src/runtime/contrib/cblas/gemm_common.h index d92f9d710a44..6c31fbdd06a3 100644 --- a/src/runtime/contrib/cblas/gemm_common.h +++ b/src/runtime/contrib/cblas/gemm_common.h @@ -71,23 +71,23 @@ inline void CallGemm(TVMArgs args, TVMRetValue* ret, TGemmOp op) { bool transa = args[3]; bool transb = args[4]; int bit_depth = sizeof(typename TGemmOp::TDatatype) * 8; - CHECK_EQ(A->ndim, 2); - CHECK_EQ(B->ndim, 2); - CHECK_EQ(C->ndim, 2); + ICHECK_EQ(A->ndim, 2); + ICHECK_EQ(B->ndim, 2); + ICHECK_EQ(C->ndim, 2); - CHECK_EQ(ElementStride(A), 1); - CHECK_EQ(ElementStride(B), 1); - CHECK_EQ(ElementStride(C), 1); + ICHECK_EQ(ElementStride(A), 1); + ICHECK_EQ(ElementStride(B), 1); + ICHECK_EQ(ElementStride(C), 1); // C can never be transposed. - CHECK(!IsInPlaceTransposed(C)); + ICHECK(!IsInPlaceTransposed(C)); // Reversed strides indicates an in-place transpose operation. transa = IsInPlaceTransposed(A) ? !transa : transa; transb = IsInPlaceTransposed(B) ? !transb : transb; - CHECK(TypeMatch(B->dtype, kDLFloat, bit_depth)); - CHECK(TypeMatch(C->dtype, kDLFloat, bit_depth)); + ICHECK(TypeMatch(B->dtype, kDLFloat, bit_depth)); + ICHECK(TypeMatch(C->dtype, kDLFloat, bit_depth)); double alpha = args.size() > 5 ? args[5] : 1.0; double beta = args.size() > 6 ? 
args[6] : 0.0; op(transb, transa, ColumnCount(B, transb), RowCount(A, transa), ColumnCount(A, transa), @@ -118,24 +118,24 @@ inline void CallU8S8S32Gemm(TVMArgs args, TVMRetValue* ret, TGemmOp op) { int offset_c[1]; offset_c[0] = 0; - CHECK_EQ(A->ndim, 2); - CHECK_EQ(B->ndim, 2); - CHECK_EQ(C->ndim, 2); + ICHECK_EQ(A->ndim, 2); + ICHECK_EQ(B->ndim, 2); + ICHECK_EQ(C->ndim, 2); - CHECK_EQ(ElementStride(A), 1); - CHECK_EQ(ElementStride(B), 1); - CHECK_EQ(ElementStride(C), 1); + ICHECK_EQ(ElementStride(A), 1); + ICHECK_EQ(ElementStride(B), 1); + ICHECK_EQ(ElementStride(C), 1); // C can never be transposed. - CHECK(!IsInPlaceTransposed(C)); + ICHECK(!IsInPlaceTransposed(C)); // Reversed strides indicates an in-place transpose operation. transa = IsInPlaceTransposed(A) ? !transa : transa; transb = IsInPlaceTransposed(B) ? !transb : transb; - CHECK(TypeMatch(A->dtype, kDLUInt, 8)); - CHECK(TypeMatch(B->dtype, kDLInt, 8)); - CHECK(TypeMatch(C->dtype, kDLInt, 32)); + ICHECK(TypeMatch(A->dtype, kDLUInt, 8)); + ICHECK(TypeMatch(B->dtype, kDLInt, 8)); + ICHECK(TypeMatch(C->dtype, kDLInt, 32)); double alpha = args.size() > 5 ? args[5] : 1.0; double beta = args.size() > 6 ? args[6] : 0.0; op(transb, transa, ColumnCount(B, transb), RowCount(A, transa), ColumnCount(A, transa), @@ -180,22 +180,22 @@ inline void CallBatchGemm(TVMArgs args, TVMRetValue* ret, TBatchGemmOp op) { bool transa = args[3]; bool transb = args[4]; int bit_depth = sizeof(DType) * 8; - CHECK_EQ(A->ndim, 3); - CHECK_EQ(B->ndim, 3); - CHECK_EQ(C->ndim, 3); + ICHECK_EQ(A->ndim, 3); + ICHECK_EQ(B->ndim, 3); + ICHECK_EQ(C->ndim, 3); int batch_size = BatchCount3D(A); - CHECK_EQ(BatchCount3D(B), batch_size); - CHECK_EQ(BatchCount3D(C), batch_size); - CHECK_EQ(ElementStride(A), 1); - CHECK_EQ(ElementStride(B), 1); - CHECK_EQ(ElementStride(C), 1); + ICHECK_EQ(BatchCount3D(B), batch_size); + ICHECK_EQ(BatchCount3D(C), batch_size); + ICHECK_EQ(ElementStride(A), 1); + ICHECK_EQ(ElementStride(B), 1); + ICHECK_EQ(ElementStride(C), 1); // C can never be transposed. - CHECK(!IsInPlaceTransposed3D(C)); + ICHECK(!IsInPlaceTransposed3D(C)); // Reversed strides indicates an in-place transpose operation. transa = IsInPlaceTransposed3D(A) ? !transa : transa; transb = IsInPlaceTransposed3D(B) ? !transb : transb; - CHECK(TypeMatch(B->dtype, kDLFloat, bit_depth)); - CHECK(TypeMatch(C->dtype, kDLFloat, bit_depth)); + ICHECK(TypeMatch(B->dtype, kDLFloat, bit_depth)); + ICHECK(TypeMatch(C->dtype, kDLFloat, bit_depth)); double alpha = args.size() > 5 ? args[5] : 1.0; double beta = args.size() > 6 ? args[6] : 0.0; const int A_size = A->shape[1] * A->shape[2]; diff --git a/src/runtime/contrib/cblas/mkl.cc b/src/runtime/contrib/cblas/mkl.cc index 14e2375a311e..273aa45367dd 100644 --- a/src/runtime/contrib/cblas/mkl.cc +++ b/src/runtime/contrib/cblas/mkl.cc @@ -20,9 +20,9 @@ /*! * \file Use external mkl library call. 
*/ -#include #include #include +#include extern "C" { #include @@ -156,7 +156,7 @@ struct MKLDgemmBatchIterativeOp { // matrix multiplication for row major TVM_REGISTER_GLOBAL("tvm.contrib.mkl.matmul").set_body([](TVMArgs args, TVMRetValue* ret) { DLTensor* A = args[0]; - CHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); if (TypeMatch(A->dtype, kDLFloat, 32)) CallGemm(args, ret, MKLSgemmOp()); @@ -169,15 +169,15 @@ TVM_REGISTER_GLOBAL("tvm.contrib.mkl.matmul_u8s8s32").set_body([](TVMArgs args, DLTensor* A = args[0]; DLTensor* B = args[1]; DLTensor* C = args[2]; - CHECK(TypeMatch(A->dtype, kDLUInt, 8) && TypeMatch(B->dtype, kDLInt, 8) && - TypeMatch(C->dtype, kDLInt, 32)); + ICHECK(TypeMatch(A->dtype, kDLUInt, 8) && TypeMatch(B->dtype, kDLInt, 8) && + TypeMatch(C->dtype, kDLInt, 32)); CallU8S8S32Gemm(args, ret, MKLGemmU8S8S32Op()); }); TVM_REGISTER_GLOBAL("tvm.contrib.mkl.batch_matmul").set_body([](TVMArgs args, TVMRetValue* ret) { DLTensor* A = args[0]; - CHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); if (TypeMatch(A->dtype, kDLFloat, 32)) { CallBatchGemm(args, ret, MKLSgemmBatchOp()); } else { @@ -188,7 +188,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.mkl.batch_matmul").set_body([](TVMArgs args, TV TVM_REGISTER_GLOBAL("tvm.contrib.mkl.batch_matmul_iterative") .set_body([](TVMArgs args, TVMRetValue* ret) { DLTensor* A = args[0]; - CHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64)); if (TypeMatch(A->dtype, kDLFloat, 32)) { CallBatchGemm(args, ret, MKLSgemmBatchIterativeOp()); } else { diff --git a/src/runtime/contrib/cblas/mkldnn.cc b/src/runtime/contrib/cblas/mkldnn.cc index 43c0dba595cc..1c3fa023dcc7 100644 --- a/src/runtime/contrib/cblas/mkldnn.cc +++ b/src/runtime/contrib/cblas/mkldnn.cc @@ -20,9 +20,9 @@ /*! * \file Use external cblas library call. */ -#include #include #include +#include extern "C" { #include @@ -48,7 +48,7 @@ struct MKLDNNSgemmOp { // matrix multiplication for row major TVM_REGISTER_GLOBAL("tvm.contrib.mkldnn.matmul").set_body([](TVMArgs args, TVMRetValue* ret) { DLTensor* A = args[0]; - CHECK(TypeMatch(A->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32)); CallGemm(args, ret, MKLDNNSgemmOp()); }); } // namespace contrib diff --git a/src/runtime/contrib/coreml/coreml_runtime.mm b/src/runtime/contrib/coreml/coreml_runtime.mm index fafc14a6898a..18d4f735a55e 100644 --- a/src/runtime/contrib/coreml/coreml_runtime.mm +++ b/src/runtime/contrib/coreml/coreml_runtime.mm @@ -59,7 +59,7 @@ MLMultiArray* dest = [[MLMultiArray alloc] initWithShape:shape dataType:dataType error:nil]; - CHECK(data_in->strides == NULL); + ICHECK(data_in->strides == NULL); memcpy(dest.dataPointer, data_in->data, size); NSString* nsKey = [NSString stringWithUTF8String:key.c_str()]; @@ -155,7 +155,8 @@ // Copy input tensors to corresponding data entries. 
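The hunk below guards the packed-argument loop with exactly this kind of tag check before touching any payload. For readers unfamiliar with the pattern, here is a toy sketch of tag-checked argument binding; the enum values and ToyPackedArg struct are invented stand-ins for TVM's packed-function machinery, not its real API:

#include <cstdlib>
#include <iostream>
#include <vector>

// Invented stand-ins for TVM's argument type codes (illustrative only).
enum ToyTypeCode { kToyInt = 0, kToyDLTensorHandle = 7, kToyNDArrayHandle = 13 };

struct ToyPackedArg {
  ToyTypeCode code;
  void* handle;
};

// Validate every input argument's tag before trusting its payload.
void BindInputs(const std::vector<ToyPackedArg>& args) {
  for (size_t i = 0; i + 1 < args.size(); ++i) {  // last arg is the output
    if (args[i].code != kToyDLTensorHandle && args[i].code != kToyNDArrayHandle) {
      std::cerr << "Expect NDArray or DLTensor as inputs\n";
      std::abort();
    }
    // ... hand args[i].handle to the backend as input i ...
  }
}

int main() {
  ToyPackedArg in{kToyDLTensorHandle, nullptr}, out{kToyNDArrayHandle, nullptr};
  BindInputs({in, out});
  return 0;
}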
for (auto i = 0; i < args.size() - 1; ++i) { - CHECK(args[i].type_code() == kTVMDLTensorHandle || args[i].type_code() == kTVMNDArrayHandle) + ICHECK(args[i].type_code() == kTVMDLTensorHandle || + args[i].type_code() == kTVMNDArrayHandle) << "Expect NDArray or DLTensor as inputs\n"; if (args[i].type_code() == kTVMDLTensorHandle) { model_->SetInput([input_names[i] UTF8String], args[i]); @@ -238,7 +239,7 @@ Module CoreMLRuntimeLoadFromBinary(void* strm) { NSString* model_path = [tempDir stringByAppendingPathComponent:dirname]; NSURL* url = [NSURL fileURLWithPath:model_path]; BOOL res = [dirWrapper writeToURL:url options:0 originalContentsURL:nil error:nil]; - CHECK(res) << "Failed to create model directory " << [model_path UTF8String]; + ICHECK(res) << "Failed to create model directory " << [model_path UTF8String]; auto exec = make_object(); exec->Init(symbol, [model_path UTF8String]); diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc index 59367d17405d..ce69d4ca7bde 100644 --- a/src/runtime/contrib/cublas/cublas.cc +++ b/src/runtime/contrib/cublas/cublas.cc @@ -20,9 +20,9 @@ /*! * \file Use external cblas library call. */ -#include #include #include +#include #include "../cblas/gemm_common.h" #include "cublas_utils.h" @@ -152,19 +152,19 @@ inline void CallLtIgemm(TVMArgs args, TVMRetValue* ret, cublasLtHandle_t hdl) { int lda = M * K / (roundoff(K, 32) / 32); int ldb = K * N / (roundoff(K, 32) / 32); int ldc = M * N_out / (roundoff(N_out, 32) / 32); - CHECK_EQ(A->ndim, 2); - CHECK_EQ(B->ndim, 2); - CHECK_EQ(C->ndim, 2); + ICHECK_EQ(A->ndim, 2); + ICHECK_EQ(B->ndim, 2); + ICHECK_EQ(C->ndim, 2); - CHECK_EQ(ElementStride(A), 1); - CHECK_EQ(ElementStride(B), 1); - CHECK_EQ(ElementStride(C), 1); + ICHECK_EQ(ElementStride(A), 1); + ICHECK_EQ(ElementStride(B), 1); + ICHECK_EQ(ElementStride(C), 1); - CHECK(TypeEqual(A->dtype, B->dtype)); - CHECK(TypeMatch(A->dtype, kDLInt, 8)); - CHECK(TypeMatch(C->dtype, kDLInt, 32)); + ICHECK(TypeEqual(A->dtype, B->dtype)); + ICHECK(TypeMatch(A->dtype, kDLInt, 8)); + ICHECK(TypeMatch(C->dtype, kDLInt, 32)); - CHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type"; + ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type"; int32_t alpha = args.size() > 5 ? args[5] : 1; int32_t beta = args.size() > 6 ? args[6] : 0; cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; @@ -214,27 +214,27 @@ inline void CallGemmEx(TVMArgs args, TVMRetValue* ret, cublasHandle_t hdl) { DLTensor* C = args[2]; bool transa = args[3]; bool transb = args[4]; - CHECK_EQ(A->ndim, 2); - CHECK_EQ(B->ndim, 2); - CHECK_EQ(C->ndim, 2); + ICHECK_EQ(A->ndim, 2); + ICHECK_EQ(B->ndim, 2); + ICHECK_EQ(C->ndim, 2); - CHECK_EQ(ElementStride(A), 1); - CHECK_EQ(ElementStride(B), 1); - CHECK_EQ(ElementStride(C), 1); + ICHECK_EQ(ElementStride(A), 1); + ICHECK_EQ(ElementStride(B), 1); + ICHECK_EQ(ElementStride(C), 1); - CHECK(TypeEqual(A->dtype, B->dtype)); + ICHECK(TypeEqual(A->dtype, B->dtype)); // C can never be transposed. - CHECK(!IsInPlaceTransposed(C)); + ICHECK(!IsInPlaceTransposed(C)); // Reversed strides indicates an in-place transpose operation. transa = IsInPlaceTransposed(A) ? !transa : transa; transb = IsInPlaceTransposed(B) ? 
!transb : transb; - CHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type"; - CHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0) + ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type"; + ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0) << "leading dimension must divide 4 for int8 gemm"; - CHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0) + ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0) << "leading dimension must divide 4 for int8 gemm"; double alpha = args.size() > 5 ? args[5] : 1.0; double beta = args.size() > 6 ? args[6] : 0.0; @@ -272,29 +272,29 @@ inline void CallBatchGemmEx(TVMArgs args, TVMRetValue* ret, cublasHandle_t hdl) DLTensor* C = args[2]; bool transa = args[3]; bool transb = args[4]; - CHECK_EQ(A->ndim, 3); - CHECK_EQ(B->ndim, 3); - CHECK_EQ(C->ndim, 3); + ICHECK_EQ(A->ndim, 3); + ICHECK_EQ(B->ndim, 3); + ICHECK_EQ(C->ndim, 3); int batch_size = BatchCount3D(A); - CHECK_EQ(BatchCount3D(B), batch_size); - CHECK_EQ(BatchCount3D(C), batch_size); - CHECK_EQ(ElementStride(A), 1); - CHECK_EQ(ElementStride(B), 1); - CHECK_EQ(ElementStride(C), 1); + ICHECK_EQ(BatchCount3D(B), batch_size); + ICHECK_EQ(BatchCount3D(C), batch_size); + ICHECK_EQ(ElementStride(A), 1); + ICHECK_EQ(ElementStride(B), 1); + ICHECK_EQ(ElementStride(C), 1); - CHECK(TypeEqual(A->dtype, B->dtype)); + ICHECK(TypeEqual(A->dtype, B->dtype)); // C can never be transposed. - CHECK(!IsInPlaceTransposed(C)); + ICHECK(!IsInPlaceTransposed(C)); // Reversed strides indicates an in-place transpose operation. transa = IsInPlaceTransposed(A) ? !transa : transa; transb = IsInPlaceTransposed(B) ? !transb : transb; - CHECK(CheckMixPrecisionType(A->dtype, C->dtype, false)) << "Unsupported data type"; - CHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0) + ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, false)) << "Unsupported data type"; + ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0) << "leading dimension must divide 4 for int8 gemm"; - CHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0) + ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0) << "leading dimension must divide 4 for int8 gemm"; double alpha = args.size() > 5 ? args[5] : 1.0; double beta = args.size() > 6 ? 
args[6] : 0.0; @@ -339,8 +339,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cublas.matmul").set_body([](TVMArgs args, TVMRe CUBLASTryEnableTensorCore(entry_ptr->handle); if (TypeEqual(A->dtype, C->dtype)) { - CHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) || - TypeMatch(A->dtype, kDLFloat, 64)); + ICHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) || + TypeMatch(A->dtype, kDLFloat, 64)); if (TypeMatch(A->dtype, kDLFloat, 16)) CallGemm(args, ret, CublasHgemmOp(entry_ptr->handle)); @@ -361,7 +361,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cublaslt.matmul").set_body([](TVMArgs args, TVM CUBLASTryEnableTensorCore(entry_ptr->handle); - CHECK(TypeMatch(A->dtype, kDLInt, 8)) << "Expects dtype to be int8\n"; + ICHECK(TypeMatch(A->dtype, kDLInt, 8)) << "Expects dtype to be int8\n"; cublasLtHandle_t ltHandle; CHECK_CUBLAS_ERROR(cublasLtCreate(<Handle)); CallLtIgemm(args, ret, ltHandle); @@ -377,8 +377,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cublas.batch_matmul").set_body([](TVMArgs args, CUBLASTryEnableTensorCore(entry_ptr->handle); if (TypeEqual(A->dtype, C->dtype)) { - CHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) || - TypeMatch(A->dtype, kDLFloat, 64)); + ICHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) || + TypeMatch(A->dtype, kDLFloat, 64)); if (TypeMatch(A->dtype, kDLFloat, 16)) CallBatchGemm(args, ret, CublasHgemmBatchOp(entry_ptr->handle)); diff --git a/src/runtime/contrib/cublas/cublas_utils.h b/src/runtime/contrib/cublas/cublas_utils.h index 5189c4f483a8..32c3b03ddbb0 100644 --- a/src/runtime/contrib/cublas/cublas_utils.h +++ b/src/runtime/contrib/cublas/cublas_utils.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #if CUDART_VERSION >= 10010 @@ -63,10 +63,10 @@ inline const char* GetCublasErrorString(int error) { } #ifndef CHECK_CUBLAS_ERROR -#define CHECK_CUBLAS_ERROR(fn) \ - do { \ - int error = static_cast(fn); \ - CHECK_EQ(error, CUBLAS_STATUS_SUCCESS) << "CUBLAS: " << GetCublasErrorString(error); \ +#define CHECK_CUBLAS_ERROR(fn) \ + do { \ + int error = static_cast(fn); \ + ICHECK_EQ(error, CUBLAS_STATUS_SUCCESS) << "CUBLAS: " << GetCublasErrorString(error); \ } while (0) // ; intentionally left off. #endif // CHECK_CUBLAS_ERROR diff --git a/src/runtime/contrib/cudnn/cudnn_utils.h b/src/runtime/contrib/cudnn/cudnn_utils.h index 1b4eb40f193f..528298b75187 100644 --- a/src/runtime/contrib/cudnn/cudnn_utils.h +++ b/src/runtime/contrib/cudnn/cudnn_utils.h @@ -25,18 +25,18 @@ #define TVM_RUNTIME_CONTRIB_CUDNN_CUDNN_UTILS_H_ #include -#include #include +#include #include "../../cuda/cuda_common.h" namespace tvm { namespace contrib { -#define CUDNN_CALL(func) \ - { \ - cudnnStatus_t e = (func); \ - CHECK_EQ(e, CUDNN_STATUS_SUCCESS) << "cuDNN: " << cudnnGetErrorString(e); \ +#define CUDNN_CALL(func) \ + { \ + cudnnStatus_t e = (func); \ + ICHECK_EQ(e, CUDNN_STATUS_SUCCESS) << "cuDNN: " << cudnnGetErrorString(e); \ } /*! 
breif Convert DLTensor type to CuDNN type */ diff --git a/src/runtime/contrib/cudnn/softmax.cc b/src/runtime/contrib/cudnn/softmax.cc index ff6d6a1dbd81..648c9b633ea4 100644 --- a/src/runtime/contrib/cudnn/softmax.cc +++ b/src/runtime/contrib/cudnn/softmax.cc @@ -39,7 +39,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.softmax.forward") int ndim = x->ndim; int64_t* shape = x->shape; if (axis < 0) axis += ndim; - CHECK(axis >= 0 && axis < ndim); + ICHECK(axis >= 0 && axis < ndim); CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(); entry_ptr->softmax_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype); diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index bda9f1a44932..eef67a702d9c 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -54,7 +54,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { void Init(const Array& consts) override { BuildEngine(); - CHECK_EQ(consts.size(), const_idx_.size()) + ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; // Setup constants entries for weights. @@ -98,7 +98,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { for (size_t nid = 0; nid < nodes_.size(); ++nid) { const auto& node = nodes_[nid]; if (node.GetOpType() == "kernel") { - CHECK_EQ(node.GetOpType(), "kernel"); + ICHECK_EQ(node.GetOpType(), "kernel"); auto op_name = node.GetOpName(); if ("nn.conv2d" == op_name) { Conv2d(nid); @@ -137,12 +137,12 @@ class DNNLJSONRuntime : public JSONRuntimeBase { auto eid = EntryID(entry); // Since the DNNL memory has been created before calling this function, we assume the entry // has not yet been bound to the other DNNL memory; otherwise it may have memory leak. - CHECK_EQ(entry_out_mem_.count(eid), 0); + ICHECK_EQ(entry_out_mem_.count(eid), 0); // TODO(@comanic): Support other data types (i.e., int8). auto data_node = nodes_[entry.id_]; auto dltype = data_node.GetOpDataType()[entry.index_]; - CHECK_EQ(dltype.bits, 32); + ICHECK_EQ(dltype.bits, 32); entry_out_mem_[eid] = {mem, offset}; return entry_out_mem_[eid].first; @@ -214,11 +214,11 @@ class DNNLJSONRuntime : public JSONRuntimeBase { net_.push_back(conv); // Data memory. - CHECK_EQ(node.GetAttr>("data_layout")[0], "NCHW"); + ICHECK_EQ(node.GetAttr>("data_layout")[0], "NCHW"); auto conv2d_src_memory = BindDNNLMemory(data_entry, {src_dims, dt::f32, tag::nchw}); // Weight memory. - CHECK_EQ(node.GetAttr>("kernel_layout")[0], "OIHW"); + ICHECK_EQ(node.GetAttr>("kernel_layout")[0], "OIHW"); auto conv2d_weights_memory = BindDNNLMemory( weight_entry, {weights_dims, dt::f32, (groups > 1) ? 
tag::goihw : tag::oihw}); @@ -343,7 +343,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { auto relu_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_relu, data_md, 0); auto relu_prim_desc = dnnl::eltwise_forward::primitive_desc(relu_desc, engine_); - CHECK(data_md == relu_prim_desc.dst_desc()); + ICHECK(data_md == relu_prim_desc.dst_desc()); auto relu = dnnl::eltwise_forward(relu_prim_desc); net_.push_back(relu); @@ -364,7 +364,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { std::vector data_mds; std::vector data_memories; - CHECK_EQ(node.GetInputs().size(), 2U); + ICHECK_EQ(node.GetInputs().size(), 2U); for (auto entry : node.GetInputs()) { auto data_shape = nodes_[entry.id_].GetOpShape()[entry.index_]; dnnl::memory::desc data_md = GenDNNLMemDescByShape(data_shape, dt::f32); @@ -373,7 +373,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { data_mds.push_back(data_md); data_memories.push_back(BindDNNLMemory(entry, data_md)); } - CHECK(data_dims[0] == data_dims[1]); + ICHECK(data_dims[0] == data_dims[1]); auto out_md = data_mds[0]; JSONGraphNodeEntry out_entry(nid, 0); auto out_memory = BindDNNLMemory(out_entry, out_md); diff --git a/src/runtime/contrib/json/json_node.h b/src/runtime/contrib/json/json_node.h index 6a07129bf006..77c289b04c6d 100644 --- a/src/runtime/contrib/json/json_node.h +++ b/src/runtime/contrib/json/json_node.h @@ -73,13 +73,13 @@ class JSONGraphNodeEntry { */ void Load(dmlc::JSONReader* reader) { reader->BeginArray(); - CHECK(reader->NextArrayItem()) << "invalid json format"; + ICHECK(reader->NextArrayItem()) << "invalid json format"; reader->Read(&id_); - CHECK(reader->NextArrayItem()) << "invalid json format"; + ICHECK(reader->NextArrayItem()) << "invalid json format"; reader->Read(&index_); if (reader->NextArrayItem()) { reader->Read(&version_); - CHECK(!reader->NextArrayItem()) << "invalid json format"; + ICHECK(!reader->NextArrayItem()) << "invalid json format"; } else { version_ = 0; } @@ -145,27 +145,27 @@ class JSONGraphNode { } else if (key == "dtype") { std::vector tmp; reader->BeginArray(); - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); reader->Read(&tmp); - CHECK(!reader->NextArrayItem()); + ICHECK(!reader->NextArrayItem()); for (const auto& it : tmp) { dtype_.push_back(tvm::runtime::String2DLDataType(it)); } } else if (key == "shape") { reader->BeginArray(); - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); reader->Read(&shape_); - CHECK(!reader->NextArrayItem()); + ICHECK(!reader->NextArrayItem()); } else { reader->BeginArray(); - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); std::vector tmp; reader->Read(&tmp); attrs_[key] = tmp; - CHECK(!reader->NextArrayItem()); + ICHECK(!reader->NextArrayItem()); } } - CHECK_EQ(shape_.size(), dtype_.size()); + ICHECK_EQ(shape_.size(), dtype_.size()); } /*! 
@@ -256,7 +256,7 @@ class JSONGraphNode { */ template T GetAttr(const std::string& key) const { - CHECK_GT(attrs_.count(key), 0U) << "Key: " << key << "is not found"; + ICHECK_GT(attrs_.count(key), 0U) << "Key: " << key << "is not found"; return dmlc::get(attrs_.at(key)); } diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h index 9eb7fcd2f689..3ae652ccaf24 100644 --- a/src/runtime/contrib/json/json_runtime.h +++ b/src/runtime/contrib/json/json_runtime.h @@ -78,7 +78,7 @@ class JSONRuntimeBase : public ModuleNode { [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->const_names_; }); } else if (this->symbol_name_ == name) { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK(this->initialized_) << "The module has not been initialized"; + ICHECK(this->initialized_) << "The module has not been initialized"; // Bind argument tensors to data entries. this->SetInputOutputBuffers(args); @@ -88,7 +88,7 @@ class JSONRuntimeBase : public ModuleNode { } else if ("__init_" + this->symbol_name_ == name) { // The function to initialize constant tensors. return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.size(), 1U); + ICHECK_EQ(args.size(), 1U); this->Init(args[0]); this->initialized_ = true; *rv = 0; @@ -119,9 +119,9 @@ class JSONRuntimeBase : public ModuleNode { std::string graph_json; std::vector consts; // Load the symbol - CHECK(stream->Read(&symbol)) << "Loading symbol name failed"; - CHECK(stream->Read(&graph_json)) << "Loading graph json failed"; - CHECK(stream->Read(&consts)) << "Loading the const name list failed"; + ICHECK(stream->Read(&symbol)) << "Loading symbol name failed"; + ICHECK(stream->Read(&graph_json)) << "Loading graph json failed"; + ICHECK(stream->Read(&consts)) << "Loading the const name list failed"; Array const_names; for (const auto& it : consts) { const_names.push_back(it); @@ -146,13 +146,13 @@ class JSONRuntimeBase : public ModuleNode { * \param args The packed args. */ void SetInputOutputBuffers(const TVMArgs& args) { - CHECK_EQ(args.size(), input_var_eid_.size() + outputs_.size()) + ICHECK_EQ(args.size(), input_var_eid_.size() + outputs_.size()) << "Found mismatch in the number of provided data entryies and required."; for (size_t i = 0; i < static_cast(args.size()); i++) { auto eid = i < input_var_eid_.size() ? 
input_var_eid_[i] : EntryID(outputs_[i - input_var_eid_.size()]); - CHECK(args[i].type_code() == kTVMNDArrayHandle || args[i].type_code() == kTVMDLTensorHandle) + ICHECK(args[i].type_code() == kTVMNDArrayHandle || args[i].type_code() == kTVMDLTensorHandle) << "Expect NDArray or DLTensor as inputs"; const DLTensor* arg; @@ -183,23 +183,23 @@ class JSONRuntimeBase : public ModuleNode { uint32_t nid = input_nodes_[i]; std::string name = nodes_[nid].name_; if (nodes_[nid].op_type_ == "input") { - CHECK_EQ(nodes_[nid].GetOpShape().size(), nodes_[nid].GetOpDataType().size()); + ICHECK_EQ(nodes_[nid].GetOpShape().size(), nodes_[nid].GetOpDataType().size()); for (size_t j = 0; j < nodes_[nid].GetOpShape().size(); ++j) { input_var_eid_.push_back(EntryID(nid, j)); } } else { - CHECK_EQ(nodes_[nid].op_type_, "const"); + ICHECK_EQ(nodes_[nid].op_type_, "const"); auto pos = std::find(std::begin(const_names_), std::end(const_names_), name); - CHECK(pos != std::end(const_names_)) << "Found non-existent constant: " << name; + ICHECK(pos != std::end(const_names_)) << "Found non-existent constant: " << name; const_idx_.push_back(nid); consts.push_back(name); } } - CHECK_EQ(consts.size(), const_names_.size()) + ICHECK_EQ(consts.size(), const_names_.size()) << "Found mismatch for the number of constants in the graph and required."; for (size_t i = 0; i < consts.size(); i++) { - CHECK_EQ(consts[i], const_names_[i]) + ICHECK_EQ(consts[i], const_names_[i]) << "The position of constant in the graph must be the same as the required."; } diff --git a/src/runtime/contrib/miopen/miopen_utils.h b/src/runtime/contrib/miopen/miopen_utils.h index 4dec2ad710ba..9982f0914f6b 100644 --- a/src/runtime/contrib/miopen/miopen_utils.h +++ b/src/runtime/contrib/miopen/miopen_utils.h @@ -24,9 +24,9 @@ #ifndef TVM_RUNTIME_CONTRIB_MIOPEN_MIOPEN_UTILS_H_ #define TVM_RUNTIME_CONTRIB_MIOPEN_MIOPEN_UTILS_H_ -#include #include #include +#include #include @@ -38,10 +38,10 @@ namespace miopen { std::string miopenGetErrorString(int error_code); -#define MIOPEN_CALL(func) \ - { \ - miopenStatus_t e = (func); \ - CHECK_EQ(e, miopenStatusSuccess) << "miopen error: " << miopenGetErrorString(e); \ +#define MIOPEN_CALL(func) \ + { \ + miopenStatus_t e = (func); \ + ICHECK_EQ(e, miopenStatusSuccess) << "miopen error: " << miopenGetErrorString(e); \ } struct ConvEntry { diff --git a/src/runtime/contrib/mps/conv.mm b/src/runtime/contrib/mps/conv.mm index b598014f0267..3b16f0820d64 100644 --- a/src/runtime/contrib/mps/conv.mm +++ b/src/runtime/contrib/mps/conv.mm @@ -80,15 +80,15 @@ int pad = args[3]; int stride = args[4]; - CHECK_EQ(data->ndim, 4); - CHECK_EQ(weight->ndim, 4); - CHECK_EQ(output->ndim, 4); - CHECK(output->strides == nullptr); - CHECK(weight->strides == nullptr); - CHECK(data->strides == nullptr); - - CHECK_EQ(data->shape[0], 1); - CHECK_EQ(output->shape[0], 1); + ICHECK_EQ(data->ndim, 4); + ICHECK_EQ(weight->ndim, 4); + ICHECK_EQ(output->ndim, 4); + ICHECK(output->strides == nullptr); + ICHECK(weight->strides == nullptr); + ICHECK(data->strides == nullptr); + + ICHECK_EQ(data->shape[0], 1); + ICHECK_EQ(output->shape[0], 1); int oCh = weight->shape[0]; int kH = weight->shape[1]; diff --git a/src/runtime/contrib/mps/gemm.mm b/src/runtime/contrib/mps/gemm.mm index 109c952ff0c4..c1d80dbed7f3 100644 --- a/src/runtime/contrib/mps/gemm.mm +++ b/src/runtime/contrib/mps/gemm.mm @@ -31,19 +31,19 @@ bool transa = args[3]; bool transb = args[4]; // call gemm for simple compact code. 
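MIOPEN_CALL above and CHECK_CUBLAS_ERROR earlier in this patch share one idiom: evaluate the call exactly once into a local status, then compare it against the library's success code and stream a diagnostic on failure. A self-contained sketch of the idiom, using an invented status enum rather than a real vendor API:

#include <cstdlib>
#include <iostream>

enum ToyStatus { kToySuccess = 0, kToyBadParam = 1 };  // invented status codes

ToyStatus toy_api_call(int x) { return x >= 0 ? kToySuccess : kToyBadParam; }

// do { ... } while (0) makes the macro a single statement, so it nests
// safely inside if/else without braces; the call is evaluated exactly once.
#define TOY_CALL(fn)                                            \
  do {                                                          \
    ToyStatus e = (fn);                                         \
    if (e != kToySuccess) {                                     \
      std::cerr << "toy api error: status " << e << std::endl;  \
      std::abort();                                             \
    }                                                           \
  } while (0)

int main() {
  TOY_CALL(toy_api_call(42));  // succeeds silently
  return 0;
}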
- CHECK_EQ(A->ndim, 2); - CHECK_EQ(B->ndim, 2); - CHECK_EQ(C->ndim, 2); - CHECK(C->strides == nullptr); - CHECK(B->strides == nullptr); - CHECK(A->strides == nullptr); - CHECK(TypeMatch(A->dtype, kDLFloat, 32)); - CHECK(TypeMatch(B->dtype, kDLFloat, 32)); - CHECK(TypeMatch(C->dtype, kDLFloat, 32)); + ICHECK_EQ(A->ndim, 2); + ICHECK_EQ(B->ndim, 2); + ICHECK_EQ(C->ndim, 2); + ICHECK(C->strides == nullptr); + ICHECK(B->strides == nullptr); + ICHECK(A->strides == nullptr); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(B->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(C->dtype, kDLFloat, 32)); // Get Metal device API MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); - // CHECK_EQ(A->ctx, B->ctx); - // CHECK_EQ(A->ctx, C->ctx); + // ICHECK_EQ(A->ctx, B->ctx); + // ICHECK_EQ(A->ctx, C->ctx); id dev = entry_ptr->metal_api->GetDevice(A->ctx); id queue = entry_ptr->metal_api->GetCommandQueue(A->ctx); id cb = [queue commandBuffer]; @@ -51,7 +51,7 @@ NSUInteger N = B->shape[1 - (transb ? 1 : 0)]; NSUInteger K = B->shape[0 + (transb ? 1 : 0)]; - CHECK_EQ(A->shape[1 - (transa ? 1 : 0)], K); + ICHECK_EQ(A->shape[1 - (transa ? 1 : 0)], K); // mps a MPSDataType dtype = MPSType::DLTypeToMPSType(A->dtype); MPSMatrixDescriptor* descA = @@ -86,7 +86,7 @@ interiorColumns:K alpha:1.0f beta:0.0f]; - CHECK(sgemm != nil); + ICHECK(sgemm != nil); [sgemm encodeToCommandBuffer:cb leftMatrix:matrixA rightMatrix:matrixB resultMatrix:matrixC]; [cb commit]; }); diff --git a/src/runtime/contrib/mps/mps_utils.h b/src/runtime/contrib/mps/mps_utils.h index 170451ea385b..d1c49732318a 100644 --- a/src/runtime/contrib/mps/mps_utils.h +++ b/src/runtime/contrib/mps/mps_utils.h @@ -25,11 +25,11 @@ #define TVM_RUNTIME_CONTRIB_MPS_MPS_UTILS_H_ #import -#include #include #include #include #include +#include #include diff --git a/src/runtime/contrib/nnpack/convolution.cc b/src/runtime/contrib/nnpack/convolution.cc index 54c9ea4f969b..b3ea6c891d43 100644 --- a/src/runtime/contrib/nnpack/convolution.cc +++ b/src/runtime/contrib/nnpack/convolution.cc @@ -20,11 +20,11 @@ /*! * \file Use external nnpack library call. 
*/ -#include #include #include #include #include +#include #include "nnpack_utils.h" @@ -36,7 +36,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") .set_body([](TVMArgs args, TVMRetValue* ret) { NNPackThreadLocalEntry* entry = NNPackThreadLocalEntry::ThreadLocal(); static std::once_flag flag; - std::call_once(flag, []() { CHECK_EQ(nnp_initialize(), nnp_status_success); }); + std::call_once(flag, []() { ICHECK_EQ(nnp_initialize(), nnp_status_success); }); DLTensor* input = args[0]; DLTensor* kernel = args[1]; DLTensor* bias = nullptr; @@ -52,36 +52,36 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") uint64_t algo_ = args[11]; nnp_convolution_algorithm algo = static_cast(algo_); - CHECK_EQ(input->ndim, 4); - CHECK_EQ(kernel->ndim, 4); + ICHECK_EQ(input->ndim, 4); + ICHECK_EQ(kernel->ndim, 4); if (bias) { - CHECK_EQ(bias->ndim, 1); + ICHECK_EQ(bias->ndim, 1); } - CHECK_EQ(output->ndim, 4); - CHECK_EQ(input->shape[1], kernel->shape[1]); - CHECK_EQ(input->shape[0], output->shape[0]); + ICHECK_EQ(output->ndim, 4); + ICHECK_EQ(input->shape[1], kernel->shape[1]); + ICHECK_EQ(input->shape[0], output->shape[0]); size_t input_channels = input->shape[1]; - CHECK_EQ(output->shape[1], kernel->shape[0]); + ICHECK_EQ(output->shape[1], kernel->shape[0]); if (bias) { - CHECK_EQ(output->shape[1], bias->shape[0]); + ICHECK_EQ(output->shape[1], bias->shape[0]); } size_t output_channels = output->shape[1]; nnp_size input_size{static_cast(input->shape[2]), static_cast(input->shape[3])}; nnp_size kernel_size{static_cast(kernel->shape[2]), static_cast(kernel->shape[3])}; - CHECK(input->strides == nullptr); - CHECK(kernel->strides == nullptr); + ICHECK(input->strides == nullptr); + ICHECK(kernel->strides == nullptr); if (bias) { - CHECK(bias->strides == nullptr); + ICHECK(bias->strides == nullptr); } - CHECK(TypeMatch(input->dtype, kDLFloat, 32)); - CHECK(TypeMatch(kernel->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(input->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(kernel->dtype, kDLFloat, 32)); if (bias) { - CHECK(TypeMatch(bias->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(bias->dtype, kDLFloat, 32)); } - CHECK(TypeMatch(output->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(output->dtype, kDLFloat, 32)); // Allocate a zero-bias if we don't pass one in. 
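The zero-bias allocation that follows exists because the nnpack inference entry points always expect a bias array, even when the Relay op has none. A minimal sketch of the pattern under that assumption (the helper name is ours, not nnpack's):

#include <memory>

// Return a bias pointer that is always safe to pass to an API with a
// mandatory bias argument; falls back to a zero-filled buffer.
const float* BiasOrZeros(const float* bias, size_t output_channels,
                         std::unique_ptr<float[]>* storage) {
  if (bias != nullptr) return bias;
  *storage = std::make_unique<float[]>(output_channels);  // value-initialized to 0.0f
  return storage->get();
}

int main() {
  std::unique_ptr<float[]> storage;
  const float* b = BiasOrZeros(nullptr, 16, &storage);  // 16 zeros
  return b == storage.get() ? 0 : 1;
}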
std::unique_ptr> zero_bias; @@ -94,7 +94,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") algo, nnp_convolution_transform_strategy_compute, input_channels, output_channels, input_size, input_padding, kernel_size, stride_size, nullptr, nullptr, nullptr, nullptr, nullptr, &workspace_size, nnp_activation_identity, nullptr, entry->threadpool, nullptr); - CHECK_EQ(status, nnp_status_success); + ICHECK_EQ(status, nnp_status_success); // Division with rounding up, in case size is not multiple of sizeof(float) const size_t workspace_elements = (workspace_size + sizeof(float) - 1) / sizeof(float); @@ -105,7 +105,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") DeviceAPI* cpu_api = DeviceAPI::Get(ctx); void* workspace_buffer = cpu_api->AllocWorkspace(ctx, workspace_elements * sizeof(float), type_hint); - CHECK(workspace_buffer != nullptr); + ICHECK(workspace_buffer != nullptr); for (auto n = 0; n < input->shape[0]; ++n) { nnp_status status = nnp_convolution_inference( @@ -120,7 +120,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") workspace_buffer, &workspace_size, nnp_activation_identity, nullptr, entry->threadpool, nullptr); - CHECK_EQ(status, nnp_status_success); + ICHECK_EQ(status, nnp_status_success); } cpu_api->FreeWorkspace(ctx, workspace_buffer); }); @@ -129,7 +129,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra .set_body([](TVMArgs args, TVMRetValue* ret) { NNPackThreadLocalEntry* entry = NNPackThreadLocalEntry::ThreadLocal(); static std::once_flag flag; - std::call_once(flag, []() { CHECK_EQ(nnp_initialize(), nnp_status_success); }); + std::call_once(flag, []() { ICHECK_EQ(nnp_initialize(), nnp_status_success); }); DLTensor* input = args[0]; DLTensor* transformed_kernel = args[1]; DLTensor* bias = nullptr; @@ -145,32 +145,32 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra uint64_t algo_ = args[11]; nnp_convolution_algorithm algo = static_cast(algo_); - CHECK_EQ(input->ndim, 4); + ICHECK_EQ(input->ndim, 4); if (bias) { - CHECK_EQ(bias->ndim, 1); + ICHECK_EQ(bias->ndim, 1); } - CHECK_EQ(output->ndim, 4); - CHECK_EQ(input->shape[0], output->shape[0]); + ICHECK_EQ(output->ndim, 4); + ICHECK_EQ(input->shape[0], output->shape[0]); size_t input_channels = input->shape[1]; if (bias) { - CHECK_EQ(output->shape[1], bias->shape[0]); + ICHECK_EQ(output->shape[1], bias->shape[0]); } size_t output_channels = output->shape[1]; nnp_size input_size{static_cast(input->shape[2]), static_cast(input->shape[3])}; nnp_size kernel_size{3, 3}; - CHECK(input->strides == nullptr); - CHECK(transformed_kernel->strides == nullptr); + ICHECK(input->strides == nullptr); + ICHECK(transformed_kernel->strides == nullptr); if (bias) { - CHECK(bias->strides == nullptr); + ICHECK(bias->strides == nullptr); } - CHECK(TypeMatch(input->dtype, kDLFloat, 32)); - CHECK(TypeMatch(transformed_kernel->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(input->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(transformed_kernel->dtype, kDLFloat, 32)); if (bias) { - CHECK(TypeMatch(bias->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(bias->dtype, kDLFloat, 32)); } - CHECK(TypeMatch(output->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(output->dtype, kDLFloat, 32)); // Allocate a zero-bias if we don't pass one in. 
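Both nnpack hunks convert the returned workspace byte count into float elements with round-up division ("Division with rounding up, in case size is not multiple of sizeof(float)"), so a byte count that is not a multiple of sizeof(float) still gets enough storage. The arithmetic, checked in isolation:

#include <cstddef>
#include <iostream>

// Ceiling division for positive integers: (n + d - 1) / d.
size_t CeilDiv(size_t n, size_t d) { return (n + d - 1) / d; }

int main() {
  std::cout << CeilDiv(10, sizeof(float)) << "\n";  // 3 elements for 10 bytes
  std::cout << CeilDiv(12, sizeof(float)) << "\n";  // 3 elements, exact fit
  return 0;
}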
std::unique_ptr> zero_bias; @@ -183,7 +183,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels, input_size, input_padding, kernel_size, stride_size, nullptr, nullptr, nullptr, nullptr, nullptr, &workspace_size, nnp_activation_identity, nullptr, entry->threadpool, nullptr); - CHECK_EQ(status, nnp_status_success); + ICHECK_EQ(status, nnp_status_success); // Division with rounding up, in case size is not multiple of sizeof(float) const size_t workspace_elements = (workspace_size + sizeof(float) - 1) / sizeof(float); @@ -194,7 +194,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra DeviceAPI* cpu_api = DeviceAPI::Get(ctx); void* workspace_buffer = cpu_api->AllocWorkspace(ctx, workspace_elements * sizeof(float), type_hint); - CHECK(workspace_buffer != nullptr); + ICHECK(workspace_buffer != nullptr); for (auto n = 0; n < input->shape[0]; ++n) { nnp_status status = nnp_convolution_inference( @@ -208,7 +208,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra n * output->shape[1] * output->shape[2] * output->shape[3], workspace_buffer, &workspace_size, nnp_activation_identity, nullptr, entry->threadpool, nullptr); - CHECK_EQ(status, nnp_status_success); + ICHECK_EQ(status, nnp_status_success); } cpu_api->FreeWorkspace(ctx, workspace_buffer); @@ -218,7 +218,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_weight_transform") .set_body([](TVMArgs args, TVMRetValue* ret) { NNPackThreadLocalEntry* entry = NNPackThreadLocalEntry::ThreadLocal(); static std::once_flag flag; - std::call_once(flag, []() { CHECK_EQ(nnp_initialize(), nnp_status_success); }); + std::call_once(flag, []() { ICHECK_EQ(nnp_initialize(), nnp_status_success); }); DLTensor* kernel = args[0]; DLTensor* transformed_kernel = args[1]; // Dummy sizes @@ -231,15 +231,15 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_weight_transform") uint64_t algo_ = args[3]; nnp_convolution_algorithm algo = static_cast(algo_); - CHECK_EQ(kernel->ndim, 4); + ICHECK_EQ(kernel->ndim, 4); size_t input_channels = kernel->shape[1]; size_t output_channels = kernel->shape[0]; - CHECK_EQ(kernel->shape[2], 3); - CHECK_EQ(kernel->shape[3], 3); + ICHECK_EQ(kernel->shape[2], 3); + ICHECK_EQ(kernel->shape[3], 3); nnp_size kernel_size{static_cast(kernel->shape[2]), static_cast(kernel->shape[3])}; - CHECK(kernel->strides == nullptr); - CHECK(TypeMatch(kernel->dtype, kDLFloat, 32)); + ICHECK(kernel->strides == nullptr); + ICHECK(TypeMatch(kernel->dtype, kDLFloat, 32)); size_t transformed_kernel_size = 0; nnp_status status; @@ -248,9 +248,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_weight_transform") input_size, input_padding, kernel_size, stride_size, nullptr, nullptr, nullptr, nullptr, nullptr, &transformed_kernel_size, nnp_activation_identity, nullptr, entry->threadpool, nullptr); - CHECK_EQ(status, nnp_status_success); + ICHECK_EQ(status, nnp_status_success); - CHECK_LE(transformed_kernel_size, GetDataSize(*transformed_kernel)); + ICHECK_LE(transformed_kernel_size, GetDataSize(*transformed_kernel)); status = nnp_convolution_inference( algo, nnp_convolution_transform_strategy_precompute, input_channels, output_channels, @@ -258,7 +258,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_weight_transform") static_cast(kernel->data), nullptr, nullptr, static_cast(transformed_kernel->data), &transformed_kernel_size, 
nnp_activation_identity, nullptr, entry->threadpool, nullptr); - CHECK_EQ(status, nnp_status_success); + ICHECK_EQ(status, nnp_status_success); }); } // namespace contrib } // namespace tvm diff --git a/src/runtime/contrib/nnpack/fully_connected.cc b/src/runtime/contrib/nnpack/fully_connected.cc index 543d23958633..8b72eb38e08c 100644 --- a/src/runtime/contrib/nnpack/fully_connected.cc +++ b/src/runtime/contrib/nnpack/fully_connected.cc @@ -20,10 +20,10 @@ /*! * \file Use external nnpack library call. */ -#include #include #include #include +#include #include "nnpack_utils.h" @@ -42,17 +42,17 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.fully_connected_inference") DLTensor* C = args[2]; NNPackConfig(args[3]); - CHECK_EQ(A->ndim, 1); - CHECK_EQ(B->ndim, 2); - CHECK_EQ(C->ndim, 1); - CHECK_EQ(B->shape[0], C->shape[0]); - CHECK_EQ(B->shape[1], A->shape[0]); - CHECK(C->strides == nullptr); - CHECK(B->strides == nullptr); - CHECK(A->strides == nullptr); - CHECK(TypeMatch(A->dtype, kDLFloat, 32)); - CHECK(TypeMatch(B->dtype, kDLFloat, 32)); - CHECK(TypeMatch(C->dtype, kDLFloat, 32)); + ICHECK_EQ(A->ndim, 1); + ICHECK_EQ(B->ndim, 2); + ICHECK_EQ(C->ndim, 1); + ICHECK_EQ(B->shape[0], C->shape[0]); + ICHECK_EQ(B->shape[1], A->shape[0]); + ICHECK(C->strides == nullptr); + ICHECK(B->strides == nullptr); + ICHECK(A->strides == nullptr); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(B->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(C->dtype, kDLFloat, 32)); nnp_fully_connected_inference(B->shape[1], B->shape[0], static_cast(A->data), static_cast(B->data), static_cast(C->data), diff --git a/src/runtime/contrib/nnpack/nnpack_utils.cc b/src/runtime/contrib/nnpack/nnpack_utils.cc index 91cf865128e9..2fd6f69bf20c 100644 --- a/src/runtime/contrib/nnpack/nnpack_utils.cc +++ b/src/runtime/contrib/nnpack/nnpack_utils.cc @@ -35,7 +35,7 @@ NNPackThreadLocalEntry* NNPackThreadLocalEntry::ThreadLocal() { bool NNPackConfig(uint64_t nthreads) { NNPackThreadLocalEntry* entry = NNPackThreadLocalEntry::ThreadLocal(); if (entry->threadpool && pthreadpool_get_threads_count(entry->threadpool) == nthreads) { - CHECK_NE(nthreads, 1); + ICHECK_NE(nthreads, 1); return true; } if (entry->threadpool) { @@ -46,7 +46,7 @@ bool NNPackConfig(uint64_t nthreads) { if (nthreads == 1) { // a null threadpool means the function is invoked on the calling thread, // which is the desired logic for nthreads == 1 - CHECK(!entry->threadpool); + ICHECK(!entry->threadpool); return true; } diff --git a/src/runtime/contrib/nnpack/nnpack_utils.h b/src/runtime/contrib/nnpack/nnpack_utils.h index bbb0d16bc868..231309baaa8e 100644 --- a/src/runtime/contrib/nnpack/nnpack_utils.h +++ b/src/runtime/contrib/nnpack/nnpack_utils.h @@ -22,11 +22,11 @@ */ #ifndef TVM_RUNTIME_CONTRIB_NNPACK_NNPACK_UTILS_H_ #define TVM_RUNTIME_CONTRIB_NNPACK_NNPACK_UTILS_H_ -#include #include #include #include #include +#include namespace tvm { namespace contrib { diff --git a/src/runtime/contrib/onnx/onnx_module.cc b/src/runtime/contrib/onnx/onnx_module.cc index 9574b8674c8b..b235d63dbc58 100644 --- a/src/runtime/contrib/onnx/onnx_module.cc +++ b/src/runtime/contrib/onnx/onnx_module.cc @@ -53,8 +53,8 @@ class ONNXSourceModuleNode : public runtime::ModuleNode { std::string GetSource(const std::string& format) final { return code_; } void SaveToFile(const std::string& path, const std::string& format) final { - CHECK_EQ(format, "onnx") << "Can only save to onnx format"; - CHECK_NE(code_.length(), 0); + ICHECK_EQ(format, "onnx") << "Can only save to onnx format"; + 
ICHECK_NE(code_.length(), 0); const PackedFunc* to_onnx_ = runtime::Registry::Get("relay.ext.onnx.save_to_file"); (*to_onnx_)(code_, path, format); } diff --git a/src/runtime/contrib/random/mt_random_engine.cc b/src/runtime/contrib/random/mt_random_engine.cc index 8c20f0700ee7..49bc056dcafb 100644 --- a/src/runtime/contrib/random/mt_random_engine.cc +++ b/src/runtime/contrib/random/mt_random_engine.cc @@ -21,9 +21,9 @@ * \file random/mt_random_engine.cc * \brief mt19937 random engine */ -#include #include #include +#include #include #include @@ -71,8 +71,8 @@ class RandomEngine { * \brief Fills a tensor with values drawn from Unif(low, high) */ void SampleUniform(DLTensor* data, float low, float high) { - CHECK_GT(high, low) << "high must be bigger than low"; - CHECK(data->strides == nullptr); + ICHECK_GT(high, low) << "high must be bigger than low"; + ICHECK(data->strides == nullptr); DLDataType dtype = data->dtype; int64_t size = 1; @@ -80,7 +80,7 @@ class RandomEngine { size *= data->shape[i]; } - CHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1); + ICHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1); if (data->ctx.device_type == kDLCPU) { std::uniform_real_distribution uniform_dist(low, high); @@ -95,8 +95,8 @@ class RandomEngine { * \brief Fills a tensor with values drawn from Normal(loc, scale**2) */ void SampleNormal(DLTensor* data, float loc, float scale) { - CHECK_GT(scale, 0) << "standard deviation must be positive"; - CHECK(data->strides == nullptr); + ICHECK_GT(scale, 0) << "standard deviation must be positive"; + ICHECK(data->strides == nullptr); DLDataType dtype = data->dtype; int64_t size = 1; @@ -104,7 +104,7 @@ class RandomEngine { size *= data->shape[i]; } - CHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1); + ICHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1); if (data->ctx.device_type == kDLCPU) { std::normal_distribution normal_dist(loc, scale); diff --git a/src/runtime/contrib/random/random.cc b/src/runtime/contrib/random/random.cc index 14bdd267d38c..edcd20883369 100644 --- a/src/runtime/contrib/random/random.cc +++ b/src/runtime/contrib/random/random.cc @@ -20,10 +20,10 @@ /*! * \file External random functions for tensor. */ -#include #include #include #include +#include #include @@ -73,8 +73,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.random.randint").set_body([](TVMArgs args, TVMR int64_t low = args[0]; int64_t high = args[1]; DLTensor* out = args[2]; - CHECK_GT(high, low) << "high must be bigger than low"; - CHECK(out->strides == nullptr); + ICHECK_GT(high, low) << "high must be bigger than low"; + ICHECK(out->strides == nullptr); DLDataType dtype = out->dtype; int64_t size = 1; diff --git a/src/runtime/contrib/rocblas/rocblas.cc b/src/runtime/contrib/rocblas/rocblas.cc index bca00a591d48..dca1ebc6ed83 100644 --- a/src/runtime/contrib/rocblas/rocblas.cc +++ b/src/runtime/contrib/rocblas/rocblas.cc @@ -22,9 +22,9 @@ */ #include "rocblas.h" -#include #include #include +#include namespace tvm { namespace contrib { @@ -56,15 +56,15 @@ TVM_REGISTER_GLOBAL("tvm.contrib.rocblas.matmul").set_body([](TVMArgs args, TVMR bool transa = args[3]; bool transb = args[4]; // call gemm for simple compact code. 
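The operand order in these wrappers (op(transb, transa, ColumnCount(B, transb), RowCount(A, transa), ...)) is the usual way to get a row-major C = A * B out of a column-major BLAS: since (AB)^T = B^T A^T, swapping the operands and transpose flags makes the library write C^T, which in memory is exactly row-major C. A small numeric check of that identity with a naive column-major GEMM (no BLAS dependency):

#include <iostream>
#include <vector>

// Naive column-major GEMM: C(i,j) = sum_p A(i,p) * B(p,j), leading dims = row counts.
void CmGemm(int m, int n, int k, const std::vector<double>& A,
            const std::vector<double>& B, std::vector<double>* C) {
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < m; ++i) {
      double acc = 0.0;
      for (int p = 0; p < k; ++p) acc += A[p * m + i] * B[j * k + p];
      (*C)[j * m + i] = acc;
    }
  }
}

int main() {
  std::vector<double> A = {1, 2, 3, 4, 5, 6};     // row-major 2x3
  std::vector<double> B = {7, 8, 9, 10, 11, 12};  // row-major 3x2
  std::vector<double> C(4);
  // A's buffer read column-major is A^T (3x2); likewise for B. Computing
  // B^T * A^T (2x3 times 3x2) column-major leaves row-major A*B in C.
  CmGemm(2, 2, 3, B, A, &C);
  for (double v : C) std::cout << v << " ";  // prints: 58 64 139 154
  std::cout << "\n";
  return 0;
}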
- CHECK_EQ(A->ndim, 2); - CHECK_EQ(B->ndim, 2); - CHECK_EQ(C->ndim, 2); - CHECK(C->strides == nullptr); - CHECK(B->strides == nullptr); - CHECK(A->strides == nullptr); - CHECK(TypeMatch(A->dtype, kDLFloat, 32)); - CHECK(TypeMatch(B->dtype, kDLFloat, 32)); - CHECK(TypeMatch(C->dtype, kDLFloat, 32)); + ICHECK_EQ(A->ndim, 2); + ICHECK_EQ(B->ndim, 2); + ICHECK_EQ(C->ndim, 2); + ICHECK(C->strides == nullptr); + ICHECK(B->strides == nullptr); + ICHECK(A->strides == nullptr); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(B->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(C->dtype, kDLFloat, 32)); rocblas_handle handle; CHECK_ROCBLAS_ERROR(rocblas_create_handle(&handle)); @@ -97,12 +97,12 @@ TVM_REGISTER_GLOBAL("tvm.contrib.rocblas.batch_matmul") bool transa = args[3]; bool transb = args[4]; // call gemm for simple compact code. - CHECK_EQ(A->ndim, 3); - CHECK_EQ(B->ndim, 3); - CHECK_EQ(C->ndim, 3); - CHECK(TypeMatch(A->dtype, kDLFloat, 32)); - CHECK(TypeMatch(B->dtype, kDLFloat, 32)); - CHECK(TypeMatch(C->dtype, kDLFloat, 32)); + ICHECK_EQ(A->ndim, 3); + ICHECK_EQ(B->ndim, 3); + ICHECK_EQ(C->ndim, 3); + ICHECK(TypeMatch(A->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(B->dtype, kDLFloat, 32)); + ICHECK(TypeMatch(C->dtype, kDLFloat, 32)); rocblas_handle handle; CHECK_ROCBLAS_ERROR(rocblas_create_handle(&handle)); diff --git a/src/runtime/contrib/sort/sort.cc b/src/runtime/contrib/sort/sort.cc index 9543e4b4c64e..31cf38d7d7a5 100644 --- a/src/runtime/contrib/sort/sort.cc +++ b/src/runtime/contrib/sort/sort.cc @@ -68,15 +68,15 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort_nms").set_body([](TVMArgs args, TV } // Currently only supports input dtype to be float32. - CHECK_EQ(dtype.code, 2) << "Currently only supports input dtype " - "to be float."; + ICHECK_EQ(dtype.code, 2) << "Currently only supports input dtype " + "to be float."; #if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC != 1) - CHECK_EQ(dtype.bits, 32) << "Currently only supports input dtype " - "to be float32."; + ICHECK_EQ(dtype.bits, 32) << "Currently only supports input dtype " + "to be float32."; #endif - CHECK_LT(axis, input->ndim) << "Axis out of boundary for " - "input ndim " - << input->ndim; + ICHECK_LT(axis, input->ndim) << "Axis out of boundary for " + "input ndim " + << input->ndim; for (int i = 0; i < input->ndim; ++i) { if (i < axis) { @@ -175,9 +175,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort").set_body([](TVMArgs args, TVMRet if (axis < 0) { axis = input->ndim + axis; } - CHECK_LT(axis, input->ndim) << "Axis out of boundary for " - "input ndim " - << input->ndim; + ICHECK_LT(axis, input->ndim) << "Axis out of boundary for " + "input ndim " + << input->ndim; auto data_dtype = DLDataType2String(input->dtype); auto out_dtype = DLDataType2String(output->dtype); @@ -322,7 +322,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.topk").set_body([](TVMArgs args, TVMRetVal if (axis < 0) { axis = input->ndim + axis; } - CHECK(axis >= 0 && axis < input->ndim) << "Axis out of boundary for input ndim " << input->ndim; + ICHECK(axis >= 0 && axis < input->ndim) << "Axis out of boundary for input ndim " << input->ndim; auto data_dtype = DLDataType2String(input->dtype); auto out_dtype = (indices_out == nullptr) ? 
"int64" : DLDataType2String(indices_out->dtype); diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index bf0dbfe724ed..d308200eba05 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -67,7 +67,7 @@ void TensorRTBuilder::AddInput(int nid, const JSONGraphNode& node) { auto node_name = node.GetOpName(); auto shapes = node.GetOpShape(); auto dtypes = node.GetOpDataType(); - CHECK_EQ(shapes.size(), dtypes.size()); + ICHECK_EQ(shapes.size(), dtypes.size()); node_output_map_[nid] = {}; for (size_t i = 0; i < shapes.size(); ++i) { const std::string name = node_name + "_" + std::to_string(i); @@ -77,7 +77,7 @@ void TensorRTBuilder::AddInput(int nid, const JSONGraphNode& node) { shape.erase(shape.begin()); } nvinfer1::Dims dims = VectorToTrtDims(shape); - CHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported."; + ICHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported."; auto input_tensor = network_->addInput(name.c_str(), nvinfer1::DataType::kFLOAT, dims); node_output_map_[nid].push_back(TensorRTOpInput(input_tensor)); network_input_names_.push_back(input_tensor->getName()); @@ -96,7 +96,7 @@ void TensorRTBuilder::AddConstant(int nid, const DLTensor* data) { void TensorRTBuilder::AddOutput(const JSONGraphNodeEntry& node) { auto it = node_output_map_.find(node.id_); - CHECK(it != node_output_map_.end()) << "Output was not found."; + ICHECK(it != node_output_map_.end()) << "Output was not found."; auto out_tensor = it->second[node.index_].tensor; std::string name = "tensorrt_output_" + std::to_string(network_output_names_.size()); out_tensor->setName(name.c_str()); @@ -108,14 +108,14 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) { TensorRTOpConverterParams params(network_, node, &trt_weights_); // Look up converter. auto it = GetOpConverters()->find(params.op_name); - CHECK(it != GetOpConverters()->end()) + ICHECK(it != GetOpConverters()->end()) << "Unsupported operator conversion to TRT, op name: " << params.op_name; const auto converter = it->second; // Get inputs. for (size_t i = 0; i < node.GetInputs().size(); ++i) { auto in_node = node.GetInputs()[i]; auto it = node_output_map_.find(in_node.id_); - CHECK(it != node_output_map_.end()) << "Input was not found."; + ICHECK(it != node_output_map_.end()) << "Input was not found."; auto input = it->second[in_node.index_]; if (!converter->variable_input_count) { if (converter->input_types[i] == kTensor && input.type == kWeight) { @@ -127,7 +127,7 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) { } params.inputs.push_back(input); } - CHECK(converter->variable_input_count || converter->input_types.size() == params.inputs.size()) + ICHECK(converter->variable_input_count || converter->input_types.size() == params.inputs.size()) << "Op expected a different number of inputs."; // Convert op to TRT. 
@@ -165,7 +165,7 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { #else nvinfer1::ICudaEngine* engine = builder_->buildCudaEngine(*network_); #endif - CHECK_EQ(engine->getNbBindings(), network_input_names_.size() + network_output_names_.size()); + ICHECK_EQ(engine->getNbBindings(), network_input_names_.size() + network_output_names_.size()); nvinfer1::IExecutionContext* context = engine->createExecutionContext(); CleanUp(); return {engine, context, network_input_names_, network_output_names_}; @@ -173,9 +173,9 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, DLDeviceType src_device) { - CHECK_EQ(dptr->ctx.device_type, src_device); - CHECK(static_cast(dptr->dtype.code) == kDLFloat || - static_cast(dptr->dtype.code) == kDLInt); + ICHECK_EQ(dptr->ctx.device_type, src_device); + ICHECK(static_cast(dptr->dtype.code) == kDLFloat || + static_cast(dptr->dtype.code) == kDLInt); const auto trt_dtype = static_cast(dptr->dtype.code) == kDLFloat ? nvinfer1::DataType::kFLOAT : nvinfer1::DataType::kINT32; @@ -185,12 +185,12 @@ nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, for (tvm_index_t i = 0; i < dptr->ndim; ++i) { count *= dptr->shape[i]; } - CHECK_EQ(count * 4, weight_bytes); + ICHECK_EQ(count * 4, weight_bytes); weight.count = count; weight.values = new float[count]; - CHECK_EQ(TVMArrayCopyToBytes(const_cast(dptr), const_cast(weight.values), - weight_bytes), - 0) + ICHECK_EQ(TVMArrayCopyToBytes(const_cast(dptr), const_cast(weight.values), + weight_bytes), + 0) << TVMGetLastError(); trt_weights_.push_back(weight); return weight; diff --git a/src/runtime/contrib/tensorrt/tensorrt_logger.h b/src/runtime/contrib/tensorrt/tensorrt_logger.h index 53b6dfeea763..087cb010189c 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_logger.h +++ b/src/runtime/contrib/tensorrt/tensorrt_logger.h @@ -25,7 +25,7 @@ #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ -#include +#include #include "NvInfer.h" #include "tensorrt_utils.h" diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc index a1da6c39f68e..4c5eeea1e644 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc @@ -47,7 +47,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Reshape(TensorRTOpConverterParams* param nvinfer1::ITensor* input, const std::vector& new_shape) const { auto layer = params->network->addShuffle(*input); - CHECK(layer != nullptr); + ICHECK(layer != nullptr); layer->setReshapeDimensions(VectorToTrtDims(new_shape)); return layer->getOutput(0); } @@ -56,17 +56,17 @@ nvinfer1::ITensor* TensorRTOpConverter::Transpose(TensorRTOpConverterParams* par nvinfer1::ITensor* input, const std::vector& order) const { auto layer = params->network->addShuffle(*input); - CHECK(layer != nullptr); + ICHECK(layer != nullptr); nvinfer1::Permutation perm; if (TRT_HAS_IMPLICIT_BATCH(params)) { // Batch dimension cannot be modified. 
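Under TensorRT's implicit batch mode the network never sees the batch dimension, so the axis handling in the checks below (and in ConvertAxis further down) normalizes negative axes against the full rank, forbids touching axis 0, and shifts everything down by one. A standalone sketch of that arithmetic (our own helper, mirroring but not copying ConvertAxis):

#include <cassert>

// Map a (possibly negative) framework axis onto a TensorRT axis when the
// batch dimension is implicit, i.e. invisible to the TRT network.
int ImplicitBatchAxis(int axis, int trt_rank) {
  const int full_rank = trt_rank + 1;      // account for the hidden batch dim
  assert(axis >= -full_rank && axis < full_rank);
  if (axis < 0) axis += full_rank;         // normalize negative axes
  assert(axis != 0);                       // the batch dim cannot be modified
  return axis - 1;                         // drop the batch dim
}

int main() {
  assert(ImplicitBatchAxis(1, 3) == 0);    // NCHW channel axis -> TRT axis 0
  assert(ImplicitBatchAxis(-1, 3) == 2);   // width axis -> TRT axis 2
  return 0;
}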
- CHECK_EQ(input->getDimensions().nbDims, order.size() - 1); - CHECK_EQ(order[0], 0); + ICHECK_EQ(input->getDimensions().nbDims, order.size() - 1); + ICHECK_EQ(order[0], 0); for (size_t i = 0; i < order.size(); ++i) { perm.order[i] = order[i + 1] - 1; } } else { - CHECK_EQ(input->getDimensions().nbDims, order.size()); + ICHECK_EQ(input->getDimensions().nbDims, order.size()); for (size_t i = 0; i < order.size(); ++i) { perm.order[i] = order[i]; } @@ -81,11 +81,11 @@ int TensorRTOpConverter::ConvertAxis(TensorRTOpConverterParams* params, int axis if (TRT_HAS_IMPLICIT_BATCH(params)) { input_rank += 1; } - CHECK(axis >= -input_rank && axis < input_rank); + ICHECK(axis >= -input_rank && axis < input_rank); if (axis < 0) axis += input_rank; if (TRT_HAS_IMPLICIT_BATCH(params)) { // Can't modify batch dimenson. - CHECK_NE(axis, 0); + ICHECK_NE(axis, 0); // Subtract 1 for implicit batch dim. axis -= 1; } @@ -107,7 +107,7 @@ nvinfer1::ITensor* TensorRTOpConverter::CreateScalar( void TensorRTOpConverter::GetPadding(const std::vector& padding, bool* use_asymmetric_padding, nvinfer1::DimsHW* prepadding, nvinfer1::DimsHW* postpadding) const { - CHECK(padding.size() == 1 || padding.size() == 2 || padding.size() == 4); + ICHECK(padding.size() == 1 || padding.size() == 2 || padding.size() == 4); if (padding.size() == 4) { // four int : padding width in the order of (top, left, bottom, right). *prepadding = nvinfer1::DimsHW(std::stoi(padding[0]), std::stoi(padding[1])); @@ -129,7 +129,7 @@ void TensorRTOpConverter::GetPadding(const std::vector& padding, void TensorRTOpConverter::GetPadding3D(const std::vector& padding, bool* use_asymmetric_padding, nvinfer1::Dims* prepadding, nvinfer1::Dims* postpadding) const { - CHECK(padding.size() == 1 || padding.size() == 3 || padding.size() == 6); + ICHECK(padding.size() == 1 || padding.size() == 3 || padding.size() == 6); if (padding.size() == 6) { // six int : padding width in the order of (front, top, left, back, bottom, right) *prepadding = @@ -167,7 +167,7 @@ class ActivationOpConverter : public TensorRTOpConverter { #endif }; auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported activation type " << params->op_name; + ICHECK(it != op_map.end()) << "Unsupported activation type " << params->op_name; nvinfer1::IActivationLayer* act_layer = params->network->addActivation(*params->inputs.at(0).tensor, it->second); #if TRT_VERSION_GE(5, 1, 5) @@ -181,7 +181,7 @@ class ActivationOpConverter : public TensorRTOpConverter { act_layer->setAlpha(alpha); } #endif - CHECK(act_layer != nullptr); + ICHECK(act_layer != nullptr); params->outputs.push_back(act_layer->getOutput(0)); } }; @@ -200,7 +200,7 @@ class ElementWiseBinaryOpConverter : public TensorRTOpConverter { {"maximum", nvinfer1::ElementWiseOperation::kMAX}, {"minimum", nvinfer1::ElementWiseOperation::kMIN}}; auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported elementwise type " << params->op_name; + ICHECK(it != op_map.end()) << "Unsupported elementwise type " << params->op_name; // Broadcast auto input0 = params->inputs.at(0).tensor; auto input0_dims = TrtDimsToVector(input0->getDimensions()); @@ -221,7 +221,7 @@ class ElementWiseBinaryOpConverter : public TensorRTOpConverter { nvinfer1::IElementWiseLayer* elemwise_layer = params->network->addElementWise(*input0, *input1, it->second); - CHECK(elemwise_layer != nullptr); + ICHECK(elemwise_layer != nullptr); params->outputs.push_back(elemwise_layer->getOutput(0)); } }; @@ -234,10 +234,10 @@ class 
     auto input_tensor = params->inputs.at(0).tensor;
     auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
     auto weight_shape = params->inputs.at(1).weight_shape;
-    CHECK_EQ(params->node.GetAttr<std::vector<std::string>>("data_layout")[0], "NCHW");
-    CHECK(params->node.GetAttr<std::vector<std::string>>("out_layout")[0] == "" ||
-          params->node.GetAttr<std::vector<std::string>>("out_layout")[0] == "NCHW");
-    CHECK_EQ(params->node.GetAttr<std::vector<std::string>>("kernel_layout")[0], "OIHW");
+    ICHECK_EQ(params->node.GetAttr<std::vector<std::string>>("data_layout")[0], "NCHW");
+    ICHECK(params->node.GetAttr<std::vector<std::string>>("out_layout")[0] == "" ||
+           params->node.GetAttr<std::vector<std::string>>("out_layout")[0] == "NCHW");
+    ICHECK_EQ(params->node.GetAttr<std::vector<std::string>>("kernel_layout")[0], "OIHW");
     auto str_strides = params->node.GetAttr<std::vector<std::string>>("strides");
     auto str_dilation = params->node.GetAttr<std::vector<std::string>>("dilation");
     auto str_padding = params->node.GetAttr<std::vector<std::string>>("padding");
@@ -251,7 +251,7 @@ class Conv2DOpConverter : public TensorRTOpConverter {
 #if !TRT_VERSION_GE(5, 1, 5)
     if (use_asymmetric_padding) {
       auto pad_layer = params->network->addPadding(*input_tensor, prepadding, postpadding);
-      CHECK(pad_layer != nullptr);
+      ICHECK(pad_layer != nullptr);
       input_tensor = pad_layer->getOutput(0);
       // No need for conv op to do any padding.
       use_asymmetric_padding = false;
@@ -263,7 +263,7 @@ class Conv2DOpConverter : public TensorRTOpConverter {
     nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
     auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size,
                                                       params->inputs.at(1).weight, bias);
-    CHECK(conv_layer != nullptr);
+    ICHECK(conv_layer != nullptr);
     if (use_asymmetric_padding) {
 #if TRT_VERSION_GE(5, 1, 5)
       conv_layer->setPrePadding(prepadding);
@@ -272,10 +272,10 @@ class Conv2DOpConverter : public TensorRTOpConverter {
     } else {
       conv_layer->setPadding(prepadding);
     }
-    CHECK_EQ(str_strides.size(), 2);
+    ICHECK_EQ(str_strides.size(), 2);
     const auto strides = nvinfer1::DimsHW(std::stoi(str_strides[0]), std::stoi(str_strides[1]));
     conv_layer->setStride(strides);
-    CHECK_EQ(str_dilation.size(), 2);
+    ICHECK_EQ(str_dilation.size(), 2);
     const auto dilation = nvinfer1::DimsHW(std::stoi(str_dilation[0]), std::stoi(str_dilation[1]));
     conv_layer->setDilation(dilation);
     conv_layer->setNbGroups(groups);
@@ -292,10 +292,10 @@ class Conv3DOpConverter : public TensorRTOpConverter {
     auto input_tensor = params->inputs.at(0).tensor;
     auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
     auto weight_shape = params->inputs.at(1).weight_shape;
-    CHECK_EQ(params->node.GetAttr<std::vector<std::string>>("data_layout")[0], "NCDHW");
-    CHECK(params->node.GetAttr<std::vector<std::string>>("out_layout")[0] == "" ||
-          params->node.GetAttr<std::vector<std::string>>("out_layout")[0] == "NCDHW");
-    CHECK_EQ(params->node.GetAttr<std::vector<std::string>>("kernel_layout")[0], "OIDHW");
+    ICHECK_EQ(params->node.GetAttr<std::vector<std::string>>("data_layout")[0], "NCDHW");
+    ICHECK(params->node.GetAttr<std::vector<std::string>>("out_layout")[0] == "" ||
+           params->node.GetAttr<std::vector<std::string>>("out_layout")[0] == "NCDHW");
+    ICHECK_EQ(params->node.GetAttr<std::vector<std::string>>("kernel_layout")[0], "OIDHW");
     auto str_strides = params->node.GetAttr<std::vector<std::string>>("strides");
     auto str_dilation = params->node.GetAttr<std::vector<std::string>>("dilation");
     auto str_padding = params->node.GetAttr<std::vector<std::string>>("padding");
@@ -311,18 +311,18 @@ class Conv3DOpConverter : public TensorRTOpConverter {
     nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
     auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size,
                                                         params->inputs.at(1).weight, bias);
-    CHECK(conv_layer != nullptr);
+    ICHECK(conv_layer != nullptr);
     if (use_asymmetric_padding) {
       conv_layer->setPrePadding(prepadding);
       conv_layer->setPostPadding(postpadding);
     } else {
       conv_layer->setPaddingNd(prepadding);
     }
-    CHECK_EQ(str_strides.size(), 3);
+    ICHECK_EQ(str_strides.size(), 3);
     const auto strides = nvinfer1::Dims3(std::stoi(str_strides[0]), std::stoi(str_strides[1]),
                                          std::stoi(str_strides[2]));
     conv_layer->setStrideNd(strides);
-    CHECK_EQ(str_dilation.size(), 3);
+    ICHECK_EQ(str_dilation.size(), 3);
     const auto dilation = nvinfer1::Dims3(std::stoi(str_dilation[0]), std::stoi(str_dilation[1]),
                                           std::stoi(str_dilation[2]));
     conv_layer->setDilationNd(dilation);
@@ -339,7 +339,7 @@ class DenseOpConverter : public TensorRTOpConverter {
   void Convert(TensorRTOpConverterParams* params) const {
     auto input_tensor = params->inputs.at(0).tensor;
     auto input_dims = TrtDimsToVector(input_tensor->getDimensions());
-    CHECK(input_dims.size() > 0 && input_dims.size() <= 3);
+    ICHECK(input_dims.size() > 0 && input_dims.size() <= 3);
     const size_t required_rank = TRT_HAS_IMPLICIT_BATCH(params) ? 3 : 4;
     const bool need_reshape_on_input = input_dims.size() != required_rank;
     if (need_reshape_on_input) {
@@ -349,12 +349,12 @@ class DenseOpConverter : public TensorRTOpConverter {
       input_tensor = Reshape(params, input_tensor, new_shape);
     }
     // Weights are in KC format.
-    CHECK_EQ(params->inputs.at(1).weight_shape.size(), 2);
+    ICHECK_EQ(params->inputs.at(1).weight_shape.size(), 2);
     const int num_units = params->inputs.at(1).weight_shape[0];
     nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
     nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected(
         *input_tensor, num_units, params->inputs.at(1).weight, bias);
-    CHECK(fc_layer != nullptr);
+    ICHECK(fc_layer != nullptr);
     auto output_tensor = fc_layer->getOutput(0);
     if (need_reshape_on_input) {
       // Remove added dims.
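      // For example (illustrative shapes, assuming implicit batch mode): an
      // input that arrives as TRT dims (256) is padded to (256, 1, 1) before
      // addFullyConnected, and the trailing unit dims added above are removed
      // again here.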
@@ -375,14 +375,14 @@ class BatchNormOpConverter : public TensorRTOpConverter { auto beta = params->inputs.at(2).weight; auto mean = params->inputs.at(3).weight; auto var = params->inputs.at(4).weight; - CHECK_EQ(gamma.count, beta.count); - CHECK_EQ(gamma.count, mean.count); - CHECK_EQ(gamma.count, var.count); + ICHECK_EQ(gamma.count, beta.count); + ICHECK_EQ(gamma.count, mean.count); + ICHECK_EQ(gamma.count, var.count); const float epsilon = std::stof(params->node.GetAttr>("epsilon")[0]); const int axis = std::stoi(params->node.GetAttr>("axis")[0]); const bool scale = std::stoi(params->node.GetAttr>("scale")[0]); const bool center = std::stoi(params->node.GetAttr>("center")[0]); - CHECK(axis == 1 || axis == 3); + ICHECK(axis == 1 || axis == 3); const bool need_transpose = axis == 3; void* weight_scale_ptr = new float[gamma.count]; @@ -415,7 +415,7 @@ class BatchNormOpConverter : public TensorRTOpConverter { } nvinfer1::IScaleLayer* scale_layer = params->network->addScale( *input, nvinfer1::ScaleMode::kCHANNEL, weight_shift, weight_scale, power); - CHECK(scale_layer != nullptr); + ICHECK(scale_layer != nullptr); auto output = scale_layer->getOutput(0); if (need_transpose) { output = Transpose(params, output, {0, 2, 3, 1}); @@ -448,7 +448,7 @@ class SoftmaxOpConverter : public TensorRTOpConverter { const int axis = ConvertAxis(params, original_axis, input_rank); nvinfer1::ISoftMaxLayer* softmax_layer = params->network->addSoftMax(*input); softmax_layer->setAxes(1 << axis); - CHECK(softmax_layer != nullptr); + ICHECK(softmax_layer != nullptr); params->outputs.push_back(softmax_layer->getOutput(0)); } }; @@ -463,8 +463,8 @@ class PoolingOpConverter : public TensorRTOpConverter { {"nn.max_pool2d", nvinfer1::PoolingType::kMAX}, {"nn.avg_pool2d", nvinfer1::PoolingType::kAVERAGE}}; auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; - CHECK_EQ(params->node.GetAttr>("layout")[0], "NCHW"); + ICHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; + ICHECK_EQ(params->node.GetAttr>("layout")[0], "NCHW"); auto str_pool_size = params->node.GetAttr>("pool_size"); auto str_padding = params->node.GetAttr>("padding"); auto str_strides = params->node.GetAttr>("strides"); @@ -478,7 +478,7 @@ class PoolingOpConverter : public TensorRTOpConverter { #if !TRT_VERSION_GE(5, 1, 5) if (use_asymmetric_padding) { auto pad_layer = params->network->addPadding(*input, prepadding, postpadding); - CHECK(pad_layer != nullptr); + ICHECK(pad_layer != nullptr); input = pad_layer->getOutput(0); // No need for pooling op to do any padding. 
use_asymmetric_padding = false; @@ -489,7 +489,7 @@ class PoolingOpConverter : public TensorRTOpConverter { nvinfer1::DimsHW window_size = nvinfer1::DimsHW(std::stoi(str_pool_size[0]), std::stoi(str_pool_size[1])); auto pool_layer = params->network->addPooling(*input, it->second, window_size); - CHECK(pool_layer != nullptr); + ICHECK(pool_layer != nullptr); nvinfer1::DimsHW strides = nvinfer1::DimsHW(std::stoi(str_strides[0]), std::stoi(str_strides[1])); pool_layer->setStride(strides); @@ -519,7 +519,7 @@ class PoolingOpConverter : public TensorRTOpConverter { pool_layer->setPaddingMode(nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP); } #else - CHECK(!ceil_mode); + ICHECK(!ceil_mode); #endif params->outputs.push_back(pool_layer->getOutput(0)); } @@ -536,8 +536,8 @@ class Pooling3DOpConverter : public TensorRTOpConverter { {"nn.max_pool3d", nvinfer1::PoolingType::kMAX}, {"nn.avg_pool3d", nvinfer1::PoolingType::kAVERAGE}}; auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; - CHECK_EQ(params->node.GetAttr>("layout")[0], "NCDHW"); + ICHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; + ICHECK_EQ(params->node.GetAttr>("layout")[0], "NCDHW"); auto str_pool_size = params->node.GetAttr>("pool_size"); auto str_padding = params->node.GetAttr>("padding"); auto str_strides = params->node.GetAttr>("strides"); @@ -548,7 +548,7 @@ class Pooling3DOpConverter : public TensorRTOpConverter { nvinfer1::Dims window_size = nvinfer1::Dims3( std::stoi(str_pool_size[0]), std::stoi(str_pool_size[1]), std::stoi(str_pool_size[2])); auto pool_layer = params->network->addPoolingNd(*input, it->second, window_size); - CHECK(pool_layer != nullptr); + ICHECK(pool_layer != nullptr); nvinfer1::Dims strides = nvinfer1::Dims3(std::stoi(str_strides[0]), std::stoi(str_strides[1]), std::stoi(str_strides[2])); pool_layer->setStrideNd(strides); @@ -582,13 +582,13 @@ class GlobalPoolingOpConverter : public TensorRTOpConverter { {"nn.global_max_pool2d", nvinfer1::PoolingType::kMAX}, {"nn.global_avg_pool2d", nvinfer1::PoolingType::kAVERAGE}}; auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; - CHECK_EQ(params->node.GetAttr>("layout")[0], "NCHW"); + ICHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; + ICHECK_EQ(params->node.GetAttr>("layout")[0], "NCHW"); const int h = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[1] : input_dims[2]; const int w = TRT_HAS_IMPLICIT_BATCH(params) ? 
input_dims[2] : input_dims[3]; auto pool_layer = params->network->addPooling(*input_tensor, it->second, nvinfer1::DimsHW(h, w)); - CHECK(pool_layer != nullptr); + ICHECK(pool_layer != nullptr); params->outputs.push_back(pool_layer->getOutput(0)); } }; @@ -650,10 +650,10 @@ class UnaryOpConverter : public TensorRTOpConverter { #endif }; auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported unary type " << params->op_name; + ICHECK(it != op_map.end()) << "Unsupported unary type " << params->op_name; nvinfer1::IUnaryLayer* unary_layer = params->network->addUnary(*params->inputs.at(0).tensor, it->second); - CHECK(unary_layer != nullptr); + ICHECK(unary_layer != nullptr); params->outputs.push_back(unary_layer->getOutput(0)); } }; @@ -664,12 +664,12 @@ class ConcatOpConverter : public TensorRTOpConverter { void Convert(TensorRTOpConverterParams* params) const { const int num_inputs = params->inputs.size(); - CHECK_GT(num_inputs, 0); + ICHECK_GT(num_inputs, 0); const int input_rank = params->inputs[0].tensor->getDimensions().nbDims; std::vector input_tensors; for (auto input : params->inputs) { - CHECK(input.type == kTensor); - CHECK_EQ(input_rank, input.tensor->getDimensions().nbDims); + ICHECK(input.type == kTensor); + ICHECK_EQ(input_rank, input.tensor->getDimensions().nbDims); input_tensors.push_back(input.tensor); } @@ -678,7 +678,7 @@ class ConcatOpConverter : public TensorRTOpConverter { nvinfer1::IConcatenationLayer* concat_layer = params->network->addConcatenation(input_tensors.data(), input_tensors.size()); - CHECK(concat_layer != nullptr); + ICHECK(concat_layer != nullptr); concat_layer->setAxis(axis); params->outputs.push_back(concat_layer->getOutput(0)); } @@ -692,7 +692,7 @@ class BiasAddOpConverter : public TensorRTOpConverter { auto input_tensor = params->inputs.at(0).tensor; auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); const size_t required_rank = TRT_HAS_IMPLICIT_BATCH(params) ? 3 : 4; - CHECK(input_dims.size() > 0 && input_dims.size() <= required_rank); + ICHECK(input_dims.size() > 0 && input_dims.size() <= required_rank); const bool need_reshape_on_input = input_dims.size() != required_rank; if (need_reshape_on_input) { // Add dims of size 1 until rank is required_rank. @@ -705,7 +705,7 @@ class BiasAddOpConverter : public TensorRTOpConverter { nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IScaleLayer* scale_layer = params->network->addScale( *input_tensor, nvinfer1::ScaleMode::kCHANNEL, params->inputs.at(1).weight, shift, power); - CHECK(scale_layer != nullptr); + ICHECK(scale_layer != nullptr); auto output_tensor = scale_layer->getOutput(0); if (need_reshape_on_input) { // Remove added dims. 
@@ -722,12 +722,12 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter { void Convert(TensorRTOpConverterParams* params) const { auto input_tensor = params->inputs.at(0).tensor; auto weight_shape = params->inputs.at(1).weight_shape; - CHECK_EQ(params->node.GetAttr>("data_layout")[0], "NCHW"); - CHECK(params->node.GetAttr>("out_layout")[0] == "" || - params->node.GetAttr>("out_layout")[0] == "NCHW"); - CHECK_EQ(params->node.GetAttr>("kernel_layout")[0], "OIHW"); + ICHECK_EQ(params->node.GetAttr>("data_layout")[0], "NCHW"); + ICHECK(params->node.GetAttr>("out_layout")[0] == "" || + params->node.GetAttr>("out_layout")[0] == "NCHW"); + ICHECK_EQ(params->node.GetAttr>("kernel_layout")[0], "OIHW"); auto str_dilation = params->node.GetAttr>("dilation"); - CHECK(std::stoi(str_dilation[0]) == 1 && std::stoi(str_dilation[1]) == 1); + ICHECK(std::stoi(str_dilation[0]) == 1 && std::stoi(str_dilation[1]) == 1); auto str_strides = params->node.GetAttr>("strides"); auto str_padding = params->node.GetAttr>("padding"); auto str_output_padding = params->node.GetAttr>("output_padding"); @@ -741,7 +741,7 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter { #if !TRT_VERSION_GE(5, 1, 5) if (use_asymmetric_padding) { auto pad_layer = params->network->addPadding(*input_tensor, prepadding, postpadding); - CHECK(pad_layer != nullptr); + ICHECK(pad_layer != nullptr); input_tensor = pad_layer->getOutput(0); // No need for conv op to do any padding. use_asymmetric_padding = false; @@ -755,7 +755,7 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter { nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size, params->inputs.at(1).weight, bias); - CHECK(deconv_layer != nullptr); + ICHECK(deconv_layer != nullptr); if (use_asymmetric_padding) { #if TRT_VERSION_GE(5, 1, 5) deconv_layer->setPrePadding(prepadding); @@ -791,14 +791,14 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter { void Convert(TensorRTOpConverterParams* params) const { auto input_tensor = params->inputs.at(0).tensor; auto weight_shape = params->inputs.at(1).weight_shape; - CHECK_EQ(params->node.GetAttr>("data_layout")[0], "NCDHW"); - CHECK(params->node.GetAttr>("out_layout")[0] == "" || - params->node.GetAttr>("out_layout")[0] == "NCDHW"); - CHECK_EQ(params->node.GetAttr>("kernel_layout")[0], "OIDHW"); + ICHECK_EQ(params->node.GetAttr>("data_layout")[0], "NCDHW"); + ICHECK(params->node.GetAttr>("out_layout")[0] == "" || + params->node.GetAttr>("out_layout")[0] == "NCDHW"); + ICHECK_EQ(params->node.GetAttr>("kernel_layout")[0], "OIDHW"); auto str_dilation = params->node.GetAttr>("dilation"); - CHECK_EQ(str_dilation.size(), 3); - CHECK(std::stoi(str_dilation[0]) == 1 && std::stoi(str_dilation[1]) == 1 && - std::stoi(str_dilation[2]) == 1); + ICHECK_EQ(str_dilation.size(), 3); + ICHECK(std::stoi(str_dilation[0]) == 1 && std::stoi(str_dilation[1]) == 1 && + std::stoi(str_dilation[2]) == 1); auto str_strides = params->node.GetAttr>("strides"); auto str_padding = params->node.GetAttr>("padding"); auto str_output_padding = params->node.GetAttr>("output_padding"); @@ -813,14 +813,14 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter { nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size, params->inputs.at(1).weight, bias); - CHECK(deconv_layer != nullptr); + ICHECK(deconv_layer != nullptr); 
if (use_asymmetric_padding) { deconv_layer->setPrePadding(prepadding); deconv_layer->setPostPadding(postpadding); } else { deconv_layer->setPaddingNd(prepadding); } - CHECK_EQ(str_strides.size(), 3); + ICHECK_EQ(str_strides.size(), 3); const auto strides = nvinfer1::Dims3(std::stoi(str_strides[0]), std::stoi(str_strides[1]), std::stoi(str_strides[2])); deconv_layer->setStrideNd(strides); @@ -830,7 +830,7 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter { if (str_output_padding.size()) { GetPadding3D(str_output_padding, &use_asymmetric_padding, &prepadding, &postpadding); // Are any post-padding values non-zero? - CHECK(!std::any_of(postpadding.d, postpadding.d + postpadding.nbDims, [](int x) { + ICHECK(!std::any_of(postpadding.d, postpadding.d + postpadding.nbDims, [](int x) { return x != 0; })) << "TRT does not support padding on 3 dimensions."; } @@ -882,13 +882,13 @@ class ReshapeOpConverter : public TensorRTOpConverter { void Convert(TensorRTOpConverterParams* params) const { auto input = params->inputs.at(0).tensor; - CHECK_EQ(std::stoi(params->node.GetAttr>("reverse")[0]), false); + ICHECK_EQ(std::stoi(params->node.GetAttr>("reverse")[0]), false); auto str_newshape = params->node.GetAttr>("newshape"); std::vector new_shape; const int start_index = TRT_HAS_IMPLICIT_BATCH(params) ? 1 : 0; for (size_t i = start_index; i < str_newshape.size(); ++i) { const int value = std::stoi(str_newshape[i]); - CHECK_GE(value, -1); + ICHECK_GE(value, -1); new_shape.push_back(value); } params->outputs.push_back(Reshape(params, input, new_shape)); @@ -923,14 +923,14 @@ class ReduceOpConverter : public TensorRTOpConverter { {"min", nvinfer1::ReduceOperation::kMIN}, {"mean", nvinfer1::ReduceOperation::kAVG}}; auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported reduce type " << params->op_name; + ICHECK(it != op_map.end()) << "Unsupported reduce type " << params->op_name; auto input = params->inputs.at(0).tensor; - CHECK_EQ(std::stoi(params->node.GetAttr>("exclude")[0]), false); + ICHECK_EQ(std::stoi(params->node.GetAttr>("exclude")[0]), false); bool keepdims = std::stoi(params->node.GetAttr>("keepdims")[0]); auto str_axis = params->node.GetAttr>("axis"); // TODO(trevmorr): Support reduce to scalar. - CHECK_GT(str_axis.size(), 0); + ICHECK_GT(str_axis.size(), 0); uint32_t reduce_axes = 0; for (size_t i = 0; i < str_axis.size(); ++i) { const int axis = ConvertAxis(params, std::stoi(str_axis[i]), input->getDimensions().nbDims); @@ -982,8 +982,8 @@ class AdaptivePoolingOpConverter : public TensorRTOpConverter { {"nn.adaptive_max_pool2d", nvinfer1::PoolingType::kMAX}, {"nn.adaptive_avg_pool2d", nvinfer1::PoolingType::kAVERAGE}}; auto it = op_map.find(params->op_name); - CHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; - CHECK_EQ(params->node.GetAttr>("layout")[0], "NCHW"); + ICHECK(it != op_map.end()) << "Unsupported pooling type " << params->op_name << " in TensorRT"; + ICHECK_EQ(params->node.GetAttr>("layout")[0], "NCHW"); // This is an approximation of adaptive pooling. Results will not be // mathematically exact except when output_size is (1, 1). 
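// For instance (illustrative numbers): pooling a 7x7 input down to output_size
// (2, 2) gives stride = (3, 3) and window_size = (7 - 1 * 3, 7 - 1 * 3) = (4, 4),
// so neighboring windows overlap by one row/column rather than partitioning the
// input exactly.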
@@ -995,7 +995,7 @@ class AdaptivePoolingOpConverter : public TensorRTOpConverter { const auto window_size = nvinfer1::DimsHW(h - (output_size.h() - 1) * stride.h(), w - (output_size.w() - 1) * stride.w()); auto pool_layer = params->network->addPooling(*input_tensor, it->second, window_size); - CHECK(pool_layer != nullptr); + ICHECK(pool_layer != nullptr); pool_layer->setStride(stride); params->outputs.push_back(pool_layer->getOutput(0)); } diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index 72c025695f7d..f183e2f24449 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -73,7 +73,7 @@ class TensorRTRuntime : public JSONRuntimeBase { * \param consts The constant params from compiled model. */ void Init(const Array& consts) override { - CHECK_EQ(consts.size(), const_idx_.size()) + ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; LoadGlobalAttributes(); if (GetCachedEnginesFromDisk()) return; @@ -118,7 +118,7 @@ class TensorRTRuntime : public JSONRuntimeBase { uint32_t eid = EntryID(nid, j); const std::string name = nodes_[nid].GetOpName() + "_" + std::to_string(j); int binding_index = engine->getBindingIndex(name.c_str()); - CHECK_NE(binding_index, -1); + ICHECK_NE(binding_index, -1); bindings[binding_index] = data_entry_[eid]->data; } } @@ -128,18 +128,18 @@ class TensorRTRuntime : public JSONRuntimeBase { uint32_t eid = EntryID(outputs_[i]); const std::string& name = engine_and_context.outputs[i]; int binding_index = engine->getBindingIndex(name.c_str()); - CHECK_NE(binding_index, -1); + ICHECK_NE(binding_index, -1); bindings[binding_index] = data_entry_[eid]->data; } #if TRT_VERSION_GE(6, 0, 1) if (use_implicit_batch_) { - CHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed."; + ICHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed."; } else { - CHECK(context->executeV2(bindings.data())) << "Running TensorRT failed."; + ICHECK(context->executeV2(bindings.data())) << "Running TensorRT failed."; } #else - CHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed."; + ICHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed."; #endif } @@ -162,7 +162,7 @@ class TensorRTRuntime : public JSONRuntimeBase { if (node.GetOpType() == "input") { builder.AddInput(nid, node); } else { - CHECK_EQ(node.GetOpType(), "const"); + ICHECK_EQ(node.GetOpType(), "const"); uint32_t eid = EntryID(nid, 0); builder.AddConstant(nid, data_entry_[eid]); } diff --git a/src/runtime/contrib/tflite/tflite_runtime.cc b/src/runtime/contrib/tflite/tflite_runtime.cc index 8b34e90312b0..f56e62ec1a40 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.cc +++ b/src/runtime/contrib/tflite/tflite_runtime.cc @@ -117,7 +117,7 @@ void TFLiteRuntime::SetInput(int index, DLTensor* data_in) { TVM_DTYPE_DISPATCH(dtype, DType, { DType* dest = interpreter_->typed_input_tensor(index); DType* src = static_cast(data_in->data); - CHECK(data_in->strides == NULL); + ICHECK(data_in->strides == NULL); int64_t size = 1; for (int64_t i = 0; i < data_in->ndim; ++i) { size *= data_in->shape[i]; @@ -155,7 +155,7 @@ PackedFunc TFLiteRuntime::GetFunction(const std::string& name, if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { int in_idx = args[0]; - CHECK_GE(in_idx, 0); + ICHECK_GE(in_idx, 0); 
this->SetInput(in_idx, args[1]); }); } else if (name == "get_output") { diff --git a/src/runtime/contrib/tflite/tflite_runtime.h b/src/runtime/contrib/tflite/tflite_runtime.h index f3e3bd90bba4..ff0e6ab0db56 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.h +++ b/src/runtime/contrib/tflite/tflite_runtime.h @@ -37,7 +37,7 @@ namespace tvm { namespace runtime { -#define CHECK_TFLITE_STATUS(ret) CHECK_EQ(ret, kTfLiteOk) +#define CHECK_TFLITE_STATUS(ret) ICHECK_EQ(ret, kTfLiteOk) /*! * \brief Tflite runtime. diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu index c40235d7cc9e..2054db710b6d 100644 --- a/src/runtime/contrib/thrust/thrust.cu +++ b/src/runtime/contrib/thrust/thrust.cu @@ -130,7 +130,7 @@ void thrust_sort_common(DLTensor* input, TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort_nms") .set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_GE(args.num_args, 5); + ICHECK_GE(args.num_args, 5); DLTensor* input = args[0]; DLTensor* valid_count = args[1]; DLTensor* values_out = args[2]; @@ -149,7 +149,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort_nms") TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort") .set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_GE(args.num_args, 4); + ICHECK_GE(args.num_args, 4); DLTensor* input = args[0]; DLTensor* values_out = args[1]; DLTensor* indices_out = args[2]; diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index 5eeba29ee755..6cb8b820e6cc 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -20,10 +20,10 @@ /*! * \file cpu_device_api.cc */ -#include #include #include #include +#include #include #include diff --git a/src/runtime/cuda/cuda_common.h b/src/runtime/cuda/cuda_common.h index 25ff28a91a6c..471fefb230a1 100644 --- a/src/runtime/cuda/cuda_common.h +++ b/src/runtime/cuda/cuda_common.h @@ -44,10 +44,11 @@ namespace runtime { } \ } -#define CUDA_CALL(func) \ - { \ - cudaError_t e = (func); \ - CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) << "CUDA: " << cudaGetErrorString(e); \ +#define CUDA_CALL(func) \ + { \ + cudaError_t e = (func); \ + ICHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ + << "CUDA: " << cudaGetErrorString(e); \ } /*! 
\brief Thread local workspace */ diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index f7b88ccdd964..30abfc8dc559 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -107,7 +107,7 @@ class CUDADeviceAPI final : public DeviceAPI { } void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final { - CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; + ICHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; void* ret; if (ctx.device_type == kDLCPUPinned) { CUDA_CALL(cudaMallocHost(&ret, nbytes)); diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index c356897c8e90..a877bc634300 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -71,11 +71,11 @@ class CUDAModuleNode : public runtime::ModuleNode { std::string fmt = GetFileFormat(file_name, format); std::string meta_file = GetMetaFilePath(file_name); if (fmt == "cu") { - CHECK_NE(cuda_source_.length(), 0); + ICHECK_NE(cuda_source_.length(), 0); SaveMetaDataToFile(meta_file, fmap_); SaveBinaryToFile(file_name, cuda_source_); } else { - CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; + ICHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; SaveMetaDataToFile(meta_file, fmap_); SaveBinaryToFile(file_name, data_); } @@ -124,7 +124,7 @@ class CUDAModuleNode : public runtime::ModuleNode { size_t nbytes; CUresult result = cuModuleGetGlobal(&global, &nbytes, module_[device_id], global_name.c_str()); - CHECK_EQ(nbytes, expect_nbytes); + ICHECK_EQ(nbytes, expect_nbytes); if (result != CUDA_SUCCESS) { const char* msg; cuGetErrorName(result, &msg); @@ -232,8 +232,8 @@ class CUDAPrepGlobalBarrier { PackedFunc CUDAModuleNode::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { - CHECK_EQ(sptr_to_self.get(), this); - CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; + ICHECK_EQ(sptr_to_self.get(), this); + ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; if (name == symbol::tvm_prepare_global_barrier) { return PackedFunc(CUDAPrepGlobalBarrier(this, sptr_to_self)); } diff --git a/src/runtime/dso_library.cc b/src/runtime/dso_library.cc index 6d3eec402306..c439bde82497 100644 --- a/src/runtime/dso_library.cc +++ b/src/runtime/dso_library.cc @@ -63,7 +63,7 @@ class DSOLibrary final : public Library { // use wstring version that is needed by LLVM. 
std::wstring wname(name.begin(), name.end()); lib_handle_ = LoadLibraryW(wname.c_str()); - CHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name; + ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name; } void Unload() { @@ -76,8 +76,8 @@ class DSOLibrary final : public Library { // load the library void Load(const std::string& name) { lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); - CHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " " - << dlerror(); + ICHECK(lib_handle_ != nullptr) + << "Failed to load dynamic shared library " << name << " " << dlerror(); } void* GetSymbol_(const char* name) { return dlsym(lib_handle_, name); } diff --git a/src/runtime/file_utils.cc b/src/runtime/file_utils.cc index c3298d266cdd..42cbfdc3b1ed 100644 --- a/src/runtime/file_utils.cc +++ b/src/runtime/file_utils.cc @@ -23,8 +23,8 @@ #include "file_utils.h" #include -#include #include +#include #include #include @@ -114,7 +114,7 @@ std::string GetMetaFilePath(const std::string& file_name) { void LoadBinaryFromFile(const std::string& file_name, std::string* data) { std::ifstream fs(file_name, std::ios::in | std::ios::binary); - CHECK(!fs.fail()) << "Cannot open " << file_name; + ICHECK(!fs.fail()) << "Cannot open " << file_name; // get its size: fs.seekg(0, std::ios::end); size_t size = static_cast(fs.tellg()); @@ -125,7 +125,7 @@ void LoadBinaryFromFile(const std::string& file_name, std::string* data) { void SaveBinaryToFile(const std::string& file_name, const std::string& data) { std::ofstream fs(file_name, std::ios::out | std::ios::binary); - CHECK(!fs.fail()) << "Cannot open " << file_name; + ICHECK(!fs.fail()) << "Cannot open " << file_name; fs.write(&data[0], data.length()); } @@ -133,7 +133,7 @@ void SaveMetaDataToFile(const std::string& file_name, const std::unordered_map& fmap) { std::string version = "0.1.0"; std::ofstream fs(file_name.c_str()); - CHECK(!fs.fail()) << "Cannot open file " << file_name; + ICHECK(!fs.fail()) << "Cannot open file " << file_name; dmlc::JSONWriter writer(&fs); writer.BeginObject(); writer.WriteObjectKeyValue("tvm_version", version); @@ -145,7 +145,7 @@ void SaveMetaDataToFile(const std::string& file_name, void LoadMetaDataFromFile(const std::string& file_name, std::unordered_map* fmap) { std::ifstream fs(file_name.c_str()); - CHECK(!fs.fail()) << "Cannot open file " << file_name; + ICHECK(!fs.fail()) << "Cannot open file " << file_name; std::string version; dmlc::JSONReader reader(&fs); dmlc::JSONObjectReadHelper helper; diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index 5439be9109f9..3e9ff4f279e7 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -148,7 +148,7 @@ class GraphRuntimeDebug : public GraphRuntime { * \param data_out the node data. 
*/ void DebugGetNodeOutput(int index, DLTensor* data_out) { - CHECK_LT(static_cast(index), op_execs_.size()); + ICHECK_LT(static_cast(index), op_execs_.size()); uint32_t eid = index; for (size_t i = 0; i < op_execs_.size(); ++i) { @@ -185,9 +185,9 @@ PackedFunc GraphRuntimeDebug::GetFunction(const std::string& name, int number = args[0]; int repeat = args[1]; int min_repeat_ms = args[2]; - CHECK_GT(number, 0); - CHECK_GT(repeat, 0); - CHECK_GE(min_repeat_ms, 0); + ICHECK_GT(number, 0); + ICHECK_GT(repeat, 0); + ICHECK_GE(min_repeat_ms, 0); *rv = this->RunIndividual(number, repeat, min_repeat_ms); }); } else { @@ -209,9 +209,9 @@ Module GraphRuntimeDebugCreate(const std::string& sym_json, const tvm::runtime:: } TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create").set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " - "at least 4, but it has " - << args.num_args; + ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " + "at least 4, but it has " + << args.num_args; *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args)); }); } // namespace runtime diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 5eeca52b68f2..601c68abdf08 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -99,7 +99,7 @@ int GraphRuntime::GetInputIndex(const std::string& name) { * \param data_in The input data. */ void GraphRuntime::SetInput(int index, DLTensor* data_in) { - CHECK_LT(static_cast(index), input_nodes_.size()); + ICHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); data_entry_[eid].CopyFrom(data_in); } @@ -140,18 +140,18 @@ std::vector GraphRuntime::GetWeightNames() const { * \param data_ref The input data that is referred. */ void GraphRuntime::SetInputZeroCopy(int index, DLTensor* data_ref) { - CHECK_LT(static_cast(index), input_nodes_.size()); + ICHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); const DLTensor* old_t = data_entry_[eid].operator->(); // check the consistency of input - CHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*data_ref)); - CHECK_EQ(reinterpret_cast(data_ref->data) % kAllocAlignment, 0); - CHECK_EQ(old_t->ndim, static_cast(data_ref->ndim)); - CHECK_EQ(old_t->ctx.device_type, data_ref->ctx.device_type); - CHECK_EQ(old_t->ctx.device_id, data_ref->ctx.device_id); + ICHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*data_ref)); + ICHECK_EQ(reinterpret_cast(data_ref->data) % kAllocAlignment, 0); + ICHECK_EQ(old_t->ndim, static_cast(data_ref->ndim)); + ICHECK_EQ(old_t->ctx.device_type, data_ref->ctx.device_type); + ICHECK_EQ(old_t->ctx.device_id, data_ref->ctx.device_id); for (auto i = 0; i < data_ref->ndim; ++i) { - CHECK_EQ(old_t->shape[i], data_ref->shape[i]); + ICHECK_EQ(old_t->shape[i], data_ref->shape[i]); } // Update the data pointer for each argument of each op @@ -190,7 +190,7 @@ std::string GraphRuntime::GetOutputType(int index) const { * \return NDArray corresponding to given input node index. 
*/ NDArray GraphRuntime::GetInput(int index) const { - CHECK_LT(static_cast(index), input_nodes_.size()); + ICHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); return data_entry_[eid]; } @@ -201,7 +201,7 @@ NDArray GraphRuntime::GetInput(int index) const { * \return NDArray corresponding to given output node index. */ NDArray GraphRuntime::GetOutput(int index) const { - CHECK_LT(static_cast(index), outputs_.size()); + ICHECK_LT(static_cast(index), outputs_.size()); uint32_t eid = this->entry_id(outputs_[index]); return data_entry_[eid]; } @@ -211,14 +211,14 @@ NDArray GraphRuntime::GetOutput(int index) const { * \param data_out the output data. */ void GraphRuntime::CopyOutputTo(int index, DLTensor* data_out) { - CHECK_LT(static_cast(index), outputs_.size()); + ICHECK_LT(static_cast(index), outputs_.size()); uint32_t eid = this->entry_id(outputs_[index]); // Check the shapes to avoid receiving in different dimension but same size. const NDArray& data = data_entry_[eid]; - CHECK_EQ(data->ndim, data_out->ndim); + ICHECK_EQ(data->ndim, data_out->ndim); for (int32_t j = 0; j < data->ndim; ++j) { - CHECK_EQ(data->shape[j], data_out->shape[j]); + ICHECK_EQ(data->shape[j], data_out->shape[j]); } data_entry_[eid].CopyTo(data_out); @@ -235,14 +235,15 @@ void GraphRuntime::LoadParams(const std::string& param_blob) { void GraphRuntime::LoadParams(dmlc::Stream* strm) { uint64_t header, reserved; - CHECK(strm->Read(&header)) << "Invalid parameters file format"; - CHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; - CHECK(strm->Read(&reserved)) << "Invalid parameters file format"; - CHECK(strm->Read(&weight_names_)) << "Invalid parameters file format"; + ICHECK(strm->Read(&header)) << "Invalid parameters file format"; + ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; + ICHECK(strm->Read(&reserved)) << "Invalid parameters file format"; + + ICHECK(strm->Read(&weight_names_)) << "Invalid parameters file format"; uint64_t sz; strm->Read(&sz); size_t size = static_cast(sz); - CHECK(size == weight_names_.size()) << "Invalid parameters file format"; + ICHECK(size == weight_names_.size()) << "Invalid parameters file format"; for (size_t i = 0; i < size; ++i) { int in_idx = GetInputIndex(weight_names_[i]); if (in_idx < 0) { @@ -251,7 +252,7 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) { continue; } uint32_t eid = this->entry_id(input_nodes_[in_idx], 0); - CHECK_LT(eid, data_entry_.size()); + ICHECK_LT(eid, data_entry_.size()); // The data_entry is allocated on device, NDArray.load always load the array into CPU. 
NDArray temp; @@ -262,23 +263,23 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) { void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { uint64_t header, reserved; - CHECK(strm->Read(&header)) << "Invalid parameters file format"; - CHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; - CHECK(strm->Read(&reserved)) << "Invalid parameters file format"; + ICHECK(strm->Read(&header)) << "Invalid parameters file format"; + ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; + ICHECK(strm->Read(&reserved)) << "Invalid parameters file format"; std::vector names; - CHECK(strm->Read(&names)) << "Invalid parameters file format"; + ICHECK(strm->Read(&names)) << "Invalid parameters file format"; uint64_t sz; strm->Read(&sz); size_t size = static_cast(sz); - CHECK(size == names.size()) << "Invalid parameters file format"; + ICHECK(size == names.size()) << "Invalid parameters file format"; for (size_t i = 0; i < size; ++i) { int in_idx = GetInputIndex(names[i]); if (in_idx < 0) continue; uint32_t eid = this->entry_id(input_nodes_[in_idx], 0); - CHECK_LT(eid, data_entry_.size()); - CHECK_EQ(data_entry_[eid].use_count(), 1); + ICHECK_LT(eid, data_entry_.size()); + ICHECK_EQ(data_entry_[eid].use_count(), 1); data_entry_[eid] = other.GetInput(GetInputIndex(names[i])); - CHECK_GT(data_entry_[eid].use_count(), 1); + ICHECK_GT(data_entry_[eid].use_count(), 1); const DLTensor* tmp = data_entry_[eid].operator->(); data_alignment_[eid] = details::GetDataAlignment(*tmp); } @@ -306,17 +307,17 @@ void GraphRuntime::SetupStorage() { for (int64_t sz : attrs_.shape[i]) { size *= static_cast(sz); } - CHECK_GE(storage_id, 0) << "Do not support runtime shape op"; + ICHECK_GE(storage_id, 0) << "Do not support runtime shape op"; DLDataType t = vtype[i]; size_t bits = t.bits * t.lanes; - CHECK(bits % 8U == 0U || bits == 1U); + ICHECK(bits % 8U == 0U || bits == 1U); size_t bytes = ((bits + 7U) / 8U) * size; uint32_t sid = static_cast(storage_id); if (sid >= pool_entry.size()) { pool_entry.resize(sid + 1, {0, -1}); } else { - CHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type) + ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type) << "The same pool entry cannot be assigned to multiple devices"; } pool_entry[sid].size = std::max(pool_entry[sid].size, bytes); @@ -343,7 +344,7 @@ void GraphRuntime::SetupStorage() { data_alignment_.resize(num_node_entries()); for (size_t i = 0; i < data_entry_.size(); ++i) { int storage_id = attrs_.storage_id[i]; - CHECK_LT(static_cast(storage_id), storage_pool_.size()); + ICHECK_LT(static_cast(storage_id), storage_pool_.size()); data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); const DLTensor* tmp = data_entry_[i].operator->(); data_alignment_[i] = details::GetDataAlignment(*tmp); @@ -372,7 +373,7 @@ void GraphRuntime::SetupOpExecs() { uint32_t eid = this->entry_id(nid, index); args.push_back(*(data_entry_[eid].operator->())); } - CHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op"; + ICHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op"; std::shared_ptr op_args = nullptr; std::tie(op_execs_[nid], op_args) = CreateTVMOp(inode.param, args, inode.inputs.size()); @@ -425,7 +426,7 @@ std::pair, std::shared_ptr > GraphRu // Get compiled function from the module that contains both host and device // code. 
tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, true); - CHECK(pf != nullptr) << "no such function in module: " << param.func_name; + ICHECK(pf != nullptr) << "no such function in module: " << param.func_name; auto fexec = [arg_ptr, pf]() { TVMRetValue rv; @@ -492,7 +493,7 @@ PackedFunc GraphRuntime::GetFunction(const std::string& name, } else if (name == "share_params") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { const auto& module = args[0].operator Module(); - CHECK_EQ(module.operator->()->type_key(), "GraphRuntime"); + ICHECK_EQ(module.operator->()->type_key(), "GraphRuntime"); const auto& param_blob = args[1].operator std::string(); dmlc::MemoryStringStream strm(const_cast(¶m_blob)); this->ShareParams(dynamic_cast(*module.operator->()), &strm); @@ -529,9 +530,9 @@ std::vector GetAllContext(const TVMArgs& args) { // be passed in. The third one is the number of devices. // Eventually, we will only probably pass TVMContext for all the languages. TVM_REGISTER_GLOBAL("tvm.graph_runtime.create").set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " - "at least 4, but it has " - << args.num_args; + ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " + "at least 4, but it has " + << args.num_args; const auto& contexts = GetAllContext(args); *rv = GraphRuntimeCreate(args[0], args[1], contexts); }); diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index 34f34715d66f..c08f5e671a08 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -41,10 +41,10 @@ namespace tvm { namespace runtime { /*! \brief macro to do C API call */ -#define TVM_CCALL(func) \ - { \ - int ret = (func); \ - CHECK_EQ(ret, 0) << TVMGetLastError(); \ +#define TVM_CCALL(func) \ + { \ + int ret = (func); \ + ICHECK_EQ(ret, 0) << TVMGetLastError(); \ } /*! 
\brief Magic number for NDArray list file */ @@ -219,13 +219,13 @@ class TVM_DLL GraphRuntime : public ModuleNode { // JSON Loader void Load(dmlc::JSONReader* reader) { reader->BeginArray(); - CHECK(reader->NextArrayItem()) << "invalid json format"; + ICHECK(reader->NextArrayItem()) << "invalid json format"; reader->Read(&node_id); - CHECK(reader->NextArrayItem()) << "invalid json format"; + ICHECK(reader->NextArrayItem()) << "invalid json format"; reader->Read(&index); if (reader->NextArrayItem()) { reader->Read(&version); - CHECK(!reader->NextArrayItem()) << "invalid json format"; + ICHECK(!reader->NextArrayItem()) << "invalid json format"; } else { version = 0; } @@ -265,7 +265,7 @@ class TVM_DLL GraphRuntime : public ModuleNode { bitmask |= 8; } } - CHECK_EQ(bitmask, 1 | 2 | 4 | 8) << "invalid format"; + ICHECK_EQ(bitmask, 1 | 2 | 4 | 8) << "invalid format"; } // JSON Loader void Load(dmlc::JSONReader* reader) { @@ -290,7 +290,7 @@ class TVM_DLL GraphRuntime : public ModuleNode { LOG(FATAL) << "do not support key " << key; } } - CHECK_EQ(bitmask, 1 | 2 | 4) << "invalid format"; + ICHECK_EQ(bitmask, 1 | 2 | 4) << "invalid format"; } }; struct GraphAttr { @@ -307,58 +307,58 @@ class TVM_DLL GraphRuntime : public ModuleNode { while (reader->NextObjectItem(&key)) { if (key == "dltype") { reader->BeginArray(); - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); reader->Read(&type); - CHECK_EQ(type, "list_str"); - CHECK(reader->NextArrayItem()); + ICHECK_EQ(type, "list_str"); + ICHECK(reader->NextArrayItem()); reader->Read(&dltype); - CHECK(!reader->NextArrayItem()); + ICHECK(!reader->NextArrayItem()); bitmask |= 1; } else if (key == "storage_id") { reader->BeginArray(); - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); reader->Read(&type); - CHECK_EQ(type, "list_int"); - CHECK(reader->NextArrayItem()); + ICHECK_EQ(type, "list_int"); + ICHECK(reader->NextArrayItem()); reader->Read(&storage_id); - CHECK(!reader->NextArrayItem()); + ICHECK(!reader->NextArrayItem()); bitmask |= 2; } else if (key == "shape") { reader->BeginArray(); - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); reader->Read(&type); - CHECK_EQ(type, "list_shape"); - CHECK(reader->NextArrayItem()); + ICHECK_EQ(type, "list_shape"); + ICHECK(reader->NextArrayItem()); reader->Read(&shape); - CHECK(!reader->NextArrayItem()); + ICHECK(!reader->NextArrayItem()); bitmask |= 4; } else if (key == "device_index") { reader->BeginArray(); - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); reader->Read(&type); - CHECK_EQ(type, "list_int"); - CHECK(reader->NextArrayItem()); + ICHECK_EQ(type, "list_int"); + ICHECK(reader->NextArrayItem()); reader->Read(&device_index); - CHECK(!reader->NextArrayItem()); + ICHECK(!reader->NextArrayItem()); } else { reader->BeginArray(); - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); reader->Read(&type); if (type == "list_int") { - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); std::vector temp; reader->Read(&temp); } else if (type == "size_t") { - CHECK(reader->NextArrayItem()); + ICHECK(reader->NextArrayItem()); size_t temp; reader->Read(&temp); } else { LOG(FATAL) << "cannot skip graph attr " << key; } - CHECK(!reader->NextArrayItem()); + ICHECK(!reader->NextArrayItem()); } } - CHECK_EQ(bitmask, 1 | 2 | 4) << "invalid format"; + ICHECK_EQ(bitmask, 1 | 2 | 4) << "invalid format"; } }; // The graph attribute fields. 
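// The Load() above expects each graph attribute as a [type_tag, payload] pair;
// an illustrative (not real-model) example:
//   "attrs": {
//     "dltype": ["list_str", ["float32", "float32"]],
//     "storage_id": ["list_int", [0, 1]],
//     "shape": ["list_shape", [[1, 3, 224, 224], [1, 1000]]]
//   }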
@@ -388,7 +388,7 @@ class TVM_DLL GraphRuntime : public ModuleNode { LOG(FATAL) << "key " << key << " is not supported"; } } - CHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format"; + ICHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format"; } /*! \brief Setup the temporal storage */ void SetupStorage(); diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc index aa35afaf70f8..632a25c987bc 100644 --- a/src/runtime/graph/graph_runtime_factory.cc +++ b/src/runtime/graph/graph_runtime_factory.cc @@ -55,9 +55,9 @@ PackedFunc GraphRuntimeFactory::GetFunction( }); } else if (name == "debug_create") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.size(), 2); + ICHECK_GE(args.size(), 2); std::string module_name = args[0].operator String(); - CHECK(module_name == module_name_) << "Currently we only support single model for now."; + ICHECK(module_name == module_name_) << "Currently we only support single model for now."; std::vector contexts; for (int i = 1; i < args.num_args; ++i) { contexts.emplace_back(args[i].operator TVMContext()); @@ -86,7 +86,7 @@ void GraphRuntimeFactory::SaveToBinary(dmlc::Stream* stream) { arrays.emplace_back(const_cast(v.second.operator->())); } uint64_t sz = arrays.size(); - CHECK(sz == names.size()); + ICHECK(sz == names.size()); stream->Write(sz); stream->Write(names); for (size_t i = 0; i < sz; ++i) { @@ -105,8 +105,8 @@ Module GraphRuntimeFactory::RuntimeCreate(const std::vector& ctxs) { Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector& ctxs) { const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_runtime_debug.create"); - CHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_debug.create in registry. " - "Do you enable debug graph runtime build?"; + ICHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_debug.create in registry. " + "Do you enable debug graph runtime build?"; // Debug runtime create packed function will call GetAllContexs, so we unpack the ctxs. std::vector unpacked_ctxs; for (const auto& ctx : ctxs) { @@ -135,29 +135,29 @@ Module GraphRuntimeFactoryModuleLoadBinary(void* strm) { std::string graph_json; std::unordered_map params; std::string module_name; - CHECK(stream->Read(&graph_json)); + ICHECK(stream->Read(&graph_json)); uint64_t sz; - CHECK(stream->Read(&sz)); + ICHECK(stream->Read(&sz)); std::vector names; - CHECK(stream->Read(&names)); - CHECK(sz == names.size()); + ICHECK(stream->Read(&names)); + ICHECK(sz == names.size()); for (size_t i = 0; i < sz; ++i) { tvm::runtime::NDArray temp; temp.Load(stream); params[names[i]] = temp; } - CHECK(stream->Read(&module_name)); + ICHECK(stream->Read(&module_name)); auto exec = make_object(graph_json, params, module_name); return Module(exec); } TVM_REGISTER_GLOBAL("tvm.graph_runtime_factory.create").set_body([](TVMArgs args, TVMRetValue* rv) { - CHECK_GE(args.num_args, 3) << "The expected number of arguments for " - "graph_runtime_factory.create needs at least 3, " - "but it has " - << args.num_args; + ICHECK_GE(args.num_args, 3) << "The expected number of arguments for " + "graph_runtime_factory.create needs at least 3, " + "but it has " + << args.num_args; // The argument order is graph_json, module, module_name, params. 
- CHECK_EQ((args.size() - 3) % 2, 0); + ICHECK_EQ((args.size() - 3) % 2, 0); std::unordered_map params; for (size_t i = 3; i < static_cast(args.size()); i += 2) { std::string name = args[i].operator String(); diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index a89015707f99..605c55eb89b9 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -17,9 +17,9 @@ * under the License. */ -#include #include #include +#include #include #include @@ -60,12 +60,12 @@ inline void HexagonDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRe inline void* HexagonDeviceAPI::AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) { - CHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); return hexagon::Device::Global()->Alloc(nbytes, alignment); } inline void HexagonDeviceAPI::FreeDataSpace(TVMContext ctx, void* ptr) { - CHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); hexagon::Device::Global()->Free(ptr); } @@ -85,22 +85,22 @@ inline void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offse if (ctx_from.device_type == kDLCPU) { memmove(dst, src, num_bytes); } else if (static_cast(ctx_from.device_type) == kDLHexagon) { - CHECK(hexagon::Device::ValidateDeviceId(ctx_from.device_id)); - CHECK_EQ(ctx_from.device_id, ctx_to.device_id); - CHECK(Is32bit(dst) && Is32bit(src)); + ICHECK(hexagon::Device::ValidateDeviceId(ctx_from.device_id)); + ICHECK_EQ(ctx_from.device_id, ctx_to.device_id); + ICHECK(Is32bit(dst) && Is32bit(src)); hexagon::Device::Global()->CopyDeviceToDevice(dst, src, num_bytes); } } else { if (ctx_from.device_type == kDLCPU) { - CHECK_EQ(static_cast(ctx_to.device_type), kDLHexagon); - CHECK(Is32bit(dst)); - CHECK(hexagon::Device::ValidateDeviceId(ctx_to.device_id)); + ICHECK_EQ(static_cast(ctx_to.device_type), kDLHexagon); + ICHECK(Is32bit(dst)); + ICHECK(hexagon::Device::ValidateDeviceId(ctx_to.device_id)); hexagon::Device::Global()->CopyHostToDevice(dst, src, num_bytes); } else { - CHECK_EQ(static_cast(ctx_from.device_type), kDLHexagon); - CHECK_EQ(ctx_to.device_type, kDLCPU); - CHECK(Is32bit(src)); - CHECK(hexagon::Device::ValidateDeviceId(ctx_from.device_id)); + ICHECK_EQ(static_cast(ctx_from.device_type), kDLHexagon); + ICHECK_EQ(ctx_to.device_type, kDLCPU); + ICHECK(Is32bit(src)); + ICHECK(hexagon::Device::ValidateDeviceId(ctx_from.device_id)); hexagon::Device::Global()->CopyDeviceToHost(dst, src, num_bytes); } } @@ -109,7 +109,7 @@ inline void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offse inline void HexagonDeviceAPI::StreamSync(TVMContext ctx, TVMStreamHandle stream) {} inline void* HexagonDeviceAPI::AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint) { - CHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); if (type_hint.code == 100) { size_t align = std::min(nbytes, 2048lu); return hexagon::Device::Global()->AllocVtcm(nbytes, align); @@ -118,7 +118,7 @@ inline void* HexagonDeviceAPI::AllocWorkspace(TVMContext ctx, size_t nbytes, DLD } inline void HexagonDeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { - CHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); DeviceAPI::FreeWorkspace(ctx, ptr); } diff --git 
a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc index 305fd50cbdd5..994e24b99084 100644 --- a/src/runtime/hexagon/hexagon_module.cc +++ b/src/runtime/hexagon/hexagon_module.cc @@ -22,8 +22,8 @@ #ifdef __ANDROID__ #include #endif -#include #include +#include #include #include @@ -176,8 +176,8 @@ void ArgLayout::Push(uint32_t* v, unsigned t_size, unsigned t_align) { if (!InReg) { // Allocate on stack. - CHECK_EQ((t_align & (t_align - 1)), 0) << "Alignment should be a power of 2"; - CHECK_GE(t_align, 4) << "Alignment should be at least 4"; + ICHECK_EQ((t_align & (t_align - 1)), 0) << "Alignment should be a power of 2"; + ICHECK_GE(t_align, 4) << "Alignment should be at least 4"; // Round t_size up to a multiple of 4. unsigned s_size = Stack.size(); unsigned s_align = t_align / 4; // Alignment of T in words on the stack. @@ -223,18 +223,18 @@ class HexagonModuleNode final : public runtime::ModuleNode { std::string meta_file = GetMetaFilePath(file_name); SaveMetaDataToFile(meta_file, fmap_); std::string c = "cp " + data_ + " " + file_name; - CHECK(std::system(c.c_str()) == 0) << "Cannot create " + file_name; + ICHECK(std::system(c.c_str()) == 0) << "Cannot create " + file_name; } else if (fmt == "s" || fmt == "asm") { - CHECK(!asm_.empty()) << "Assembler source not available"; + ICHECK(!asm_.empty()) << "Assembler source not available"; SaveBinaryToFile(file_name, asm_); } else if (fmt == "o" || fmt == "obj") { - CHECK(!obj_.empty()) << "Object data not available"; + ICHECK(!obj_.empty()) << "Object data not available"; SaveBinaryToFile(file_name, obj_); } else if (fmt == "ll") { - CHECK(!ir_.empty()) << "LLVM IR source not available"; + ICHECK(!ir_.empty()) << "LLVM IR source not available"; SaveBinaryToFile(file_name, ir_); } else if (fmt == "bc") { - CHECK(!bc_.empty()) << "LLVM IR bitcode not available"; + ICHECK(!bc_.empty()) << "LLVM IR bitcode not available"; SaveBinaryToFile(file_name, bc_); } else { LOG(FATAL) << "HexagonModuleNode::SaveToFile: unhandled format `" << fmt << "'"; @@ -480,7 +480,7 @@ hexagon::ArgLayout HexagonModuleNode::BuildArgLayout(const TVMArgs& As) const { // types, so there is no way to tell if the value being passed needs // one or two registers. Assume that all integers are 32-bit, and // simply abort if the actual value does not fit. - CHECK_EQ(static_cast(A), static_cast(A)); + ICHECK_EQ(static_cast(A), static_cast(A)); Args.Push(static_cast(A)); break; // 64-bit values diff --git a/src/runtime/hexagon/hexagon_module.h b/src/runtime/hexagon/hexagon_module.h index b922b169bd61..e558997b7a4c 100644 --- a/src/runtime/hexagon/hexagon_module.h +++ b/src/runtime/hexagon/hexagon_module.h @@ -20,8 +20,8 @@ #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ #define TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ -#include #include +#include #include #include diff --git a/src/runtime/hexagon/sim/hexagon_device_sim.cc b/src/runtime/hexagon/sim/hexagon_device_sim.cc index 9ff5a0421d51..6cc7dcf3209f 100644 --- a/src/runtime/hexagon/sim/hexagon_device_sim.cc +++ b/src/runtime/hexagon/sim/hexagon_device_sim.cc @@ -17,12 +17,12 @@ * under the License. 
*/ -#include #include #include #include #include #include +#include #include #include @@ -107,7 +107,7 @@ struct non_const_str { } size_t size() const { return pointers_.size(); } operator char*() { - CHECK_EQ(pointers_.size(), 1); + ICHECK_EQ(pointers_.size(), 1); return pointers_[0]; } operator char* *() { return pointers_.data(); } @@ -394,17 +394,17 @@ decltype(HexagonSimulator::opt_map_) HexagonSimulator::opt_map_ = { {"--verbose", &HexagonSimulator::HandleVerbose}, }; -#define CHECKED_CALL(func, ...) \ - do { \ - HEXAPI_Status s = sim_->func(__VA_ARGS__); \ - CHECK_EQ(s, HEX_STAT_SUCCESS) << "HexagonSimulator: " #func " failed with code " \ - << HexagonSimulator::to_string(s); \ +#define CHECKED_CALL(func, ...) \ + do { \ + HEXAPI_Status s = sim_->func(__VA_ARGS__); \ + ICHECK_EQ(s, HEX_STAT_SUCCESS) \ + << "HexagonSimulator: " #func " failed with code " << HexagonSimulator::to_string(s); \ } while (false) inline HEX_VA_t HexagonSimulator::p2va(const void* p) { uintptr_t u = reinterpret_cast(p); HEX_VA_t va = static_cast(u); - CHECK_EQ(static_cast(va), u); + ICHECK_EQ(static_cast(va), u); return va; } @@ -425,13 +425,13 @@ template void HexagonSimulator::CopyNToV(HEX_VA_t dst, const void* host_src) { using src_uint_t = typename unalign::type>::type; auto* ps = reinterpret_cast(host_src); - CHECK_EQ(sim_->WriteVirtual(dst, -1u, N, ps->value), HEX_STAT_SUCCESS); + ICHECK_EQ(sim_->WriteVirtual(dst, -1u, N, ps->value), HEX_STAT_SUCCESS); } template void HexagonSimulator::CopyNFromV(void* host_dst, HEX_VA_t src) { typename uint::type v; - CHECK_EQ(sim_->ReadVirtual(src, -1u, N, &v), HEX_STAT_SUCCESS); + ICHECK_EQ(sim_->ReadVirtual(src, -1u, N, &v), HEX_STAT_SUCCESS); using dst_uint_t = typename unalign::type>::type; auto* pd = reinterpret_cast(host_dst); @@ -465,7 +465,7 @@ void HexagonSimulator::CopyToV(HEX_VA_t dst, const void* host_src, unsigned len) src++; len--; } - CHECK_EQ(len, 0); + ICHECK_EQ(len, 0); } void HexagonSimulator::CopyFromV(void* host_dst, HEX_VA_t src, unsigned len) { @@ -495,7 +495,7 @@ void HexagonSimulator::CopyFromV(void* host_dst, HEX_VA_t src, unsigned len) { src++; len--; } - CHECK_EQ(len, 0); + ICHECK_EQ(len, 0); } void HexagonSimulator::SendMsg(Message& m, const void* data, bool show_dbg) { @@ -504,13 +504,13 @@ void HexagonSimulator::SendMsg(Message& m, const void* data, bool show_dbg) { HEX_4u_t result; HEX_8u_t cycles0, cycles1; if (report_cycles) { - CHECK_EQ(sim_->GetSimulatedCycleCount(&cycles0), HEX_STAT_SUCCESS); + ICHECK_EQ(sim_->GetSimulatedCycleCount(&cycles0), HEX_STAT_SUCCESS); } core = sim_->Run(&result); - CHECK_EQ(core, HEX_CORE_BREAKPOINT); + ICHECK_EQ(core, HEX_CORE_BREAKPOINT); if (report_cycles) { - CHECK_EQ(sim_->GetSimulatedCycleCount(&cycles1), HEX_STAT_SUCCESS); + ICHECK_EQ(sim_->GetSimulatedCycleCount(&cycles1), HEX_STAT_SUCCESS); LOG(INFO) << "host: execution took " << (cycles1 - cycles0) << " cycles"; } }; @@ -522,8 +522,8 @@ void HexagonSimulator::SendMsg(Message& m, const void* data, bool show_dbg) { // Receive the acknowledgement with the address for the payload. CopyFromV(&r, message_buffer_v_, sizeof(r)); - CHECK_EQ(r.code, kMsgAck); - CHECK_GE(r.len, m.len); + ICHECK_EQ(r.code, kMsgAck); + ICHECK_GE(r.len, m.len); // Send the actual message. m.va = r.va; @@ -533,7 +533,7 @@ void HexagonSimulator::SendMsg(Message& m, const void* data, bool show_dbg) { // Receive the return data. 
CopyFromV(&m, message_buffer_v_, sizeof(m)); - CHECK_EQ(m.code, kNone); + ICHECK_EQ(m.code, kNone); } HexagonSimulator::HexagonSimulator(bool enable_queuing) { @@ -610,12 +610,12 @@ void* HexagonSimulator::Alloc(unsigned size, unsigned align) { MsgAlloc ma = {size, align}; SendMsg(m, &ma, true); - CHECK_EQ(sizeof(MsgPointer), m.len); + ICHECK_EQ(sizeof(MsgPointer), m.len); MsgPointer mp; CopyFromV(&mp, m.va, m.len); LOG(INFO) << "HexagonSimulator::Alloc -> " << std::hex << mp.va << std::dec; - CHECK_NE(mp.va, 0); + ICHECK_NE(mp.va, 0); return va2p(mp.va); } @@ -636,12 +636,12 @@ void* HexagonSimulator::AllocVtcm(unsigned size, unsigned align) { MsgAlloc ma = {size, align}; SendMsg(m, &ma, true); - CHECK_EQ(sizeof(MsgPointer), m.len); + ICHECK_EQ(sizeof(MsgPointer), m.len); MsgPointer mp; CopyFromV(&mp, m.va, m.len); LOG(INFO) << "HexagonSimulator::AllocVtcm -> " << std::hex << mp.va << std::dec; - CHECK_NE(mp.va, 0); + ICHECK_NE(mp.va, 0); return va2p(mp.va); } @@ -650,7 +650,7 @@ void HexagonSimulator::FreeVtcm(void* ptr) {} void HexagonSimulator::CopyDeviceToDevice(void* dst, const void* src, unsigned len) { LOG(INFO) << "HexagonSimulator::CopyDeviceToDevice(dst=" << std::hex << dst << ", src=" << src << ", len=" << std::dec << len << ')'; - CHECK(dst != nullptr && src != nullptr); + ICHECK(dst != nullptr && src != nullptr); Message m = {kCopy, sizeof(MsgCopy), 0u}; MsgCopy mc = {p2va(dst), p2va(src), len}; SendMsg(m, &mc, true); @@ -677,7 +677,7 @@ void* HexagonSimulator::Load(const std::string& data, const std::string& fmt) { Message m = {kLoad, static_cast(data.size() + 1), 0u}; SendMsg(m, data.c_str(), false); - CHECK_EQ(sizeof(MsgPointer), m.len); + ICHECK_EQ(sizeof(MsgPointer), m.len); MsgPointer mp; CopyFromV(&mp, m.va, sizeof(mp)); @@ -685,7 +685,7 @@ void* HexagonSimulator::Load(const std::string& data, const std::string& fmt) { } void HexagonSimulator::Unload(void* mod) { - CHECK(mod); + ICHECK(mod); Message m = {kUnload, sizeof(MsgPointer), 0u}; MsgPointer mp = {p2va(mod)}; SendMsg(m, &mp, false); @@ -696,7 +696,7 @@ void* HexagonSimulator::Resolve(const std::string& sym) { Message m = {kResolve, static_cast(sym.size() + 1), 0u}; SendMsg(m, sym.c_str(), true); - CHECK_EQ(sizeof(MsgPointer), m.len); + ICHECK_EQ(sizeof(MsgPointer), m.len); MsgPointer mp; CopyFromV(&mp, m.va, sizeof(mp)); @@ -717,7 +717,7 @@ void HexagonSimulator::Call(void* func, uint32_t* scalar, unsigned sc_num, uint3 // Copy the MsgCall contents into the data vector as a sequence of uints. MsgCall me = {p2va(func), sc_num, st_num}; - CHECK((is_multiple_of())); + ICHECK((is_multiple_of())); for (unsigned i = 0, e = sizeof(me) / sizeof(uint32_t); i != e; ++i) data.push_back(reinterpret_cast(&me)[i]); @@ -763,14 +763,14 @@ bool HexagonSimulator::Configure(string_list& opts) { LOG(FATAL) << "Unrecognized simulator option: " << key; // unreachable } - CHECK((this->*f->second)(opts)) << "error handling option: " << key; + ICHECK((this->*f->second)(opts)) << "error handling option: " << key; } // Check AHB. 
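Configure above looks each option key up in opt_map_, a table of pointer-to-member handlers, and ICHECKs the handler's boolean result. A minimal sketch of that dispatch shape; Config, HandleVerbose, and the option name here are illustrative only:

#include <cassert>
#include <iostream>
#include <map>
#include <string>

class Config {
 public:
  bool Handle(const std::string& key) {
    auto f = handlers_.find(key);
    if (f == handlers_.end()) {
      std::cerr << "Unrecognized option: " << key << "\n";
      return false;
    }
    bool ok = (this->*f->second)();         // call through the pointer-to-member
    assert(ok && "error handling option");  // mirrors the ICHECK above
    return ok;
  }

 private:
  bool HandleVerbose() {
    verbose_ = true;
    return true;
  }
  bool verbose_ = false;
  std::map<std::string, bool (Config::*)()> handlers_{
      {"--verbose", &Config::HandleVerbose}};
};

int main() { return Config().Handle("--verbose") ? 0 : 1; }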
if (ahb_.first.hasValue() && ahb_.second.hasValue()) {
 CHECKED_CALL(ConfigureAHB, *ahb_.first, *ahb_.second);
 } else {
- CHECK(!ahb_.first.hasValue() && !ahb_.second.hasValue())
+ ICHECK(!ahb_.first.hasValue() && !ahb_.second.hasValue())
 << "HexagonSimulator: please specify both low and high addresses "
 "for AHB";
 }
@@ -779,7 +779,7 @@ bool HexagonSimulator::Configure(string_list& opts) {
 if (axi2_.first.hasValue() && axi2_.second.hasValue()) {
 CHECKED_CALL(ConfigureAXI2, *axi2_.first, *axi2_.second);
 } else {
- CHECK(!axi2_.first.hasValue() && !axi2_.second.hasValue())
+ ICHECK(!axi2_.first.hasValue() && !axi2_.second.hasValue())
 << "HexagonSimulator: please specify both low and high addresses "
 "for AXI2";
 }
@@ -806,7 +806,7 @@ bool HexagonSimulator::HandleAHBBusRatio(string_list& rest) {
 bool HexagonSimulator::HandleAHBHighAddr(string_list& rest) {
 auto addr = detail::to_uint(detail::pop_front(rest));
- CHECK(addr) << "HexagonSimulator: invalid value for AHB high adddress";
+ ICHECK(addr) << "HexagonSimulator: invalid value for AHB high address";
 if (addr) {
 ahb_.second = *addr;
 }
@@ -815,7 +815,7 @@ bool HexagonSimulator::HandleAHBLowAddr(string_list& rest) {
 auto addr = detail::to_uint(detail::pop_front(rest));
- CHECK(addr) << "HexagonSimulator: invalid value for AHB low adddress";
+ ICHECK(addr) << "HexagonSimulator: invalid value for AHB low address";
 if (addr) {
 ahb_.first = *addr;
 }
@@ -841,7 +841,7 @@ bool HexagonSimulator::HandleAXI2BusRatio(string_list& rest) {
 bool HexagonSimulator::HandleAXI2HighAddr(string_list& rest) {
 auto addr = detail::to_uint(detail::pop_front(rest));
- CHECK(addr) << "HexagonSimulator: invalid value for AXI2 high adddress";
+ ICHECK(addr) << "HexagonSimulator: invalid value for AXI2 high address";
 if (addr) {
 axi2_.second = *addr;
 }
@@ -850,7 +850,7 @@ bool HexagonSimulator::HandleAXI2LowAddr(string_list& rest) {
 auto addr = detail::to_uint(detail::pop_front(rest));
- CHECK(addr) << "HexagonSimulator: invalid value for AXI2 low adddress";
+ ICHECK(addr) << "HexagonSimulator: invalid value for AXI2 low address";
 if (addr) {
 axi2_.first = *addr;
 }
diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc
index bf10feb652cd..d494db82e2c7 100644
--- a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc
+++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc
@@ -21,8 +21,8 @@
#include "hexagon_dsprpcapi.h"
#include
-#include
#include
+#include
#include "hexagon_target_log.h"
@@ -32,7 +32,7 @@ namespace runtime { namespace hexagon {
DspRpcAPI::DspRpcAPI() {
- CHECK(lib_handle_ = dlopen(rpc_lib_name_, RTLD_LAZY | RTLD_LOCAL));
+ ICHECK(lib_handle_ = dlopen(rpc_lib_name_, RTLD_LAZY | RTLD_LOCAL));
#define RESOLVE(n) n##_ = GetSymbol(#n)
RESOLVE(remote_handle_close);
diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.h b/src/runtime/hexagon/target/hexagon_dsprpcapi.h
index ca812e6c2f1f..c0e40805ecbf 100644
--- a/src/runtime/hexagon/target/hexagon_dsprpcapi.h
+++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.h
@@ -21,8 +21,8 @@
#define TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_DSPRPCAPI_H_
#ifdef __ANDROID__
-#include
#include
+#include
#include "remote.h"
#include "remote64.h"
@@ -109,7 +109,7 @@ class DspRpcAPI {
#define DECLFUNC(fn) \
 fn##_t* fn##_ptr(bool allow_nullptr = false) const { \
- if (!allow_nullptr) CHECK(fn##_ != nullptr); \
+ if (!allow_nullptr) ICHECK(fn##_ != nullptr);
\ return fn##_; \ } DECLFUNC(remote_handle_close) diff --git a/src/runtime/hexagon/target/hexagon_stubapi.cc b/src/runtime/hexagon/target/hexagon_stubapi.cc index 2ed33471b98f..5428ae7c1cff 100644 --- a/src/runtime/hexagon/target/hexagon_stubapi.cc +++ b/src/runtime/hexagon/target/hexagon_stubapi.cc @@ -21,9 +21,9 @@ #include "hexagon_stubapi.h" #include -#include #include #include +#include #include "hexagon_target_log.h" @@ -45,7 +45,7 @@ StubAPI::StubAPI() { constexpr auto nondomain_lib_name = "libtvm_remote_nd_stub.so"; const char* lib_name = enable_domains_ ? domain_lib_name : nondomain_lib_name; - CHECK(lib_handle_ = dlopen(lib_name, RTLD_LAZY | RTLD_LOCAL)); + ICHECK(lib_handle_ = dlopen(lib_name, RTLD_LAZY | RTLD_LOCAL)); #define RESOLVE(fn) p##fn##_ = GetSymbol(#fn) if (enable_domains_) { diff --git a/src/runtime/hexagon/target/hexagon_stubapi.h b/src/runtime/hexagon/target/hexagon_stubapi.h index 5213b6d0d7af..cc5b7b7413ca 100644 --- a/src/runtime/hexagon/target/hexagon_stubapi.h +++ b/src/runtime/hexagon/target/hexagon_stubapi.h @@ -22,9 +22,9 @@ #ifdef __ANDROID__ #include -#include #include #include +#include #include diff --git a/src/runtime/library_module.cc b/src/runtime/library_module.cc index a5935491fcd7..30ef2141c508 100644 --- a/src/runtime/library_module.cc +++ b/src/runtime/library_module.cc @@ -46,7 +46,7 @@ class LibraryModuleNode final : public ModuleNode { if (name == runtime::symbol::tvm_module_main) { const char* entry_name = reinterpret_cast(lib_->GetSymbol(runtime::symbol::tvm_module_main)); - CHECK(entry_name != nullptr) + ICHECK(entry_name != nullptr) << "Symbol " << runtime::symbol::tvm_module_main << " is not presented"; faddr = reinterpret_cast(lib_->GetSymbol(entry_name)); } else { @@ -75,7 +75,7 @@ PackedFunc WrapPackedFunc(TVMBackendPackedCFunc faddr, const ObjectPtr& int ret_type_code = kTVMNullptr; int ret = (*faddr)(const_cast(args.values), const_cast(args.type_codes), args.num_args, &ret_value, &ret_type_code, nullptr); - CHECK_EQ(ret, 0) << TVMGetLastError(); + ICHECK_EQ(ret, 0) << TVMGetLastError(); if (ret_type_code != kTVMNullptr) { *rv = TVMRetValue::MoveFromCHost(ret_value, ret_type_code); } @@ -107,7 +107,7 @@ void InitContextFunctions(std::function fgetsymbol) { * \return Root Module. 
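Both DspRpcAPI and StubAPI above combine assignment and validation in one ICHECK(lib_handle_ = dlopen(...)), then resolve every entry point through a RESOLVE/DECLFUNC macro that insists the symbol is non-null. The same idiom with plain dlfcn calls; libfoo.so and foo_init are hypothetical names, not part of this patch:

#include <dlfcn.h>

#include <cassert>

using foo_init_t = int (*)();

int main() {
  // The patch's ICHECK(lib_handle_ = dlopen(...)) assigns and validates in one step.
  void* handle = dlopen("libfoo.so", RTLD_LAZY | RTLD_LOCAL);
  assert(handle != nullptr);

  // RESOLVE(n) / DECLFUNC(fn) reduce to: look the symbol up, insist it exists.
  auto* foo_init = reinterpret_cast<foo_init_t>(dlsym(handle, "foo_init"));
  assert(foo_init != nullptr);  // mirrors ICHECK(fn##_ != nullptr)

  int rc = foo_init();
  dlclose(handle);
  return rc;
}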
*/ runtime::Module ProcessModuleBlob(const char* mblob, ObjectPtr lib) { - CHECK(mblob != nullptr); + ICHECK(mblob != nullptr); uint64_t nbytes = 0; for (size_t i = 0; i < sizeof(nbytes); ++i) { uint64_t c = mblob[i]; @@ -117,21 +117,21 @@ runtime::Module ProcessModuleBlob(const char* mblob, ObjectPtr lib) { static_cast(nbytes)); dmlc::Stream* stream = &fs; uint64_t size; - CHECK(stream->Read(&size)); + ICHECK(stream->Read(&size)); std::vector modules; std::vector import_tree_row_ptr; std::vector import_tree_child_indices; for (uint64_t i = 0; i < size; ++i) { std::string tkey; - CHECK(stream->Read(&tkey)); + ICHECK(stream->Read(&tkey)); // Currently, _lib is for DSOModule, but we // don't have loadbinary function for it currently if (tkey == "_lib") { auto dso_module = Module(make_object(lib)); modules.emplace_back(dso_module); } else if (tkey == "_import_tree") { - CHECK(stream->Read(&import_tree_row_ptr)); - CHECK(stream->Read(&import_tree_child_indices)); + ICHECK(stream->Read(&import_tree_row_ptr)); + ICHECK(stream->Read(&import_tree_child_indices)); } else { std::string loadkey = "runtime.module.loadbinary_"; std::string fkey = loadkey + tkey; @@ -146,7 +146,7 @@ runtime::Module ProcessModuleBlob(const char* mblob, ObjectPtr lib) { loaders += name.substr(loadkey.size()); } } - CHECK(f != nullptr) + ICHECK(f != nullptr) << "Binary was created using " << tkey << " but a loader of that name is not registered. Available loaders are " << loaders << ". Perhaps you need to recompile with this runtime enabled."; @@ -169,12 +169,12 @@ runtime::Module ProcessModuleBlob(const char* mblob, ObjectPtr lib) { for (size_t j = import_tree_row_ptr[i]; j < import_tree_row_ptr[i + 1]; ++j) { auto module_import_addr = ModuleInternal::GetImportsAddr(modules[i].operator->()); auto child_index = import_tree_child_indices[j]; - CHECK(child_index < modules.size()); + ICHECK(child_index < modules.size()); module_import_addr->emplace_back(modules[child_index]); } } } - CHECK(!modules.empty()); + ICHECK(!modules.empty()); // invariance: root module is always at location 0. // The module order is collected via DFS return modules[0]; diff --git a/src/runtime/metadata_module.cc b/src/runtime/metadata_module.cc index 56f894c46906..acef9d4736fd 100644 --- a/src/runtime/metadata_module.cc +++ b/src/runtime/metadata_module.cc @@ -69,7 +69,7 @@ class MetadataModuleNode : public ModuleNode { // Run the module. // Normally we would only have a limited number of submodules. The runtime // symobl lookup overhead should be minimal. - CHECK(!this->imports().empty()); + ICHECK(!this->imports().empty()); for (Module it : this->imports()) { PackedFunc pf = it.GetFunction(name); if (pf != nullptr) return pf; @@ -86,10 +86,10 @@ class MetadataModuleNode : public ModuleNode { */ Array GetRequiredMetadata(const std::string& symbol) { Array ret; - CHECK_GT(sym_vars_.count(symbol), 0U) << "No symbol is recorded for " << symbol; + ICHECK_GT(sym_vars_.count(symbol), 0U) << "No symbol is recorded for " << symbol; std::vector vars = sym_vars_[symbol]; for (const auto& it : vars) { - CHECK_GT(metadata_.count(it), 0U) << "Found not recorded constant variable: " << it; + ICHECK_GT(metadata_.count(it), 0U) << "Found not recorded constant variable: " << it; ret.push_back(metadata_[it]); } return ret; @@ -119,7 +119,7 @@ class MetadataModuleNode : public ModuleNode { // Initialize the module with metadata. int ret = init(md); // Report the error if initialization is failed. 
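The _import_tree branch above stores the module import graph in CSR form: import_tree_row_ptr[i] .. import_tree_row_ptr[i + 1] delimits the child indices of module i. A sketch of the same walk over plain vectors, assuming the two arrays were already deserialized:

#include <cassert>
#include <cstdint>
#include <vector>

// row_ptr has one entry per module plus a trailing sentinel; the children of
// module i are child_indices[row_ptr[i] .. row_ptr[i + 1]).
void WireImports(const std::vector<uint64_t>& row_ptr,
                 const std::vector<uint64_t>& child_indices,
                 std::vector<std::vector<size_t>>* imports) {
  assert(!row_ptr.empty());
  imports->resize(row_ptr.size() - 1);
  for (size_t i = 0; i + 1 < row_ptr.size(); ++i) {
    for (uint64_t j = row_ptr[i]; j < row_ptr[i + 1]; ++j) {
      size_t child = static_cast<size_t>(child_indices[j]);
      assert(child < imports->size());  // mirrors ICHECK(child_index < modules.size())
      (*imports)[i].push_back(child);
    }
  }
}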
- CHECK_EQ(ret, 0) << TVMGetLastError(); + ICHECK_EQ(ret, 0) << TVMGetLastError(); break; } } @@ -164,10 +164,10 @@ class MetadataModuleNode : public ModuleNode { // Load the variables. std::vector variables; - CHECK(stream->Read(&variables)) << "Loading variables failed"; + ICHECK(stream->Read(&variables)) << "Loading variables failed"; uint64_t sz; - CHECK(stream->Read(&sz, sizeof(sz))) << "Loading metadata size failed"; - CHECK_EQ(static_cast(sz), variables.size()) + ICHECK(stream->Read(&sz, sizeof(sz))) << "Loading metadata size failed"; + ICHECK_EQ(static_cast(sz), variables.size()) << "The number of variables and ndarray counts must match"; // Load the list of ndarray. std::vector arrays; @@ -179,19 +179,19 @@ class MetadataModuleNode : public ModuleNode { std::unordered_map metadata; for (uint64_t i = 0; i < sz; i++) { - CHECK_EQ(metadata.count(variables[i]), 0U); + ICHECK_EQ(metadata.count(variables[i]), 0U); metadata[variables[i]] = arrays[i]; } // Load the symbol to list of required constant variables mapping std::vector symbols; - CHECK(stream->Read(&symbols)) << "Loading symbols failed"; - CHECK(stream->Read(&sz, sizeof(sz))) << "Loading number of symbols failed"; - CHECK_EQ(static_cast(sz), symbols.size()); + ICHECK(stream->Read(&symbols)) << "Loading symbols failed"; + ICHECK(stream->Read(&sz, sizeof(sz))) << "Loading number of symbols failed"; + ICHECK_EQ(static_cast(sz), symbols.size()); std::vector> const_vars; for (uint64_t i = 0; i < sz; i++) { std::vector vars; - CHECK(stream->Read(&vars)) << "Loading const variables failed"; + ICHECK(stream->Read(&vars)) << "Loading const variables failed"; const_vars.push_back(vars); } diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h index 634ee305153b..d13ac7e78982 100644 --- a/src/runtime/metal/metal_common.h +++ b/src/runtime/metal/metal_common.h @@ -30,10 +30,10 @@ #import #import #import -#include #include #include #include +#include #include #include @@ -64,15 +64,15 @@ class MetalWorkspace final : public DeviceAPI { ~MetalWorkspace(); // Get command queue for given context. 
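LoadFromBinary above treats every stream read as fallible and cross-checks the counts it reads against each other. The same defensive shape over a plain std::istream so the sketch stays self-contained; the length-prefixed layout here is illustrative, not the actual metadata format:

#include <cassert>
#include <cstdint>
#include <istream>
#include <string>
#include <vector>

// Read a length-prefixed list of strings; every read is checked, and the
// declared count must match what was actually stored.
bool ReadStrings(std::istream& is, std::vector<std::string>* out) {
  uint64_t n = 0;
  if (!is.read(reinterpret_cast<char*>(&n), sizeof(n))) return false;
  for (uint64_t i = 0; i < n; ++i) {
    uint64_t len = 0;
    if (!is.read(reinterpret_cast<char*>(&len), sizeof(len))) return false;
    std::string s(len, '\0');
    if (!is.read(&s[0], static_cast<std::streamsize>(len))) return false;
    out->push_back(std::move(s));
  }
  assert(out->size() == n);  // mirrors ICHECK_EQ(sz, variables.size())
  return true;
}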
id GetCommandQueue(TVMContext ctx) { - CHECK_EQ(ctx.device_type, kDLMetal); - CHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) + ICHECK_EQ(ctx.device_type, kDLMetal); + ICHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) << "Invalid Metal device_id=" << ctx.device_id; return queues[ctx.device_id]; } // Get device for given context id GetDevice(TVMContext ctx) { - CHECK_EQ(ctx.device_type, kDLMetal); - CHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < devices.size()) + ICHECK_EQ(ctx.device_type, kDLMetal); + ICHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < devices.size()) << "Invalid Metal device_id=" << ctx.device_id; return devices[ctx.device_id]; } diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index 79007394b18f..0169a4c2ec28 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -43,7 +43,7 @@ *rv = int(index < devices.size()); return; } - CHECK_LT(index, devices.size()) << "Invalid device id " << index; + ICHECK_LT(index, devices.size()) << "Invalid device id " << index; switch (kind) { case kMaxThreadsPerBlock: { *rv = static_cast([devices[ctx.device_id] maxThreadsPerThreadgroup].width); @@ -101,11 +101,11 @@ int GetWarpSize(id dev) { id lib = [dev newLibraryWithSource:[NSString stringWithUTF8String:kDummyKernel] options:nil error:&error_msg]; - CHECK(lib != nil) << [[error_msg localizedDescription] UTF8String]; + ICHECK(lib != nil) << [[error_msg localizedDescription] UTF8String]; id f = [lib newFunctionWithName:[NSString stringWithUTF8String:"CopyKernel"]]; - CHECK(f != nil); + ICHECK(f != nil); id state = [dev newComputePipelineStateWithFunction:f error:&error_msg]; - CHECK(state != nil) << [[error_msg localizedDescription] UTF8String]; + ICHECK(state != nil) << [[error_msg localizedDescription] UTF8String]; return static_cast(state.threadExecutionWidth); } @@ -159,7 +159,7 @@ int GetWarpSize(id dev) { #endif */ id buf = [dev newBufferWithLength:nbytes options:storage_mode]; - CHECK(buf != nil); + ICHECK(buf != nil); return (void*)(CFBridgingRetain(buf)); } @@ -176,7 +176,7 @@ int GetWarpSize(id dev) { TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) { this->Init(); - CHECK(stream == nullptr); + ICHECK(stream == nullptr); TVMContext ctx = ctx_from; if (ctx_from.device_type == kDLCPU) ctx = ctx_to; id queue = GetCommandQueue(ctx); @@ -185,7 +185,7 @@ int GetWarpSize(id dev) { int to_dev_type = static_cast(ctx_to.device_type); if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) { - CHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy."; + ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy."; id encoder = [cb blitCommandEncoder]; [encoder copyFromBuffer:(__bridge id)(from) sourceOffset:from_offset @@ -237,7 +237,7 @@ int GetWarpSize(id dev) { } void MetalWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) { - CHECK(stream == nullptr); + ICHECK(stream == nullptr); // commit an empty command buffer and wait until it completes. 
id queue = GetCommandQueue(ctx); id cb = [queue commandBuffer]; diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index 2d3a901c8524..7d46811fe78d 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -50,7 +50,7 @@ explicit MetalModuleNode(std::string data, std::string fmt, void SaveToFile(const std::string& file_name, const std::string& format) final { std::string fmt = GetFileFormat(file_name, format); - CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; + ICHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; std::string meta_file = GetMetaFilePath(file_name); SaveMetaDataToFile(meta_file, fmap_); SaveBinaryToFile(file_name, data_); @@ -74,7 +74,7 @@ void SaveToBinary(dmlc::Stream* stream) final { // get a from primary context in device_id id GetPipelineState(size_t device_id, const std::string& func_name) { metal::MetalWorkspace* w = metal::MetalWorkspace::Global(); - CHECK_LT(device_id, w->devices.size()); + ICHECK_LT(device_id, w->devices.size()); // start lock scope. std::lock_guard lock(mutex_); if (finfo_.size() <= device_id) { @@ -118,16 +118,16 @@ void SaveToBinary(dmlc::Stream* stream) final { } id f = [e.lib newFunctionWithName:[NSString stringWithUTF8String:func_name.c_str()]]; - CHECK(f != nil) << "cannot find function " << func_name; + ICHECK(f != nil) << "cannot find function " << func_name; id state = [w->devices[device_id] newComputePipelineStateWithFunction:f error:&err_msg]; - CHECK(state != nil) << "cannot get state:" - << " for function " << func_name - << [[err_msg localizedDescription] UTF8String]; + ICHECK(state != nil) << "cannot get state:" + << " for function " << func_name + << [[err_msg localizedDescription] UTF8String]; // The state.threadExecutionWidth can change dynamically according // to the resource constraint in kernel, so it is not strictly hold // Turn of warp aware optimziation for now. 
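GetPipelineState in this hunk compiles each kernel once per device and caches the pipeline state in smap under a mutex. The caching skeleton with the Metal API replaced by a placeholder Compile, so the sketch builds anywhere:

#include <mutex>
#include <string>
#include <unordered_map>

struct Pipeline {
  std::string compiled;  // stand-in for id<MTLComputePipelineState>
};

class PipelineCache {
 public:
  // Compile-once, cache-forever lookup guarded by a mutex, as in the patch.
  const Pipeline& Get(const std::string& func_name) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = smap_.find(func_name);
    if (it != smap_.end()) return it->second;
    return smap_.emplace(func_name, Compile(func_name)).first->second;
  }

 private:
  static Pipeline Compile(const std::string& name) { return {"binary for " + name}; }
  std::mutex mutex_;
  std::unordered_map<std::string, Pipeline> smap_;
};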
- // CHECK_EQ(state.threadExecutionWidth, w->warp_size[device_id]); + // ICHECK_EQ(state.threadExecutionWidth, w->warp_size[device_id]); e.smap[func_name] = [state retain]; return state; } @@ -231,8 +231,8 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const PackedFunc MetalModuleNode::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { - CHECK_EQ(sptr_to_self.get(), this); - CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; + ICHECK_EQ(sptr_to_self.get(), this); + ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; auto it = fmap_.find(name); if (it == fmap_.end()) return PackedFunc(); const FunctionInfo& info = it->second; diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 1b12d8f341ff..ea7682d3de57 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -23,10 +23,10 @@ #include "micro_session.h" -#include #include #include #include +#include #include #include @@ -106,7 +106,7 @@ class MicroTransportChannel : public RPCChannel { int unframer_error = unframer_.Write((const uint8_t*)pending_chunk_.data(), pending_chunk_.size(), &bytes_consumed); - CHECK(bytes_consumed <= pending_chunk_.size()) + ICHECK(bytes_consumed <= pending_chunk_.size()) << "consumed " << bytes_consumed << " want <= " << pending_chunk_.size(); pending_chunk_ = pending_chunk_.substr(bytes_consumed); bytes_received += bytes_consumed; @@ -138,7 +138,7 @@ class MicroTransportChannel : public RPCChannel { } bool StartSession() { - CHECK(state_ == State::kReset) + ICHECK(state_ == State::kReset) << "MicroSession: state_: expected kReset, got " << uint8_t(state_); ::std::chrono::steady_clock::time_point start_time = ::std::chrono::steady_clock::now(); @@ -151,8 +151,8 @@ class MicroTransportChannel : public RPCChannel { end_time = session_start_end_time; } while (!session_.IsEstablished()) { - CHECK_EQ(kTvmErrorNoError, session_.Initialize()); - CHECK_EQ(kTvmErrorNoError, session_.StartSession()); + ICHECK_EQ(kTvmErrorNoError, session_.Initialize()); + ICHECK_EQ(kTvmErrorNoError, session_.StartSession()); ::std::chrono::microseconds time_remaining = ::std::max( ::std::chrono::microseconds{0}, ::std::chrono::duration_cast<::std::chrono::microseconds>( @@ -176,7 +176,7 @@ class MicroTransportChannel : public RPCChannel { size_t Send(const void* data, size_t size) override { const uint8_t* data_bytes = static_cast(data); tvm_crt_error_t err = session_.SendMessage(MessageType::kNormal, data_bytes, size); - CHECK(err == kTvmErrorNoError) << "SendMessage returned " << err; + ICHECK(err == kTvmErrorNoError) << "SendMessage returned " << err; return size; } @@ -191,7 +191,7 @@ class MicroTransportChannel : public RPCChannel { session_.ClearReceiveBuffer(); } if (num_bytes_recv == size) { - CHECK(message_buffer_ == nullptr || message_buffer_->ReadAvailable() > 0); + ICHECK(message_buffer_ == nullptr || message_buffer_->ReadAvailable() > 0); return num_bytes_recv; } } @@ -256,7 +256,7 @@ class MicroTransportChannel : public RPCChannel { return; } - CHECK_EQ(buf->Read(message, sizeof(message) - 1), message_size_bytes); + ICHECK_EQ(buf->Read(message, sizeof(message) - 1), message_size_bytes); message[message_size_bytes] = 0; LOG(INFO) << "remote: " << message; session_.ClearReceiveBuffer(); @@ -316,5 +316,5 @@ void TVMLogf(const char* fmt, ...) 
{ LOG(INFO) << msg_buf; } -void TVMPlatformAbort(int error_code) { CHECK(false) << "TVMPlatformAbort: " << error_code; } +void TVMPlatformAbort(int error_code) { ICHECK(false) << "TVMPlatformAbort: " << error_code; } } diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h index 565f92ad59be..62f7236b8e2a 100644 --- a/src/runtime/minrpc/minrpc_server.h +++ b/src/runtime/minrpc/minrpc_server.h @@ -46,7 +46,7 @@ #endif #if TVM_MINRPC_ENABLE_LOGGING -#include +#include #endif namespace tvm { diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 8c3fb49ea7e0..ac2b60f8a383 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -39,7 +39,7 @@ void ModuleNode::Import(Module other) { static const PackedFunc* fimport_ = nullptr; if (fimport_ == nullptr) { fimport_ = runtime::Registry::Get("rpc.ImportRemoteModule"); - CHECK(fimport_ != nullptr); + ICHECK(fimport_ != nullptr); } (*fimport_)(GetRef(this), other); return; @@ -57,7 +57,7 @@ void ModuleNode::Import(Module other) { stack.push_back(next); } } - CHECK(!visited.count(this)) << "Cyclic dependency detected during import"; + ICHECK(!visited.count(this)) << "Cyclic dependency detected during import"; this->imports_.emplace_back(std::move(other)); } @@ -75,13 +75,13 @@ PackedFunc ModuleNode::GetFunction(const std::string& name, bool query_imports) Module Module::LoadFromFile(const std::string& file_name, const std::string& format) { std::string fmt = GetFileFormat(file_name, format); - CHECK(fmt.length() != 0) << "Cannot deduce format of file " << file_name; + ICHECK(fmt.length() != 0) << "Cannot deduce format of file " << file_name; if (fmt == "dll" || fmt == "dylib" || fmt == "dso") { fmt = "so"; } std::string load_f_name = "runtime.module.loadfile_" + fmt; const PackedFunc* f = Registry::Get(load_f_name); - CHECK(f != nullptr) << "Loader of " << format << "(" << load_f_name << ") is not presented."; + ICHECK(f != nullptr) << "Loader of " << format << "(" << load_f_name << ") is not presented."; Module m = (*f)(file_name, format); return m; } @@ -109,8 +109,8 @@ const PackedFunc* ModuleNode::GetFuncFromEnv(const std::string& name) { } if (pf == nullptr) { const PackedFunc* f = Registry::Get(name); - CHECK(f != nullptr) << "Cannot find function " << name - << " in the imported modules or global registry"; + ICHECK(f != nullptr) << "Cannot find function " << name + << " in the imported modules or global registry"; return f; } else { import_cache_.insert(std::make_pair(name, std::make_shared(pf))); diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index 9c1eeeb973d6..dae775606a7e 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -21,10 +21,10 @@ * \file ndarray.cc * \brief NDArray container infratructure. */ -#include #include #include #include +#include #include "runtime_base.h" @@ -39,9 +39,9 @@ namespace tvm { namespace runtime { inline void VerifyDataType(DLDataType dtype) { - CHECK_GE(dtype.lanes, 1); + ICHECK_GE(dtype.lanes, 1); if (dtype.code == kDLFloat) { - CHECK_EQ(dtype.bits % 8, 0); + ICHECK_EQ(dtype.bits % 8, 0); } else { // allow uint1 as a special flag for bool. 
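The ModuleNode::Import hunk above does a depth-first walk over the existing import edges and ICHECKs that the importing module is not reachable from the module being imported. The same reachability test on a plain adjacency list:

#include <cassert>
#include <vector>

// Returns true if `target` is reachable from `start`, i.e. importing `start`
// into `target` would create a cycle.
bool Reachable(const std::vector<std::vector<int>>& imports, int start, int target) {
  std::vector<bool> visited(imports.size(), false);
  std::vector<int> stack{start};
  while (!stack.empty()) {
    int next = stack.back();
    stack.pop_back();
    if (next == target) return true;
    if (visited[next]) continue;
    visited[next] = true;
    for (int child : imports[next]) stack.push_back(child);
  }
  return false;
}

// Usage mirroring ICHECK(!visited.count(this)):
//   assert(!Reachable(imports, child, parent) && "Cyclic dependency detected during import");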
if (dtype.bits == 1 && dtype.code == kDLUInt) return; @@ -53,9 +53,9 @@ inline void VerifyDataType(DLDataType dtype) { else if (dtype.bits == 4 && dtype.code == kDLInt) return; else - CHECK_EQ(dtype.bits % 8, 0); + ICHECK_EQ(dtype.bits % 8, 0); } - CHECK_EQ(dtype.bits & (dtype.bits - 1), 0); + ICHECK_EQ(dtype.bits & (dtype.bits - 1), 0); } inline size_t GetDataAlignment(const DLTensor& arr) { @@ -69,8 +69,8 @@ void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { cpu_ctx.device_type = kDLCPU; cpu_ctx.device_id = 0; size_t arr_size = GetDataSize(*handle); - CHECK_EQ(arr_size, nbytes) << "ArrayCopyFromBytes: size mismatch"; - CHECK(IsContiguous(*handle)) << "ArrayCopyFromBytes only support contiguous array for now"; + ICHECK_EQ(arr_size, nbytes) << "ArrayCopyFromBytes: size mismatch"; + ICHECK(IsContiguous(*handle)) << "ArrayCopyFromBytes only support contiguous array for now"; DeviceAPI::Get(handle->ctx) ->CopyDataFromTo(data, 0, handle->data, static_cast(handle->byte_offset), nbytes, cpu_ctx, handle->ctx, handle->dtype, nullptr); @@ -83,8 +83,8 @@ void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) { cpu_ctx.device_type = kDLCPU; cpu_ctx.device_id = 0; size_t arr_size = GetDataSize(*handle); - CHECK_EQ(arr_size, nbytes) << "ArrayCopyToBytes: size mismatch"; - CHECK(IsContiguous(*handle)) << "ArrayCopyToBytes only support contiguous array for now"; + ICHECK_EQ(arr_size, nbytes) << "ArrayCopyToBytes: size mismatch"; + ICHECK(IsContiguous(*handle)) << "ArrayCopyToBytes only support contiguous array for now"; DeviceAPI::Get(handle->ctx) ->CopyDataFromTo(handle->data, static_cast(handle->byte_offset), data, 0, nbytes, handle->ctx, cpu_ctx, handle->dtype, nullptr); @@ -153,7 +153,7 @@ struct NDArray::Internal { } static DLManagedTensor* ToDLPack(NDArray::Container* from) { - CHECK(from != nullptr); + ICHECK(from != nullptr); DLManagedTensor* ret = new DLManagedTensor(); ret->dl_tensor = from->dl_tensor; ret->manager_ctx = from; @@ -169,13 +169,13 @@ struct NDArray::Internal { }; NDArray NDArray::CreateView(std::vector shape, DLDataType dtype) { - CHECK(data_ != nullptr); - CHECK(get_mutable()->dl_tensor.strides == nullptr) << "Can only create view for compact tensor"; + ICHECK(data_ != nullptr); + ICHECK(get_mutable()->dl_tensor.strides == nullptr) << "Can only create view for compact tensor"; NDArray ret = Internal::Create(shape, dtype, get_mutable()->dl_tensor.ctx); ret.get_mutable()->dl_tensor.byte_offset = this->get_mutable()->dl_tensor.byte_offset; size_t curr_size = GetDataSize(this->get_mutable()->dl_tensor); size_t view_size = GetDataSize(ret.get_mutable()->dl_tensor); - CHECK_LE(view_size, curr_size) + ICHECK_LE(view_size, curr_size) << "Tries to create a view that has bigger memory than current one"; // increase ref count get_mutable()->IncRef(); @@ -211,25 +211,25 @@ NDArray NDArray::FromDLPack(DLManagedTensor* tensor) { } void NDArray::CopyToBytes(void* data, size_t nbytes) const { - CHECK(data != nullptr); - CHECK(data_ != nullptr); + ICHECK(data != nullptr); + ICHECK(data_ != nullptr); ArrayCopyToBytes(&get_mutable()->dl_tensor, data, nbytes); } void NDArray::CopyFromBytes(const void* data, size_t nbytes) { - CHECK(data != nullptr); - CHECK(data_ != nullptr); + ICHECK(data != nullptr); + ICHECK(data_ != nullptr); ArrayCopyFromBytes(&get_mutable()->dl_tensor, data, nbytes); } void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle stream) { size_t from_size = GetDataSize(*from); size_t to_size = GetDataSize(*to); - 
CHECK_EQ(from_size, to_size) << "TVMArrayCopyFromTo: The size must exactly match"; + ICHECK_EQ(from_size, to_size) << "TVMArrayCopyFromTo: The size must exactly match"; - CHECK(from->ctx.device_type == to->ctx.device_type || from->ctx.device_type == kDLCPU || - to->ctx.device_type == kDLCPU || from->ctx.device_type == kDLCPUPinned || - to->ctx.device_type == kDLCPUPinned) + ICHECK(from->ctx.device_type == to->ctx.device_type || from->ctx.device_type == kDLCPU || + to->ctx.device_type == kDLCPU || from->ctx.device_type == kDLCPUPinned || + to->ctx.device_type == kDLCPUPinned) << "Can not copy across different ctx types directly"; // Use the context that is *not* a cpu context to get the correct device diff --git a/src/runtime/object.cc b/src/runtime/object.cc index dc5f1ceabbae..ad68c70698ea 100644 --- a/src/runtime/object.cc +++ b/src/runtime/object.cc @@ -20,9 +20,9 @@ * \file src/runtime/object.cc * \brief Object type management system. */ -#include #include #include +#include #include #include @@ -70,7 +70,7 @@ class TypeContext { if (child_tindex == parent_tindex) return true; { std::lock_guard lock(mutex_); - CHECK_LT(child_tindex, type_table_.size()); + ICHECK_LT(child_tindex, type_table_.size()); while (child_tindex > parent_tindex) { child_tindex = type_table_[child_tindex].parent_index; } @@ -87,10 +87,10 @@ class TypeContext { return it->second; } // try to allocate from parent's type table. - CHECK_LT(parent_tindex, type_table_.size()) + ICHECK_LT(parent_tindex, type_table_.size()) << " skey= " << skey << "static_index=" << static_tindex; TypeInfo& pinfo = type_table_[parent_tindex]; - CHECK_EQ(pinfo.index, parent_tindex); + ICHECK_EQ(pinfo.index, parent_tindex); // if parent cannot overflow, then this class cannot. if (!pinfo.child_slots_can_overflow) { @@ -104,8 +104,8 @@ class TypeContext { if (static_tindex != TypeIndex::kDynamic) { // statically assigned type allocated_tindex = static_tindex; - CHECK_LT(static_tindex, type_table_.size()); - CHECK_EQ(type_table_[allocated_tindex].allocated_slots, 0U) + ICHECK_LT(static_tindex, type_table_.size()); + ICHECK_EQ(type_table_[allocated_tindex].allocated_slots, 0U) << "Conflicting static index " << static_tindex << " between " << type_table_[allocated_tindex].name << " and " << skey; } else if (pinfo.allocated_slots + num_slots <= pinfo.num_slots) { @@ -114,15 +114,15 @@ class TypeContext { // update parent's state pinfo.allocated_slots += num_slots; } else { - CHECK(pinfo.child_slots_can_overflow) + ICHECK(pinfo.child_slots_can_overflow) << "Reach maximum number of sub-classes for " << pinfo.name; // allocate new entries. allocated_tindex = type_counter_; type_counter_ += num_slots; - CHECK_LE(type_table_.size(), type_counter_); + ICHECK_LE(type_table_.size(), type_counter_); type_table_.resize(type_counter_, TypeInfo()); } - CHECK_GT(allocated_tindex, parent_tindex); + ICHECK_GT(allocated_tindex, parent_tindex); // initialize the slot. 
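Several hunks in this patch, ArgLayout::Push and VerifyDataType among them, rely on the x & (x - 1) trick: subtracting one flips the lowest set bit and everything below it, so the AND is zero exactly when at most one bit is set. A checked helper; the nonzero guard is an addition here, since the original checks also accept zero:

#include <cstdint>

constexpr bool IsPowerOfTwo(uint32_t x) { return x != 0 && (x & (x - 1)) == 0; }

static_assert(IsPowerOfTwo(4), "0b100 has a single bit set");
static_assert(!IsPowerOfTwo(12), "0b1100 has two bits set");
static_assert(!IsPowerOfTwo(0), "zero has no bits set");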
type_table_[allocated_tindex].index = allocated_tindex; type_table_[allocated_tindex].parent_index = parent_tindex; @@ -138,21 +138,21 @@ class TypeContext { std::string TypeIndex2Key(uint32_t tindex) { std::lock_guard lock(mutex_); - CHECK(tindex < type_table_.size() && type_table_[tindex].allocated_slots != 0) + ICHECK(tindex < type_table_.size() && type_table_[tindex].allocated_slots != 0) << "Unknown type index " << tindex; return type_table_[tindex].name; } size_t TypeIndex2KeyHash(uint32_t tindex) { std::lock_guard lock(mutex_); - CHECK(tindex < type_table_.size() && type_table_[tindex].allocated_slots != 0) + ICHECK(tindex < type_table_.size() && type_table_[tindex].allocated_slots != 0) << "Unknown type index " << tindex; return type_table_[tindex].name_hash; } uint32_t TypeKey2Index(const std::string& skey) { auto it = type_key2index_.find(skey); - CHECK(it != type_key2index_.end()) + ICHECK(it != type_key2index_.end()) << "Cannot find type " << skey << ". Did you forget to register the node by TVM_REGISTER_NODE_TYPE ?"; return it->second; @@ -229,7 +229,7 @@ TVM_REGISTER_GLOBAL("runtime.DumpTypeTable").set_body_typed([](int min_child_cou int TVMObjectGetTypeIndex(TVMObjectHandle obj, unsigned* out_tindex) { API_BEGIN(); - CHECK(obj != nullptr); + ICHECK(obj != nullptr); out_tindex[0] = static_cast(obj)->type_index(); API_END(); } diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 290f8c256508..fa118ed9525b 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -24,10 +24,10 @@ #ifndef TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_ #define TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_ -#include #include #include #include +#include /* There are many OpenCL platforms that do not yet support OpenCL 2.0, * hence we use 1.2 APIs, some of which are now deprecated. In order @@ -167,7 +167,7 @@ inline const char* CLGetErrorString(cl_int error) { * \param func Expression to call. 
*/ #define OPENCL_CHECK_ERROR(e) \ - { CHECK(e == CL_SUCCESS) << "OpenCL Error, code=" << e << ": " << cl::CLGetErrorString(e); } + { ICHECK(e == CL_SUCCESS) << "OpenCL Error, code=" << e << ": " << cl::CLGetErrorString(e); } #define OPENCL_CALL(func) \ { \ @@ -221,9 +221,9 @@ class OpenCLWorkspace : public DeviceAPI { virtual bool IsOpenCLDevice(TVMContext ctx) { return ctx.device_type == kDLOpenCL; } // get the queue of the context cl_command_queue GetQueue(TVMContext ctx) { - CHECK(IsOpenCLDevice(ctx)); + ICHECK(IsOpenCLDevice(ctx)); this->Init(); - CHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) + ICHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) << "Invalid OpenCL device_id=" << ctx.device_id; return queues[ctx.device_id]; } diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 83944cd4a83e..a3ec21e28f1d 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -47,7 +47,7 @@ void OpenCLWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* *rv = static_cast(index < devices.size()); return; } - CHECK_LT(index, devices.size()) << "Invalid device id " << index; + ICHECK_LT(index, devices.size()) << "Invalid device id " << index; switch (kind) { case kExist: break; @@ -119,7 +119,7 @@ void OpenCLWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* void* OpenCLWorkspace::AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) { this->Init(); - CHECK(context != nullptr) << "No OpenCL device"; + ICHECK(context != nullptr) << "No OpenCL device"; cl_int err_code; cl_mem mptr = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code); OPENCL_CHECK_ERROR(err_code); @@ -140,7 +140,7 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) { this->Init(); - CHECK(stream == nullptr); + ICHECK(stream == nullptr); if (IsOpenCLDevice(ctx_from) && IsOpenCLDevice(ctx_to)) { OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(ctx_to), static_cast((void*)from), // NOLINT(*) @@ -163,7 +163,7 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* } void OpenCLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) { - CHECK(stream == nullptr); + ICHECK(stream == nullptr); OPENCL_CALL(clFinish(this->GetQueue(ctx))); } @@ -266,7 +266,7 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic this->context = clCreateContext(nullptr, this->devices.size(), &(this->devices[0]), nullptr, nullptr, &err_code); OPENCL_CHECK_ERROR(err_code); - CHECK_EQ(this->queues.size(), 0U); + ICHECK_EQ(this->queues.size(), 0U); for (size_t i = 0; i < this->devices.size(); ++i) { cl_device_id did = this->devices[i]; this->queues.push_back(clCreateCommandQueue(this->context, did, 0, &err_code)); diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 590a446efe64..a4c61e47b376 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -50,7 +50,7 @@ class OpenCLWrappedFunc { } // invoke the function with void arguments void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { - CHECK(w_->context != nullptr) << "No OpenCL device"; + ICHECK(w_->context != nullptr) << "No OpenCL device"; cl::OpenCLThreadEntry* t = w_->GetThreadEntry(); // get the kernel from thread local kernel table. 
if (entry_.kernel_id >= t->kernel_table.size()) {
@@ -116,8 +116,8 @@ cl::OpenCLWorkspace* OpenCLModuleNode::GetGlobalWorkspace() {
 PackedFunc OpenCLModuleNode::GetFunction(const std::string& name,
 const ObjectPtr& sptr_to_self) {
- CHECK_EQ(sptr_to_self.get(), this);
- CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main";
+ ICHECK_EQ(sptr_to_self.get(), this);
+ ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main";
 auto it = fmap_.find(name);
 if (it == fmap_.end()) return PackedFunc();
 const FunctionInfo& info = it->second;
@@ -125,13 +125,13 @@ PackedFunc OpenCLModuleNode::GetFunction(const std::string& name,
 std::vector arg_size(info.arg_types.size());
 for (size_t i = 0; i < info.arg_types.size(); ++i) {
 DLDataType t = info.arg_types[i];
- CHECK_EQ(t.lanes, 1U);
+ ICHECK_EQ(t.lanes, 1U);
 if (t.code == kTVMOpaqueHandle) {
 // specially store pointer type size in OpenCL driver
 arg_size[i] = sizeof(void*);
 } else {
 uint32_t bits = t.bits;
- CHECK_EQ(bits % 8, 0U);
+ ICHECK_EQ(bits % 8, 0U);
 arg_size[i] = bits / 8;
 }
 }
@@ -142,7 +142,7 @@ PackedFunc OpenCLModuleNode::GetFunction(const std::string& name,
 void OpenCLModuleNode::SaveToFile(const std::string& file_name, const std::string& format) {
 std::string fmt = GetFileFormat(file_name, format);
- CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_;
+ ICHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_;
 std::string meta_file = GetMetaFilePath(file_name);
 SaveMetaDataToFile(meta_file, fmap_);
 SaveBinaryToFile(file_name, data_);
diff --git a/src/runtime/pack_args.h b/src/runtime/pack_args.h
index ae9771641b23..45cde22bda08 100644
--- a/src/runtime/pack_args.h
+++ b/src/runtime/pack_args.h
@@ -119,7 +119,7 @@ enum ArgConvertCode {
 };
 inline ArgConvertCode GetArgConvertCode(DLDataType t) {
- CHECK_EQ(t.lanes, 1U) << "Cannot pass vector type argument to devic function for now";
+ ICHECK_EQ(t.lanes, 1U) << "Cannot pass vector type argument to device function for now";
 if (t.code == kDLInt) {
 if (t.bits == 64U) return INT64_TO_INT64;
 if (t.bits == 32U) return INT64_TO_INT32;
@@ -284,7 +284,7 @@ inline size_t NumBufferArgs(const std::vector& arg_types) {
 }
 }
 for (size_t i = base; i < arg_types.size(); ++i) {
- CHECK(arg_types[i].code != kTVMOpaqueHandle) << "Device function need to be organized";
+ ICHECK(arg_types[i].code != kTVMOpaqueHandle) << "Device function needs to be organized";
 }
 return base;
 }
diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc
index 641532a83927..6e74dc354259 100644
--- a/src/runtime/registry.cc
+++ b/src/runtime/registry.cc
@@ -21,9 +21,9 @@
 * \file registry.cc
 * \brief The global registry of packed function.
*/ -#include #include #include +#include #include #include @@ -65,7 +65,7 @@ Registry& Registry::Register(const std::string& name, bool can_override) { // N Manager* m = Manager::Global(); std::lock_guard lock(m->mutex); if (m->fmap.count(name)) { - CHECK(can_override) << "Global PackedFunc " << name << " is already registered"; + ICHECK(can_override) << "Global PackedFunc " << name << " is already registered"; } Registry* r = new Registry(); diff --git a/src/runtime/rocm/rocm_common.h b/src/runtime/rocm/rocm_common.h index 6ed9bccb1ab7..b258e37508df 100644 --- a/src/runtime/rocm/rocm_common.h +++ b/src/runtime/rocm/rocm_common.h @@ -43,10 +43,10 @@ namespace runtime { } \ } -#define ROCM_CALL(func) \ - { \ - hipError_t e = (func); \ - CHECK(e == hipSuccess) << "ROCM HIP: " << hipGetErrorString(e); \ +#define ROCM_CALL(func) \ + { \ + hipError_t e = (func); \ + ICHECK(e == hipSuccess) << "ROCM HIP: " << hipGetErrorString(e); \ } /*! \brief Thread local workspace */ diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index 7f5bc99380a4..26e44eca0d12 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -21,12 +21,12 @@ * \file rocm_device_api.cc * \brief GPU specific API */ -#include #include #include #include #include #include +#include #include "rocm_common.h" @@ -122,7 +122,7 @@ class ROCMDeviceAPI final : public DeviceAPI { void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final { ROCM_CALL(hipSetDevice(ctx.device_id)); - CHECK_EQ(256 % alignment, 0U) << "ROCM space is aligned at 256 bytes"; + ICHECK_EQ(256 % alignment, 0U) << "ROCM space is aligned at 256 bytes"; void* ret; ROCM_CALL(hipMalloc(&ret, nbytes)); return ret; diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc index 8a83599c644b..567557c56794 100644 --- a/src/runtime/rocm/rocm_module.cc +++ b/src/runtime/rocm/rocm_module.cc @@ -70,7 +70,7 @@ class ROCMModuleNode : public runtime::ModuleNode { std::string fmt = GetFileFormat(file_name, format); std::string meta_file = GetMetaFilePath(file_name); // note: llvm and asm formats are not laodable, so we don't save them - CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; + ICHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; SaveMetaDataToFile(meta_file, fmap_); SaveBinaryToFile(file_name, data_); } @@ -121,7 +121,7 @@ class ROCMModuleNode : public runtime::ModuleNode { size_t nbytes = 0; ROCM_DRIVER_CALL(hipModuleGetGlobal(&global, &nbytes, module_[device_id], global_name.c_str())); - CHECK_EQ(nbytes, expect_nbytes); + ICHECK_EQ(nbytes, expect_nbytes); return global; } @@ -189,8 +189,8 @@ class ROCMWrappedFunc { PackedFunc ROCMModuleNode::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { - CHECK_EQ(sptr_to_self.get(), this); - CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; + ICHECK_EQ(sptr_to_self.get(), this); + ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; auto it = fmap_.find(name); if (it == fmap_.end()) return PackedFunc(); const FunctionInfo& info = it->second; diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index 196a97ecbd66..943990fd9585 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -20,9 +20,9 @@ /*! 
* \file rpc_device_api.cc */ -#include #include #include +#include #include @@ -71,7 +71,7 @@ class RPCDeviceAPI final : public DeviceAPI { int from_dev_type = ctx_from.device_type; int to_dev_type = ctx_to.device_type; if (from_dev_type > kRPCSessMask && to_dev_type > kRPCSessMask) { - CHECK(ctx_from.device_type == ctx_to.device_type) + ICHECK(ctx_from.device_type == ctx_to.device_type) << "Cannot copy across two different remote session"; auto remote_ctx_from = RemoveSessMask(ctx_from); auto remote_ctx_to = RemoveSessMask(ctx_to); @@ -104,7 +104,7 @@ class RPCDeviceAPI final : public DeviceAPI { private: std::shared_ptr GetSess(TVMContext ctx) { int dev_type = ctx.device_type; - CHECK_GE(dev_type, kRPCSessMask); + ICHECK_GE(dev_type, kRPCSessMask); int tbl_index = dev_type / kRPCSessMask - 1; return RPCSession::Get(tbl_index); } diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc index 2deae07b0315..0f526007f49e 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -122,7 +122,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { break; case kRecvPacketNumBytes: { uint64_t packet_nbytes; - CHECK(this->Read(&packet_nbytes)); + ICHECK(this->Read(&packet_nbytes)); if (packet_nbytes != 0) { this->SwitchToState(kProcessPacket); this->RequestBytes(packet_nbytes); @@ -178,7 +178,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { << args[i].AsObjectRef()->GetTypeKey() << " is not supported by RPC"; } else if (tcode == kTVMContext) { DLContext ctx = args[i]; - CHECK_LT(static_cast(ctx.device_type), kRPCSessMask) + ICHECK_LT(static_cast(ctx.device_type), kRPCSessMask) << "InternalError: cannot pass RPC context in the channel"; } } @@ -254,7 +254,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { void SwitchToState(State state) { // invariant if (state != kCopyAckReceived) { - CHECK_EQ(pending_request_bytes_, 0U) << "state=" << state; + ICHECK_EQ(pending_request_bytes_, 0U) << "state=" << state; } // need to actively flush the writer // so the data get pushed out. @@ -262,7 +262,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { flush_writer_(); } state_ = state; - CHECK(state != kInitHeader) << "cannot switch to init header"; + ICHECK(state != kInitHeader) << "cannot switch to init header"; if (state == kRecvPacketNumBytes) { this->RequestBytes(sizeof(uint64_t)); // recycle arena for the next session. 
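The endpoint event handler above is a pull-based state machine: each state declares its byte budget through RequestBytes, Read deducts from pending_request_bytes_, and SwitchToState insists the budget is spent before moving on. A compact sketch of that bookkeeping; the class and method names are made up for illustration:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>

class ByteBudgetReader {
 public:
  // A state announces how many bytes it needs before it can run.
  void RequestBytes(size_t n) { pending_ += n; }
  // Incoming bytes are buffered as they arrive from the channel.
  void Feed(const char* data, size_t size) { buf_.append(data, size); }
  bool Ready() const { return buf_.size() >= pending_; }
  void Read(void* out, size_t size) {
    assert(size <= pending_);  // mirrors ICHECK_LE(size, pending_request_bytes_)
    std::memcpy(out, buf_.data(), size);
    buf_.erase(0, size);
    pending_ -= size;
  }

 private:
  std::string buf_;
  size_t pending_ = 0;
};

// A kRecvPacketNumBytes-style state would do RequestBytes(sizeof(uint64_t)),
// wait until Ready(), then Read(&packet_nbytes, sizeof(packet_nbytes)).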
@@ -280,7 +280,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { this->RequestBytes(len); return; } else { - CHECK_EQ(init_header_step_, 1); + ICHECK_EQ(init_header_step_, 1); this->ReadArray(dmlc::BeginPtr(*remote_key_), remote_key_->length()); this->SwitchToState(kRecvPacketNumBytes); } @@ -378,7 +378,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { LOG(FATAL) << "RPCError: Error caught from RPC call:\n" << msg; } - CHECK(setreturn != nullptr) << "fsetreturn not available"; + ICHECK(setreturn != nullptr) << "fsetreturn not available"; setreturn(args); this->SwitchToState(kReturnReceived); @@ -518,10 +518,10 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { TVMArgs args = RecvPackedSeq(); try { - CHECK(serving_session_ == nullptr) << "Server has already been initialized"; + ICHECK(serving_session_ == nullptr) << "Server has already been initialized"; std::string server_protocol_ver = kRPCProtocolVer; - CHECK_EQ(client_protocol_ver, server_protocol_ver) + ICHECK_EQ(client_protocol_ver, server_protocol_ver) << "Server[" << name_ << "]: Client protocol version mismatch with the server " << " server protocol=" << server_protocol_ver << ", client protocol=" << client_protocol_ver; @@ -538,7 +538,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { } auto* fconstructor = Registry::Get(constructor_name); - CHECK(fconstructor != nullptr) << " Cannot find session constructor " << constructor_name; + ICHECK(fconstructor != nullptr) << " Cannot find session constructor " << constructor_name; TVMRetValue con_ret; try { @@ -549,12 +549,12 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { << e.what(); } - CHECK_EQ(con_ret.type_code(), kTVMModuleHandle) + ICHECK_EQ(con_ret.type_code(), kTVMModuleHandle) << "Server[" << name_ << "]:" << " Constructor " << constructor_name << " need to return an RPCModule"; Module mod = con_ret; std::string tkey = mod->type_key(); - CHECK_EQ(tkey, "rpc") << "Constructor " << constructor_name << " to return an RPCModule"; + ICHECK_EQ(tkey, "rpc") << "Constructor " << constructor_name << " to return an RPCModule"; serving_session_ = RPCModuleGetSession(mod); this->ReturnVoid(); } catch (const std::runtime_error& e) { @@ -606,9 +606,9 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { private: RPCSession* GetServingSession() const { - CHECK(serving_session_ != nullptr) + ICHECK(serving_session_ != nullptr) << "Need to call InitRemoteSession first before any further actions"; - CHECK(!serving_session_->IsAsync() || async_server_mode_) + ICHECK(!serving_session_->IsAsync() || async_server_mode_) << "Cannot host an async session in a non-Event driven server"; return serving_session_.get(); @@ -616,7 +616,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { // Utility functions // Internal read function, update pending_request_bytes_ size_t Read(void* data, size_t size) final { - CHECK_LE(size, pending_request_bytes_); + ICHECK_LE(size, pending_request_bytes_); reader_->Read(data, size); pending_request_bytes_ -= size; return size; @@ -693,10 +693,10 @@ void RPCEndpoint::Init() { handler_->SendPackedSeq(args.values, args.type_codes, args.num_args, true); code = HandleUntilReturnEvent(true, [rv](TVMArgs args) { - CHECK_EQ(args.size(), 1); + ICHECK_EQ(args.size(), 1); *rv = args[0]; }); - CHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); + ICHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); }); } @@ -739,7 +739,7 @@ void RPCEndpoint::ServerLoop() { (*f)(); } TVMRetValue rv; - 
CHECK(HandleUntilReturnEvent(false, [](TVMArgs) {}) == RPCCode::kShutdown); + ICHECK(HandleUntilReturnEvent(false, [](TVMArgs) {}) == RPCCode::kShutdown); if (const auto* f = Registry::Get("tvm.rpc.server.shutdown")) { (*f)(); } @@ -757,7 +757,7 @@ int RPCEndpoint::ServerAsyncIOEventHandler(const std::string& in_bytes, int even [this](const void* data, size_t size) { return channel_->Send(data, size); }, writer_.bytes_available()); } - CHECK(code != RPCCode::kReturn && code != RPCCode::kCopyAck); + ICHECK(code != RPCCode::kReturn && code != RPCCode::kCopyAck); if (code == RPCCode::kShutdown) return 0; if (writer_.bytes_available() != 0) return 2; return 1; @@ -781,7 +781,7 @@ void RPCEndpoint::InitRemoteSession(TVMArgs args) { handler_->SendPackedSeq(args.values, args.type_codes, args.num_args, true); code = HandleUntilReturnEvent(true, [](TVMArgs args) {}); - CHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); + ICHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); } // Get remote function with name @@ -804,7 +804,7 @@ void RPCEndpoint::CallFunc(RPCSession::PackedFuncHandle h, const TVMValue* arg_v handler_->SendPackedSeq(arg_values, arg_type_codes, num_args, true); code = HandleUntilReturnEvent(true, encode_return); - CHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); + ICHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); } void RPCEndpoint::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, @@ -827,7 +827,7 @@ void RPCEndpoint::CopyToRemote(void* from, size_t from_offset, void* to, size_t handler_->Write(type_hint); handler_->WriteArray(reinterpret_cast(from) + from_offset, data_size); - CHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kReturn); + ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kReturn); } void RPCEndpoint::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, @@ -850,7 +850,7 @@ void RPCEndpoint::CopyFromRemote(void* from, size_t from_offset, void* to, size_ handler_->Write(type_hint); TVMRetValue rv; - CHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kCopyAck); + ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kCopyAck); handler_->ReadArray(reinterpret_cast(to) + to_offset, data_size); handler_->FinishCopyAck(); } @@ -917,7 +917,7 @@ void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { if (ctx.device_type == kDLCPU) { ctx = ctx_to; } else { - CHECK(ctx_to.device_type == kDLCPU || ctx_to.device_type == ctx_from.device_type) + ICHECK(ctx_to.device_type == kDLCPU || ctx_to.device_type == ctx_from.device_type) << "Can not copy across different ctx types directly"; } handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from, @@ -957,7 +957,7 @@ void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) { } if (state_ != kWaitForAsyncCallback) { - CHECK_EQ(state_, kRecvPacketNumBytes); + ICHECK_EQ(state_, kRecvPacketNumBytes); } } diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index d1eb89164fb7..a3d888e927ed 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -109,7 +109,7 @@ class RPCWrappedFunc : public Object { // remove a remote session mask TVMContext RemoveSessMask(TVMContext ctx) const { int dev_type = ctx.device_type; - CHECK_EQ(dev_type / kRPCSessMask, sess_->table_index() + 1) + ICHECK_EQ(dev_type / kRPCSessMask, sess_->table_index() + 1) << "Can not pass in local context or context with a 
different remote session"; ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); return ctx; @@ -145,7 +145,7 @@ class RPCWrappedFunc : public Object { data->dl_tensor.ctx.device_type = static_cast( static_cast(tensor->ctx.device_type) + kRPCSessMask * (sess_->table_index() + 1)); // check strides. - CHECK(tensor->strides == nullptr); + ICHECK(tensor->strides == nullptr); // setup byteoffset data->dl_tensor.byte_offset = tensor->byte_offset; return ret; @@ -190,7 +190,7 @@ class RPCModuleNode final : public ModuleNode { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass ctx by parts. int dev_type = ctx.device_type; - CHECK_EQ(dev_type / kRPCSessMask, sess_->table_index() + 1) + ICHECK_EQ(dev_type / kRPCSessMask, sess_->table_index() + 1) << "ValueError: Need to pass the matched remote context to RPCModule.GetTimeEvaluator"; ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); @@ -224,7 +224,7 @@ class RPCModuleNode final : public ModuleNode { void InitRemoteFunc(FType* func, const std::string& name) { if (*func != nullptr) return; RPCSession::PackedFuncHandle handle = sess_->GetFunction(name); - CHECK(handle != nullptr) << "Cannot found remote function " << name; + ICHECK(handle != nullptr) << "Cannot found remote function " << name; *func = WrapRemoteFunc(handle); } @@ -253,9 +253,9 @@ void* RPCWrappedFunc::UnwrapRemoteValueToHandle(const TVMArgValue& arg) const { if (arg.type_code() == kTVMModuleHandle) { Module mod = arg; std::string tkey = mod->type_key(); - CHECK_EQ(tkey, "rpc") << "ValueError: Cannot pass a non-RPC module to remote"; + ICHECK_EQ(tkey, "rpc") << "ValueError: Cannot pass a non-RPC module to remote"; auto* rmod = static_cast(mod.operator->()); - CHECK(rmod->sess() == sess_) + ICHECK(rmod->sess() == sess_) << "ValueError: Cannot pass in module into a different remote session"; return rmod->module_handle(); } else { @@ -270,22 +270,22 @@ void RPCWrappedFunc::WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) cons if (tcode == kTVMNullptr) return; if (tcode == kTVMPackedFuncHandle) { - CHECK_EQ(args.size(), 2); + ICHECK_EQ(args.size(), 2); void* handle = args[1]; auto wf = std::make_shared(handle, sess_); *rv = PackedFunc([wf](TVMArgs args, TVMRetValue* rv) { return wf->operator()(args, rv); }); } else if (tcode == kTVMModuleHandle) { - CHECK_EQ(args.size(), 2); + ICHECK_EQ(args.size(), 2); void* handle = args[1]; auto n = make_object(handle, sess_); *rv = Module(n); } else if (tcode == kTVMDLTensorHandle || tcode == kTVMNDArrayHandle) { - CHECK_EQ(args.size(), 3); + ICHECK_EQ(args.size(), 3); DLTensor* tensor = args[1]; void* nd_handle = args[2]; *rv = WrapRemoteNDArray(tensor, nd_handle); } else { - CHECK_EQ(args.size(), 2); + ICHECK_EQ(args.size(), 2); *rv = args[1]; } } @@ -298,7 +298,7 @@ Module CreateRPCSessionModule(std::shared_ptr sess) { std::shared_ptr RPCModuleGetSession(Module mod) { std::string tkey = mod->type_key(); - CHECK_EQ(tkey, "rpc") << "ValueError: Cannot pass a non-RPC module to remote"; + ICHECK_EQ(tkey, "rpc") << "ValueError: Cannot pass a non-RPC module to remote"; auto* rmod = static_cast(mod.operator->()); return rmod->sess(); } @@ -340,11 +340,11 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) { PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, int min_repeat_ms, PackedFunc f_preproc) { - CHECK(pf != nullptr); + ICHECK(pf != nullptr); if (static_cast(ctx.device_type) == static_cast(kDLMicroDev)) { auto 
get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator"); - CHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled"; + ICHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled"; return (*get_micro_time_evaluator)(pf, ctx, number, repeat); } @@ -414,7 +414,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") PackedFunc f_preproc; if (!f_preproc_name.empty()) { auto* pf_preproc = runtime::Registry::Get(f_preproc_name); - CHECK(pf_preproc != nullptr) + ICHECK(pf_preproc != nullptr) << "Cannot find " << f_preproc_name << " in the global function"; f_preproc = *pf_preproc; } @@ -423,11 +423,11 @@ } } else { auto* pf = runtime::Registry::Get(name); - CHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; + ICHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; PackedFunc f_preproc; if (!f_preproc_name.empty()) { auto* pf_preproc = runtime::Registry::Get(f_preproc_name); - CHECK(pf_preproc != nullptr) + ICHECK(pf_preproc != nullptr) << "Cannot find " << f_preproc_name << " in the global function"; f_preproc = *pf_preproc; } @@ -452,20 +452,20 @@ TVM_REGISTER_GLOBAL("tvm.rpc.server.ModuleGetFunction") // functions to access an RPC module. TVM_REGISTER_GLOBAL("rpc.LoadRemoteModule").set_body_typed([](Module sess, std::string name) { std::string tkey = sess->type_key(); - CHECK_EQ(tkey, "rpc"); + ICHECK_EQ(tkey, "rpc"); return static_cast<RPCModuleNode*>(sess.operator->())->LoadModule(name); }); TVM_REGISTER_GLOBAL("rpc.ImportRemoteModule").set_body_typed([](Module parent, Module child) { std::string tkey = parent->type_key(); - CHECK_EQ(tkey, "rpc"); + ICHECK_EQ(tkey, "rpc"); static_cast<RPCModuleNode*>(parent.operator->())->ImportModule(child); }); TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue* rv) { Module m = args[0]; std::string tkey = m->type_key(); - CHECK_EQ(tkey, "rpc"); + ICHECK_EQ(tkey, "rpc"); *rv = static_cast<RPCModuleNode*>(m.operator->())->sess()->table_index(); }); diff --git a/src/runtime/rpc/rpc_pipe_impl.cc b/src/runtime/rpc/rpc_pipe_impl.cc index 2f4243574909..6f2f7e22deb4 100644 --- a/src/runtime/rpc/rpc_pipe_impl.cc +++ b/src/runtime/rpc/rpc_pipe_impl.cc @@ -78,8 +78,8 @@ class PipeChannel final : public RPCChannel { Module CreatePipeClient(std::vector<std::string> cmd) { int parent2child[2]; int child2parent[2]; - CHECK_EQ(pipe(parent2child), 0); - CHECK_EQ(pipe(child2parent), 0); + ICHECK_EQ(pipe(parent2child), 0); + ICHECK_EQ(pipe(child2parent), 0); int parent_read = child2parent[0]; int parent_write = parent2child[1]; diff --git a/src/runtime/rpc/rpc_server_env.cc b/src/runtime/rpc/rpc_server_env.cc index cb25150449a1..7ceb12caaf1f 100644 --- a/src/runtime/rpc/rpc_server_env.cc +++ b/src/runtime/rpc/rpc_server_env.cc @@ -31,7 +31,7 @@ namespace runtime { std::string RPCGetPath(const std::string& name) { // do live lookup every time as workpath can change.
const PackedFunc* f = runtime::Registry::Get("tvm.rpc.server.workpath"); - CHECK(f != nullptr) << "require tvm.rpc.server.workpath"; + ICHECK(f != nullptr) << "require tvm.rpc.server.workpath"; return (*f)(name); } diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 9e05e5d1628d..f5405f0c2fa0 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -108,7 +108,7 @@ class RPCSessTable { } // Get session from table std::shared_ptr Get(int index) { - CHECK(index >= 0 && index < kMaxRPCSession); + ICHECK(index >= 0 && index < kMaxRPCSession); return tbl_[index].lock(); } // Insert session into table. @@ -137,7 +137,7 @@ std::shared_ptr RPCSession::Get(int table_index) { } void RPCSession::InsertToSessionTable(std::shared_ptr sess) { - CHECK_EQ(sess->table_index_, 0); + ICHECK_EQ(sess->table_index_, 0); sess->table_index_ = RPCSessTable::Global()->Insert(sess); } diff --git a/src/runtime/rpc/rpc_socket_impl.cc b/src/runtime/rpc/rpc_socket_impl.cc index 77a743be0de6..4e7fe3196d45 100644 --- a/src/runtime/rpc/rpc_socket_impl.cc +++ b/src/runtime/rpc/rpc_socket_impl.cc @@ -70,17 +70,17 @@ std::shared_ptr RPCConnect(std::string url, int port, std::string k support::TCPSocket sock; support::SockAddr addr(url.c_str(), port); sock.Create(addr.ss_family()); - CHECK(sock.Connect(addr)) << "Connect to " << addr.AsString() << " failed"; + ICHECK(sock.Connect(addr)) << "Connect to " << addr.AsString() << " failed"; // hand shake std::ostringstream os; int code = kRPCMagic; int keylen = static_cast(key.length()); - CHECK_EQ(sock.SendAll(&code, sizeof(code)), sizeof(code)); - CHECK_EQ(sock.SendAll(&keylen, sizeof(keylen)), sizeof(keylen)); + ICHECK_EQ(sock.SendAll(&code, sizeof(code)), sizeof(code)); + ICHECK_EQ(sock.SendAll(&keylen, sizeof(keylen)), sizeof(keylen)); if (keylen != 0) { - CHECK_EQ(sock.SendAll(key.c_str(), keylen), keylen); + ICHECK_EQ(sock.SendAll(key.c_str(), keylen), keylen); } - CHECK_EQ(sock.RecvAll(&code, sizeof(code)), sizeof(code)); + ICHECK_EQ(sock.RecvAll(&code, sizeof(code)), sizeof(code)); if (code == kRPCMagic + 2) { sock.Close(); LOG(FATAL) << "URL " << url << ":" << port << " cannot find server that matches key=" << key; @@ -91,11 +91,11 @@ std::shared_ptr RPCConnect(std::string url, int port, std::string k sock.Close(); LOG(FATAL) << "URL " << url << ":" << port << " is not TVM RPC server"; } - CHECK_EQ(sock.RecvAll(&keylen, sizeof(keylen)), sizeof(keylen)); + ICHECK_EQ(sock.RecvAll(&keylen, sizeof(keylen)), sizeof(keylen)); std::string remote_key; if (keylen != 0) { remote_key.resize(keylen); - CHECK_EQ(sock.RecvAll(&remote_key[0], keylen), keylen); + ICHECK_EQ(sock.RecvAll(&remote_key[0], keylen), keylen); } auto endpt = RPCEndpoint::Create(std::unique_ptr(new SockChannel(sock)), key, remote_key); diff --git a/src/runtime/stackvm/stackvm.cc b/src/runtime/stackvm/stackvm.cc index 042815b3d68b..4a5211e9c829 100644 --- a/src/runtime/stackvm/stackvm.cc +++ b/src/runtime/stackvm/stackvm.cc @@ -360,7 +360,7 @@ void StackVM::Run(State* s) const { } case PUSH_VALUE: { int relpos = code[pc + 1].v_int; - CHECK_LE(relpos, 0); + ICHECK_LE(relpos, 0); stack[sp + 1] = stack[sp + relpos]; sp += 1; pc += 2; @@ -390,7 +390,7 @@ void StackVM::Run(State* s) const { break; } case ASSERT: { - CHECK(stack[sp].v_int64) << str_data[code[pc + 1].v_int]; + ICHECK(stack[sp].v_int64) << str_data[code[pc + 1].v_int]; sp -= 1; pc += 2; break; @@ -417,8 +417,8 @@ void StackVM::Run(State* s) const { } case ASSERT_SP: { int64_t expected = code[pc + 
1].v_int; - CHECK_EQ(sp, expected) << "sp assertion failed, expected=" << expected << " now=" << sp - << ", pc=" << pc; + ICHECK_EQ(sp, expected) << "sp assertion failed, expected=" << expected << " now=" << sp + << ", pc=" << pc; pc += 2; break; } @@ -594,19 +594,19 @@ void StackVM::Run(State* s) const { break; } } - CHECK_GE(sp, alloca_sp) << "touch allocated space"; - CHECK_LT(sp, stack_cap) << "Stack overflow"; + ICHECK_GE(sp, alloca_sp) << "touch allocated space"; + ICHECK_LT(sp, stack_cap) << "Stack overflow"; } } const PackedFunc& StackVM::GetExtern(State* s, int fid) const { - CHECK_LT(static_cast(fid), extern_func_cache_.size()); + ICHECK_LT(static_cast(fid), extern_func_cache_.size()); // allow race write in this, since write is idempotent PackedFunc& f = extern_func_cache_[fid]; if (f == nullptr) { - CHECK(s->mod_ctx != nullptr) << "No local context is set in stackvm"; + ICHECK(s->mod_ctx != nullptr) << "No local context is set in stackvm"; const PackedFunc* pf = s->mod_ctx->GetFuncFromEnv(extern_func_name[fid]); - CHECK(pf != nullptr); + ICHECK(pf != nullptr); f = *pf; } return f; diff --git a/src/runtime/stackvm/stackvm.h b/src/runtime/stackvm/stackvm.h index 09581a6d0b62..e57cb0b03952 100644 --- a/src/runtime/stackvm/stackvm.h +++ b/src/runtime/stackvm/stackvm.h @@ -162,7 +162,7 @@ class StackVM { /*! * \brief Assert condition is true. * \code - * CHECK(stack[sp]) << str_data[code[pc + 1].v_int]; + * ICHECK(stack[sp]) << str_data[code[pc + 1].v_int]; * sp = sp - 1; * \endcode */ @@ -201,7 +201,7 @@ class StackVM { /*! * \brief debug instruction. * \code - * CHECK_EQ(sp, code[pc + 1]).v_int; + * ICHECK_EQ(sp, code[pc + 1]).v_int; * pc += 2; * \code */ @@ -391,7 +391,7 @@ class StackVM { * \return The load opcode */ static OpCode GetLoad(DLDataType t) { - CHECK_EQ(t.lanes, 1U); + ICHECK_EQ(t.lanes, 1U); if (t.code == kTVMOpaqueHandle) return ARRAY_LOAD_HANDLE; if (t.code == kDLInt) { switch (t.bits) { @@ -420,7 +420,7 @@ class StackVM { * \return The load opcode */ static OpCode GetStore(DLDataType t) { - CHECK_EQ(t.lanes, 1U); + ICHECK_EQ(t.lanes, 1U); if (t.code == kTVMOpaqueHandle) return ARRAY_STORE_HANDLE; if (t.code == kDLInt) { switch (t.bits) { diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc index 88c19362a1f8..c815857ac66f 100644 --- a/src/runtime/stackvm/stackvm_module.cc +++ b/src/runtime/stackvm/stackvm_module.cc @@ -71,7 +71,7 @@ class StackVMModuleNode : public runtime::ModuleNode { strm->Write(num_imports); for (runtime::Module im : imports_) { - CHECK_EQ(im->imports().size(), 0U) << "Only support simply one-level hierarchy"; + ICHECK_EQ(im->imports().size(), 0U) << "Only support simply one-level hierarchy"; std::string tkey = im->type_key(); strm->Write(tkey); LOG(INFO) << "save " << tkey; @@ -100,7 +100,7 @@ class StackVMModuleNode : public runtime::ModuleNode { strm->Read(&num_imports); for (uint64_t i = 0; i < num_imports; ++i) { std::string tkey; - CHECK(strm->Read(&tkey)); + ICHECK(strm->Read(&tkey)); std::string loadkey = "runtime.module.loadbinary_"; std::string fkey = loadkey + tkey; const PackedFunc* f = Registry::Get(fkey); @@ -114,7 +114,7 @@ class StackVMModuleNode : public runtime::ModuleNode { loaders += name.substr(loadkey.size()); } } - CHECK(f != nullptr) + ICHECK(f != nullptr) << "Binary was created using " << tkey << " but a loader of that name is not registered. Available loaders are " << loaders << ". 
Perhaps you need to recompile with this runtime enabled."; diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index bf4133453e7c..9bb00eea1edc 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -21,13 +21,13 @@ * \file thread_pool.cc * \brief Threadpool for multi-threading runtime. */ -#include #include #include #include #include #include #include +#include #if TVM_THREADPOOL_USE_OPENMP #include #endif @@ -189,7 +189,7 @@ class SpscTaskQueue { } const uint32_t head = head_.load(std::memory_order_relaxed); // sanity check if the queue is empty - CHECK(tail_.load(std::memory_order_acquire) != head); + ICHECK(tail_.load(std::memory_order_acquire) != head); *output = buffer_[head]; head_.store((head + 1) % kRingSize, std::memory_order_release); return true; @@ -280,13 +280,13 @@ class ThreadPool { } int Launch(FTVMParallelLambda flambda, void* cdata, int num_task, int need_sync) { ParallelLauncher* launcher = ParallelLauncher::ThreadLocal(); - CHECK(!launcher->is_worker) + ICHECK(!launcher->is_worker) << "Cannot launch parallel job inside worker, consider fuse then parallel"; if (num_task == 0) { num_task = num_workers_used_; } if (need_sync != 0) { - CHECK_LE(num_task, num_workers_used_) + ICHECK_LE(num_task, num_workers_used_) << "Request parallel sync task larger than number of threads used " << " workers=" << num_workers_used_ << " request=" << num_task; } @@ -333,7 +333,7 @@ class ThreadPool { // TODO(tulloch): should we make this configurable via standard APIs? static size_t spin_count = GetSpinCount(); while (queue->Pop(&task, spin_count)) { - CHECK(task.launcher != nullptr); + ICHECK(task.launcher != nullptr); TVMParallelGroupEnv* penv = &(task.launcher->env); void* cdata = task.launcher->cdata; if ((*task.launcher->flambda)(task.task_id, penv, cdata) == 0) { diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index 019df3e597c9..2527f4799086 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -21,8 +21,8 @@ * \file threading_backend.cc * \brief Native threading backend */ -#include #include +#include #include #include @@ -46,7 +46,7 @@ class ThreadGroup::Impl { public: Impl(int num_workers, std::function worker_callback, bool exclude_worker0) : num_workers_(num_workers) { - CHECK_GE(num_workers, 1) << "Requested a non-positive number of worker threads."; + ICHECK_GE(num_workers, 1) << "Requested a non-positive number of worker threads."; for (int i = exclude_worker0; i < num_workers_; ++i) { threads_.emplace_back([worker_callback, i] { worker_callback(i); }); } @@ -112,7 +112,7 @@ class ThreadGroup::Impl { #endif #endif #if defined(__linux__) || defined(__ANDROID__) - CHECK_GE(sorted_order_.size(), num_workers_); + ICHECK_GE(sorted_order_.size(), num_workers_); for (unsigned i = 0; i < threads_.size(); ++i) { unsigned core_id; diff --git a/src/runtime/vm/bytecode.cc b/src/runtime/vm/bytecode.cc index 78972beb1ed2..f82d708468f7 100644 --- a/src/runtime/vm/bytecode.cc +++ b/src/runtime/vm/bytecode.cc @@ -22,7 +22,6 @@ * \brief The bytecode for Relay virtual machine. */ -#include #include #include diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 08e9af61fdc3..eb1707b25aa3 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -43,9 +43,9 @@ namespace tvm { namespace runtime { namespace vm { -#define STREAM_CHECK(val, section) \ - CHECK(val) << "Invalid VM file format in the " << section << " section." 
\ - << "\n"; +#define STREAM_CHECK(val, section) \ + ICHECK(val) << "Invalid VM file format in the " << section << " section." \ + << "\n"; // Helper to serialize a vm instruction. VMInstructionSerializer SerializeInstruction(const Instruction& instr); @@ -527,7 +527,7 @@ void Executable::LoadConstantSection(dmlc::Stream* strm) { // Load the const to device mapping. std::vector const_device_type; STREAM_CHECK(strm->Read(&const_device_type), "constant"); - CHECK_EQ(size, const_device_type.size()); + ICHECK_EQ(size, const_device_type.size()); for (auto dev : const_device_type) { this->const_device_type.push_back(static_cast(dev)); } @@ -545,7 +545,7 @@ void Executable::LoadPrimitiveOpNames(dmlc::Stream* strm) { // `instr_fields`. inline std::vector ExtractFields(const std::vector& instr_fields, Index start, Index cnt) { - CHECK_LE(static_cast(start + cnt), instr_fields.size()); + ICHECK_LE(static_cast(start + cnt), instr_fields.size()); std::vector ret; for (auto i = start; i < start + cnt; i++) { ret.push_back(instr_fields[i]); @@ -765,8 +765,8 @@ void Executable::LoadCodeSection(dmlc::Stream* strm) { VMFunction vm_func = VMFunction(loaded_func.name, loaded_func.params, instructions, loaded_func.register_file_size, loaded_func.params_device_type); auto it = this->global_map.find(loaded_func.name); - CHECK(it != this->global_map.end()); - CHECK_LE(it->second, this->global_map.size()); + ICHECK(it != this->global_map.end()); + ICHECK_LE(it->second, this->global_map.size()); this->functions[it->second] = vm_func; } } @@ -774,14 +774,14 @@ void Executable::LoadCodeSection(dmlc::Stream* strm) { TVM_REGISTER_GLOBAL("runtime.GetNumOfGlobals").set_body([](TVMArgs args, TVMRetValue* rv) { runtime::Module mod = args[0]; const auto* exec = dynamic_cast(mod.operator->()); - CHECK(exec); + ICHECK(exec); *rv = static_cast(exec->global_map.size()); }); TVM_REGISTER_GLOBAL("runtime.GetGlobalFields").set_body([](TVMArgs args, TVMRetValue* rv) { runtime::Module mod = args[0]; const auto* exec = dynamic_cast(mod.operator->()); - CHECK(exec); + ICHECK(exec); int idx = args[1]; std::vector > globals(exec->global_map.begin(), exec->global_map.end()); @@ -789,24 +789,24 @@ TVM_REGISTER_GLOBAL("runtime.GetGlobalFields").set_body([](TVMArgs args, TVMRetV return a.second < b.second; }; std::sort(globals.begin(), globals.end(), comp); - CHECK_LT(idx, globals.size()); + ICHECK_LT(idx, globals.size()); *rv = globals[idx].first; }); TVM_REGISTER_GLOBAL("runtime.GetNumOfPrimitives").set_body([](TVMArgs args, TVMRetValue* rv) { runtime::Module mod = args[0]; const auto* exec = dynamic_cast(mod.operator->()); - CHECK(exec); + ICHECK(exec); *rv = static_cast(exec->primitive_map.size()); }); TVM_REGISTER_GLOBAL("runtime.GetPrimitiveFields").set_body([](TVMArgs args, TVMRetValue* rv) { runtime::Module mod = args[0]; const auto* exec = dynamic_cast(mod.operator->()); - CHECK(exec); + ICHECK(exec); int idx = args[1]; - CHECK_GE(idx, 0); - CHECK_LT(idx, exec->primitive_map.size()); + ICHECK_GE(idx, 0); + ICHECK_LT(idx, exec->primitive_map.size()); for (const auto& it : exec->primitive_map) { if (idx == static_cast(it.second)) { diff --git a/src/runtime/vm/memory_manager.cc b/src/runtime/vm/memory_manager.cc index 4d443d9a26a2..4e480507e71a 100644 --- a/src/runtime/vm/memory_manager.cc +++ b/src/runtime/vm/memory_manager.cc @@ -35,7 +35,7 @@ namespace vm { static void BufferDeleter(Object* obj) { auto* ptr = static_cast(obj); - CHECK(ptr->manager_ctx != nullptr); + ICHECK(ptr->manager_ctx != nullptr); Buffer* buffer = 
reinterpret_cast(ptr->manager_ctx); MemoryManager::GetAllocator(buffer->ctx)->Free(*(buffer)); delete buffer; @@ -59,15 +59,15 @@ void StorageObj::Deleter(Object* obj) { } inline void VerifyDataType(DLDataType dtype) { - CHECK_GE(dtype.lanes, 1); + ICHECK_GE(dtype.lanes, 1); if (dtype.code == kDLFloat) { - CHECK_EQ(dtype.bits % 8, 0); + ICHECK_EQ(dtype.bits % 8, 0); } else { // allow uint1 as a special flag for bool. if (dtype.bits == 1 && dtype.code == kDLUInt) return; - CHECK_EQ(dtype.bits % 8, 0); + ICHECK_EQ(dtype.bits % 8, 0); } - CHECK_EQ(dtype.bits & (dtype.bits - 1), 0); + ICHECK_EQ(dtype.bits & (dtype.bits - 1), 0); } inline size_t GetDataAlignment(const DLTensor& arr) { @@ -102,7 +102,7 @@ NDArray StorageObj::AllocNDArray(size_t offset, std::vector shape, DLDa NDArray ret(GetObjectPtr(container)); // RAII in effect, now run the check. - CHECK(offset + needed_size <= this->buffer.size) + ICHECK(offset + needed_size <= this->buffer.size) << "storage allocation failure, attempted to allocate " << needed_size << " at offset " << offset << " in region that is " << this->buffer.size << "bytes"; diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index 63001634558e..94d827893b92 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -43,7 +43,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (name == "get_stat") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.size(), 1U); + ICHECK_EQ(args.size(), 1U); std::vector> op_acc_time; for (auto kv : op_durations_) { auto val = @@ -95,7 +95,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, void VirtualMachineDebug::LoadExecutable(const Executable* exec) { VirtualMachine::LoadExecutable(exec); - CHECK(exec_); + ICHECK(exec_); for (auto kv : exec_->primitive_map) { packed_index_map_[kv.second] = kv.first; op_invokes_[kv.second] = 0; @@ -104,17 +104,17 @@ void VirtualMachineDebug::LoadExecutable(const Executable* exec) { void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count, Index output_size, const std::vector& args) { - CHECK(exec_); - CHECK(!ctxs_.empty()) << "Context has not been initialized yet."; + ICHECK(exec_); + ICHECK(!ctxs_.empty()) << "Context has not been initialized yet."; // The device context of any input of the operator is used for // synchronization. - CHECK_GT(arg_count, 0U); + ICHECK_GT(arg_count, 0U); ObjectRef arg = args[0]; while (arg->IsInstance()) { ADT adt = Downcast(arg); arg = adt[0]; } - CHECK(arg->IsInstance()); + ICHECK(arg->IsInstance()); auto nd_array = Downcast(arg); auto ctx = nd_array->ctx; @@ -140,8 +140,8 @@ runtime::Module CreateVirtualMachineDebug(const Executable* exec) { TVM_REGISTER_GLOBAL("runtime._VirtualMachineDebug").set_body([](TVMArgs args, TVMRetValue* rv) { runtime::Module mod = args[0]; const auto* exec = dynamic_cast(mod.operator->()); - CHECK(exec) << "Virtual machine has not been defined yet." - << "\n"; + ICHECK(exec) << "Virtual machine has not been defined yet." 
+ << "\n"; *rv = CreateVirtualMachineDebug(exec); }); diff --git a/src/runtime/vm/serialize_utils.h b/src/runtime/vm/serialize_utils.h index 726a46ee2fa1..990da31750d4 100644 --- a/src/runtime/vm/serialize_utils.h +++ b/src/runtime/vm/serialize_utils.h @@ -79,8 +79,8 @@ struct VMFunctionSerializer { bool Load(dmlc::Stream* strm) { std::vector func_info; if (!strm->Read(&func_info)) return false; - CHECK_EQ(func_info.size(), 3U) << "Failed to decode the vm function." - << "\n"; + ICHECK_EQ(func_info.size(), 3U) << "Failed to decode the vm function." + << "\n"; name = func_info[0]; register_file_size = std::stoll(func_info[1]); // Get the number of instructions. @@ -135,7 +135,7 @@ struct VMInstructionSerializer { bool Load(dmlc::Stream* strm) { std::vector instr; if (!strm->Read(&instr)) return false; - CHECK_GE(instr.size(), 2U); + ICHECK_GE(instr.size(), 2U); Index loaded_hash = instr[0]; opcode = instr[1]; @@ -144,7 +144,7 @@ struct VMInstructionSerializer { } Index hash = Hash(); - CHECK_EQ(loaded_hash, hash) << "Found mismatch in hash for opcode: " << opcode << "\n"; + ICHECK_EQ(loaded_hash, hash) << "Found mismatch in hash for opcode: " << opcode << "\n"; return true; } diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 0a0ff2697674..473b5d759272 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -70,7 +70,7 @@ inline ObjectRef CopyTo(ObjectRef src, const DLContext& ctx) { } return src; } else { - CHECK(src->IsInstance()) + ICHECK(src->IsInstance()) << "VM data must be NDArray or a list of NDArray, but received: " << src->_type_key; std::vector ret; ADT adt = Downcast(src); @@ -93,7 +93,7 @@ std::vector ToShape(NDArray shape_tensor) { // Otherwise we should be rank-1, and we will extract the number of dimensions // for the output vector. 
- CHECK_EQ(rank, 1U) << "shape tensor should be a k-length vector, found " << rank; + ICHECK_EQ(rank, 1U) << "shape tensor should be a k-length vector, found " << rank; int64_t ndim = shape_tensor.Shape().at(0); shape.resize(ndim); @@ -115,24 +115,24 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (name == "invoke") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK(exec_) << "The executable is not created yet."; + ICHECK(exec_) << "The executable is not created yet."; std::string func_name = args[0]; auto git = exec_->global_map.find(func_name); - CHECK(git != exec_->global_map.end()) + ICHECK(git != exec_->global_map.end()) << "Cannot find function " << func_name << " in the executable"; auto func = exec_->functions[git->second]; if (func.params.empty()) { *rv = Invoke(func, {}); } else { auto it = inputs_.find(func_name); - CHECK(it != inputs_.end()) << "Input has not been set for function " << func_name; + ICHECK(it != inputs_.end()) << "Input has not been set for function " << func_name; const std::vector& func_args = it->second; *rv = Invoke(func, func_args); } }); } else if (name == "init") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args.size() % 3, 0); + ICHECK_EQ(args.size() % 3, 0); std::vector contexts; std::vector alloc_types; for (int i = 0; i < args.size() / 3; ++i) { @@ -148,16 +148,16 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, }); } else if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - CHECK(exec_) << "The executable is not created yet."; + ICHECK(exec_) << "The executable is not created yet."; std::string func_name = args[0]; auto gvit = exec_->global_map.find(func_name); - CHECK(gvit != exec_->global_map.end()) << "Cannot find function " << func_name; + ICHECK(gvit != exec_->global_map.end()) << "Cannot find function " << func_name; auto func_index = gvit->second; const auto& vm_func = exec_->functions[func_index]; const auto& param_names = vm_func.params; - CHECK_EQ(args.size() - 1, param_names.size()) + ICHECK_EQ(args.size() - 1, param_names.size()) << "The number of provided parameters doesn't match the number of arguments"; - CHECK_EQ(param_names.size(), vm_func.params_device_type.size()) + ICHECK_EQ(param_names.size(), vm_func.params_device_type.size()) << "The number of provided parameters doesn't match the number of assigned devices"; std::vector func_args(param_names.size()); for (int i = 1; i < args.size(); ++i) { @@ -176,10 +176,10 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, } inline TVMContext VirtualMachine::GetContext(Index device_type) const { - CHECK_GE(ctxs_.size(), device_type) << "ctxs_ list doesn't contain device:" << device_type; + ICHECK_GE(ctxs_.size(), device_type) << "ctxs_ list doesn't contain device:" << device_type; auto ctx = ctxs_[device_type]; - CHECK_EQ(static_cast(ctx.device_type), device_type) + ICHECK_EQ(static_cast(ctx.device_type), device_type) << "device type " << device_type << " has not been initialized int the context list."; return ctx; } @@ -190,7 +190,7 @@ void VirtualMachine::PushFrame(Index arg_count, Index ret_pc, const VMFunction& } Index VirtualMachine::PopFrame() { - CHECK_GT(frames_.size(), 0); + ICHECK_GT(frames_.size(), 0); const VMFrame& fr = frames_.back(); func_index_ = fr.func_index; code_ = fr.code; @@ -222,9 +222,9 @@ ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector& 
args) { - CHECK(exec_) << "The executable has not been created yet."; + ICHECK(exec_) << "The executable has not been created yet."; auto it = exec_->global_map.find(name); - CHECK(it != exec_->global_map.end()) << "Cannot find function " << name << " in the executable"; + ICHECK(it != exec_->global_map.end()) << "Cannot find function " << name << " in the executable"; auto func_index_ = it->second; DLOG(INFO) << "Invoke Global " << name << " at index " << func_index_; return Invoke(exec_->functions[func_index_], args); @@ -263,12 +263,12 @@ void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, In } void VirtualMachine::LoadExecutable(const Executable* exec) { - CHECK(exec) << "The executable is not created yet."; + ICHECK(exec) << "The executable is not created yet."; exec_ = exec; runtime::Module lib = exec_->lib; // Get the list of packed functions. - CHECK(exec->primitive_map.empty() || lib.operator->()) + ICHECK(exec->primitive_map.empty() || lib.operator->()) << "runtime module should have been built for primitive functions" << "\n"; for (const auto& it : exec_->primitive_map) { @@ -278,17 +278,17 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { packed_funcs_.resize(packed_index + 1); } tvm::runtime::PackedFunc pf = lib.GetFunction(packed_name, true); - CHECK(pf != nullptr) << "Cannot find function in module: " << packed_name; + ICHECK(pf != nullptr) << "Cannot find function in module: " << packed_name; packed_funcs_[packed_index] = pf; } for (size_t i = 0; i < packed_funcs_.size(); ++i) { - CHECK(packed_funcs_[i] != nullptr) << "Packed function " << i << " is not initialized"; + ICHECK(packed_funcs_[i] != nullptr) << "Packed function " << i << " is not initialized"; } } void VirtualMachine::Init(const std::vector& ctxs, const std::vector& alloc_types) { - CHECK_EQ(ctxs.size(), alloc_types.size()); + ICHECK_EQ(ctxs.size(), alloc_types.size()); // Cache the context for (size_t i = 0; i < ctxs.size(); i++) { auto dev_type = static_cast(ctxs[i].device_type); @@ -343,8 +343,8 @@ inline int64_t VirtualMachine::LoadScalarInt(Index r) const { } void VirtualMachine::RunLoop() { - CHECK(this->exec_); - CHECK(this->code_); + ICHECK(this->exec_); + ICHECK(this->code_); pc_ = 0; Index frame_start = frames_.size(); while (true) { @@ -398,7 +398,7 @@ void VirtualMachine::RunLoop() { } case Opcode::InvokePacked: { DLOG(INFO) << "InvokedPacked " << instr.packed_index << " arity=" << instr.arity; - CHECK_LE(instr.packed_index, packed_funcs_.size()); + ICHECK_LE(instr.packed_index, packed_funcs_.size()); const auto& func = packed_funcs_[instr.packed_index]; const auto& arity = instr.arity; std::vector args; @@ -456,10 +456,10 @@ void VirtualMachine::RunLoop() { int32_t target_val = LoadScalarInt(instr.if_op.target); if (test_val == target_val) { - CHECK_NE(instr.if_op.true_offset, 0); + ICHECK_NE(instr.if_op.true_offset, 0); pc_ += instr.if_op.true_offset; } else { - CHECK_NE(instr.if_op.false_offset, 0); + ICHECK_NE(instr.if_op.false_offset, 0); pc_ += instr.if_op.false_offset; } @@ -524,10 +524,10 @@ void VirtualMachine::RunLoop() { auto storage_obj = SimpleObjAllocator().make_object(); auto dev_type = instr.alloc_storage.device_type; - CHECK_LT(static_cast(dev_type), allocators_.size()) + ICHECK_LT(static_cast(dev_type), allocators_.size()) << "Memory allocator for device " << dev_type << " has not been initialized"; auto* alloc = allocators_[dev_type]; - CHECK(alloc) << "Did you forget to init the VirtualMachine with contexts?"; + ICHECK(alloc) << "Did you 
forget to init the VirtualMachine with contexts?"; storage_obj->buffer = alloc->Alloc(size, alignment, instr.alloc_storage.dtype_hint); Storage storage(storage_obj); WriteRegister(instr.dst, storage); @@ -569,8 +569,8 @@ void VirtualMachine::RunLoop() { auto shape_obj = ReadRegister(instr.reshape_tensor.newshape); NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_ctx)); const DLTensor* dl_tensor = shape_tensor.operator->(); - CHECK_EQ(dl_tensor->dtype.code, 0u); - CHECK_EQ(dl_tensor->dtype.bits, 64); + ICHECK_EQ(dl_tensor->dtype.code, 0u); + ICHECK_EQ(dl_tensor->dtype.bits, 64); int64_t* dims = reinterpret_cast(dl_tensor->data); int64_t ndim = shape_tensor->shape[0]; std::vector shape(dims, dims + ndim); @@ -584,7 +584,7 @@ void VirtualMachine::RunLoop() { auto tensor_src = ReadRegister(instr.src); NDArray src_data = Downcast(tensor_src); DLContext src_ctx = src_data->ctx; - CHECK_EQ(static_cast(src_ctx.device_type), instr.src_device_type); + ICHECK_EQ(static_cast(src_ctx.device_type), instr.src_device_type); DLContext dst_ctx; dst_ctx.device_type = static_cast(instr.dst_device_type); @@ -610,7 +610,7 @@ runtime::Module CreateVirtualMachine(const Executable* exec) { TVM_REGISTER_GLOBAL("runtime._VirtualMachine").set_body([](TVMArgs args, TVMRetValue* rv) { runtime::Module mod = args[0]; const auto* exec = dynamic_cast(mod.operator->()); - CHECK(exec) << "The virtual machine executable has not been defined yet."; + ICHECK(exec) << "The virtual machine executable has not been defined yet."; *rv = CreateVirtualMachine(exec); }); diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 3cbe245ed095..cbf1974ee3c7 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -202,7 +202,7 @@ class VulkanDeviceAPI final : public DeviceAPI { void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { - CHECK(stream == nullptr); + ICHECK(stream == nullptr); TVMContext ctx = ctx_from; if (ctx_from.device_type == kDLCPU) { ctx = ctx_to; @@ -223,7 +223,7 @@ class VulkanDeviceAPI final : public DeviceAPI { copy_info.size = size; vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, to_buf->buffer, 1, ©_info); // 2: barrier(transfer-> compute|transfer) - CHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Vulkan disallow cross device copy."; + ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Vulkan disallow cross device copy."; VkMemoryBarrier barrier_info; barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; barrier_info.pNext = nullptr; @@ -324,7 +324,7 @@ class VulkanDeviceAPI final : public DeviceAPI { } void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - CHECK(stream == nullptr); + ICHECK(stream == nullptr); VulkanThreadEntry::ThreadLocal()->Stream(ctx.device_id)->Synchronize(); } @@ -347,7 +347,7 @@ class VulkanDeviceAPI final : public DeviceAPI { } const VulkanContext& context(size_t device_id) const { - CHECK_LT(device_id, context_.size()); + ICHECK_LT(device_id, context_.size()); return context_[device_id]; } @@ -363,7 +363,7 @@ void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* *rv = static_cast(index < context_.size()); return; } - CHECK_LT(index, context_.size()) << "Invalid device id " << index; + ICHECK_LT(index, context_.size()) << "Invalid device id " << index; const auto& vctx = context(index); switch (kind) { case kMaxThreadsPerBlock: { @@ -600,7 +600,7 @@ 
VulkanDeviceAPI::VulkanDeviceAPI() { ctx.coherent_staging = ty.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } } - CHECK_GE(win_rank, 0) << "Cannot find suitable staging memory on device."; + ICHECK_GE(win_rank, 0) << "Cannot find suitable staging memory on device."; win_rank = -1; for (uint32_t k = 0; k < prop.memoryTypeCount; ++k) { @@ -619,7 +619,7 @@ VulkanDeviceAPI::VulkanDeviceAPI() { ctx.compute_mtype_index = k; } } - CHECK_GE(win_rank, 0) << "Cannot find suitable local memory on device."; + ICHECK_GE(win_rank, 0) << "Cannot find suitable local memory on device."; auto has_extension = [&extensions](const char* query) { return std::any_of(extensions.begin(), extensions.end(), [&](const char* extension) { return std::strcmp(query, extension) == 0; }); @@ -740,8 +740,8 @@ class VulkanModuleNode final : public runtime::ModuleNode { const char* type_key() const final { return "vulkan"; } PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { - CHECK_EQ(sptr_to_self.get(), this); - CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; + ICHECK_EQ(sptr_to_self.get(), this); + ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; auto it = fmap_.find(name); if (it == fmap_.end()) return PackedFunc(); const FunctionInfo& info = it->second; @@ -757,7 +757,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { for (size_t device_id = 0; device_id < ecache_.size(); ++device_id) { for (auto& kv : ecache_[device_id]) { auto& pe = kv.second; - CHECK(pe); + ICHECK(pe); const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); if (pe->descriptor_update_template != VK_NULL_HANDLE) { @@ -786,7 +786,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { { // create shader auto sit = smap_.find(func_name); - CHECK(sit != smap_.end()); + ICHECK(sit != smap_.end()); const std::vector& data = sit->second.data; VkShaderModuleCreateInfo shader_cinfo; shader_cinfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; @@ -802,7 +802,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { { auto fit = fmap_.find(func_name); - CHECK(fit != fmap_.end()); + ICHECK(fit != fmap_.end()); for (DLDataType arg_type : fit->second.arg_types) { if (arg_type.code == kTVMOpaqueHandle) { { @@ -885,7 +885,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { if (num_pack_args != 0) { playout_cinfo.pushConstantRangeCount = 1; playout_cinfo.pPushConstantRanges = &crange; - CHECK_LE(crange.size, vctx.phy_device_prop.limits.maxPushConstantsSize); + ICHECK_LE(crange.size, vctx.phy_device_prop.limits.maxPushConstantsSize); } else { playout_cinfo.pushConstantRangeCount = 0; playout_cinfo.pPushConstantRanges = nullptr; @@ -932,7 +932,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { void SaveToFile(const std::string& file_name, const std::string& format) final { std::string fmt = GetFileFormat(file_name, format); - CHECK_EQ(fmt, fmt_) << "Can only save to customized format vulkan"; + ICHECK_EQ(fmt, fmt_) << "Can only save to customized format vulkan"; std::string meta_file = GetMetaFilePath(file_name); SaveMetaDataToFile(meta_file, fmap_); std::string data_bin; @@ -1046,7 +1046,7 @@ VulkanStream* VulkanThreadEntry::Stream(size_t device_id) { void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const { int device_id = VulkanThreadEntry::ThreadLocal()->ctx.device_id; - CHECK_LT(device_id, kVulkanMaxNumDevice); + ICHECK_LT(device_id, 
kVulkanMaxNumDevice); const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); if (!scache_[device_id]) { scache_[device_id] = m_->GetPipeline(device_id, func_name_, num_pack_args_); } @@ -1067,7 +1067,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion // Can safely capture by reference as this lambda is immediately executed on the calling thread. VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Launch([&](VulkanStreamState* state) { vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); - CHECK(pipeline->descriptor_update_template != VK_NULL_HANDLE); + ICHECK(pipeline->descriptor_update_template != VK_NULL_HANDLE); vctx.descriptor_template_khr_functions->vkCmdPushDescriptorSetWithTemplateKHR( state->cmd_buffer_, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptor_buffers.data()); @@ -1152,7 +1152,7 @@ Module VulkanModuleLoadFile(const std::string& file_name, const std::string& for dmlc::Stream* stream = &fs; uint32_t magic; stream->Read(&magic); - CHECK_EQ(magic, kVulkanModuleMagic) << "VulkanModule Magic mismatch"; + ICHECK_EQ(magic, kVulkanModuleMagic) << "VulkanModule Magic mismatch"; stream->Read(&smap); return VulkanModuleCreate(smap, fmap, ""); } diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index 780b11184931..da604f6fa792 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -18,10 +18,10 @@ */ #pragma once -#include <dmlc/logging.h> #include #include #include +#include <tvm/support/logging.h> #include #include @@ -80,10 +80,10 @@ inline const char* VKGetErrorString(VkResult error) { * \brief Protected Vulkan call * \param func Expression to call. */ -#define VULKAN_CHECK_ERROR(__e) \ - { \ - CHECK(__e == VK_SUCCESS) << "Vulan Error, code=" << __e << ": " \ - << vulkan::VKGetErrorString(__e); \ +#define VULKAN_CHECK_ERROR(__e) \ + { \ + ICHECK(__e == VK_SUCCESS) << "Vulkan Error, code=" << __e << ": " \ + << vulkan::VKGetErrorString(__e); \ } #define VULKAN_CALL(func) \ diff --git a/src/runtime/vulkan/vulkan_shader.h b/src/runtime/vulkan/vulkan_shader.h index d56ca61e91cb..7558a95ee45e 100644 --- a/src/runtime/vulkan/vulkan_shader.h +++ b/src/runtime/vulkan/vulkan_shader.h @@ -18,10 +18,10 @@ */ #pragma once -#include <dmlc/logging.h> #include #include #include +#include <tvm/support/logging.h> #include diff --git a/src/runtime/vulkan/vulkan_stream.h b/src/runtime/vulkan/vulkan_stream.h index 388cacc577b0..c5094bdf28db 100644 --- a/src/runtime/vulkan/vulkan_stream.h +++ b/src/runtime/vulkan/vulkan_stream.h @@ -93,7 +93,7 @@ class VulkanStream { void LaunchDeferred(const std::function<void()>& deferred_initializer, const std::function<void()>& deferred_kernel, const VulkanStreamToken& deferred_token) { - CHECK(!vctx_->UseImmediate()); + ICHECK(!vctx_->UseImmediate()); // It is invalid to schedule this instance on the current stream if we already // have a matching descriptor set and a non-matching buffer set.
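The rewrite in this series is mechanical: every dmlc-style CHECK assertion on an internal invariant becomes the matching ICHECK, and #include <dmlc/logging.h> gives way to #include <tvm/support/logging.h>. For readers unfamiliar with the idiom, the sketch below is an illustrative stand-in, not code from this patch: it assumes a hypothetical LogFatal helper and only shows the stream-style, abort-on-failure shape that the ICHECK/ICHECK_EQ call sites above rely on.

    #include <cstdlib>
    #include <iostream>
    #include <sstream>

    // Hypothetical helper: buffers a message and, when destroyed at the end of
    // the full expression, prints it and aborts. The real macros in
    // tvm/support/logging.h follow the same pattern and add richer context.
    class LogFatal {
     public:
      LogFatal(const char* file, int line) { stream_ << file << ":" << line << ": "; }
      std::ostringstream& stream() { return stream_; }
      ~LogFatal() { std::cerr << stream_.str() << std::endl; std::abort(); }
     private:
      std::ostringstream stream_;
    };

    // Internal-invariant checks: the condition should hold unless the runtime
    // itself has a bug, so a failure logs the expression text and terminates.
    #define ICHECK(cond) \
      if (!(cond)) LogFatal(__FILE__, __LINE__).stream() << "Check failed: " #cond << ": "
    #define ICHECK_EQ(a, b) ICHECK((a) == (b)) << "(" << (a) << " vs " << (b) << ") "

    int main() {
      int sp = 3, expected = 3;
      ICHECK_EQ(sp, expected) << "sp assertion failed";   // holds, so nothing is evaluated
      ICHECK(expected > 0) << "expected must be positive";
      return 0;
    }

Because the failure branch streams into a temporary whose destructor aborts, a call site reads exactly like the CHECK it replaces, which is what makes a purely textual CHECK-to-ICHECK substitution like the one in this series safe.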
diff --git a/src/runtime/workspace_pool.cc b/src/runtime/workspace_pool.cc index 49a4c961159d..2d347c32ac10 100644 --- a/src/runtime/workspace_pool.cc +++ b/src/runtime/workspace_pool.cc @@ -95,7 +95,7 @@ class WorkspacePool::Pool { int index = static_cast<int>(allocated_.size()) - 2; for (; index > 0 && allocated_[index].data != data; --index) { } - CHECK_GT(index, 0) << "trying to free things that has not been allocated"; + ICHECK_GT(index, 0) << "trying to free things that have not been allocated"; e = allocated_[index]; allocated_.erase(allocated_.begin() + index); } @@ -115,7 +115,7 @@ class WorkspacePool::Pool { } // Release all resources void Release(TVMContext ctx, DeviceAPI* device) { - CHECK_EQ(allocated_.size(), 1); + ICHECK_EQ(allocated_.size(), 1); for (size_t i = 1; i < free_list_.size(); ++i) { device->FreeDataSpace(ctx, free_list_[i].data); } @@ -160,7 +160,7 @@ void* WorkspacePool::AllocWorkspace(TVMContext ctx, size_t size) { } void WorkspacePool::FreeWorkspace(TVMContext ctx, void* ptr) { - CHECK(static_cast<size_t>(ctx.device_id) < array_.size() && array_[ctx.device_id] != nullptr); + ICHECK(static_cast<size_t>(ctx.device_id) < array_.size() && array_[ctx.device_id] != nullptr); array_[ctx.device_id]->Free(ptr); } diff --git a/src/support/base64.h b/src/support/base64.h index 9849542471c2..901922db8edc 100644 --- a/src/support/base64.h +++ b/src/support/base64.h @@ -26,7 +26,7 @@ #ifndef TVM_SUPPORT_BASE64_H_ #define TVM_SUPPORT_BASE64_H_ -#include <dmlc/logging.h> +#include <tvm/support/logging.h> #include #include @@ -154,7 +154,7 @@ class Base64InStream : public dmlc::Stream { { // second byte temp_ch_ = reader_.GetChar(); - CHECK(temp_ch_ != EOF && !isspace(temp_ch_)) << "invalid base64 format"; + ICHECK(temp_ch_ != EOF && !isspace(temp_ch_)) << "invalid base64 format"; nvalue |= DecodeTable[temp_ch_] << 12; *cptr++ = (nvalue >> 16) & 0xFF; --tlen; @@ -162,13 +162,13 @@ { // third byte temp_ch_ = reader_.GetChar(); - CHECK(temp_ch_ != EOF && !isspace(temp_ch_)) << "invalid base64 format"; + ICHECK(temp_ch_ != EOF && !isspace(temp_ch_)) << "invalid base64 format"; // handle termination if (temp_ch_ == '=') { temp_ch_ = reader_.GetChar(); - CHECK(temp_ch_ == '=') << "invalid base64 format"; + ICHECK(temp_ch_ == '=') << "invalid base64 format"; temp_ch_ = reader_.GetChar(); - CHECK(temp_ch_ == EOF || isspace(temp_ch_)) << "invalid base64 format"; + ICHECK(temp_ch_ == EOF || isspace(temp_ch_)) << "invalid base64 format"; break; } nvalue |= DecodeTable[temp_ch_] << 6; @@ -182,10 +182,10 @@ { // fourth byte temp_ch_ = reader_.GetChar(); - CHECK(temp_ch_ != EOF && !isspace(temp_ch_)) << "invalid base64 format"; + ICHECK(temp_ch_ != EOF && !isspace(temp_ch_)) << "invalid base64 format"; if (temp_ch_ == '=') { temp_ch_ = reader_.GetChar(); - CHECK(temp_ch_ == EOF || isspace(temp_ch_)) << "invalid base64 format"; + ICHECK(temp_ch_ == EOF || isspace(temp_ch_)) << "invalid base64 format"; break; } nvalue |= DecodeTable[temp_ch_]; @@ -200,7 +200,7 @@ temp_ch_ = reader_.GetChar(); } if (kStrictCheck) { - CHECK_EQ(tlen, 0) << "Base64InStream: read incomplete"; + ICHECK_EQ(tlen, 0) << "Base64InStream: read incomplete"; } return size - tlen; } diff --git a/src/support/parallel_for.cc b/src/support/parallel_for.cc index 0b8c810da70b..f4756c29adeb 100644 --- a/src/support/parallel_for.cc +++ b/src/support/parallel_for.cc @@ -21,7 +21,7 @@ * \file parallel_for.cc * \brief An implementation to run a loop in parallel.
*/ -#include +#include #include #include @@ -34,8 +34,8 @@ namespace support { std::vector> rr_partitioner(int begin, int end, int step, int num_threads) { int total_task_count = (end - begin) / step; - CHECK_GE(total_task_count, 0) << "Infinite loop condition with begin: " << begin - << " end: " << end << " step: " << step; + ICHECK_GE(total_task_count, 0) << "Infinite loop condition with begin: " << begin + << " end: " << end << " step: " << step; std::vector> ret; ret.reserve(num_threads); for (size_t thread = 0; begin < end; begin += step, thread = (thread + 1) % num_threads) { @@ -53,8 +53,8 @@ void parallel_for(int begin, int end, const std::function& f, int ste static std::mutex M_GLOBAL_PARALLEL_FOR_FLAG; { std::unique_lock l(M_GLOBAL_PARALLEL_FOR_FLAG); - CHECK(!GLOBAL_PARALLEL_FOR_FLAG) << "There's another parallel_for running. Maybe you're " - << "currently inside another parallel_for loop."; + ICHECK(!GLOBAL_PARALLEL_FOR_FLAG) << "There's another parallel_for running. Maybe you're " + << "currently inside another parallel_for loop."; GLOBAL_PARALLEL_FOR_FLAG = true; } @@ -81,7 +81,7 @@ void parallel_for(int begin, int end, const std::function& f, int ste } { std::unique_lock l(M_GLOBAL_PARALLEL_FOR_FLAG); - CHECK(GLOBAL_PARALLEL_FOR_FLAG); + ICHECK(GLOBAL_PARALLEL_FOR_FLAG); GLOBAL_PARALLEL_FOR_FLAG = false; } try { diff --git a/src/support/pipe.h b/src/support/pipe.h index dcebd0ddf32f..3c1356ba174c 100644 --- a/src/support/pipe.h +++ b/src/support/pipe.h @@ -25,7 +25,7 @@ #define TVM_SUPPORT_PIPE_H_ #include -#include +#include #ifdef _WIN32 #include @@ -64,12 +64,12 @@ class Pipe : public dmlc::Stream { if (size == 0) return 0; #ifdef _WIN32 DWORD nread; - CHECK(ReadFile(handle_, static_cast(ptr), &nread, nullptr)) + ICHECK(ReadFile(handle_, static_cast(ptr), &nread, nullptr)) << "Read Error: " << GetLastError(); #else ssize_t nread; nread = read(handle_, ptr, size); - CHECK_GE(nread, 0) << "Write Error: " << strerror(errno); + ICHECK_GE(nread, 0) << "Write Error: " << strerror(errno); #endif return static_cast(nread); } @@ -83,13 +83,13 @@ class Pipe : public dmlc::Stream { if (size == 0) return; #ifdef _WIN32 DWORD nwrite; - CHECK(WriteFile(handle_, static_cast(ptr), &nwrite, nullptr) && - static_cast(nwrite) == size) + ICHECK(WriteFile(handle_, static_cast(ptr), &nwrite, nullptr) && + static_cast(nwrite) == size) << "Write Error: " << GetLastError(); #else ssize_t nwrite; nwrite = write(handle_, ptr, size); - CHECK_EQ(static_cast(nwrite), size) << "Write Error: " << strerror(errno); + ICHECK_EQ(static_cast(nwrite), size) << "Write Error: " << strerror(errno); #endif } /*! diff --git a/src/support/ring_buffer.h b/src/support/ring_buffer.h index a3938491f1d1..af814158f7b6 100644 --- a/src/support/ring_buffer.h +++ b/src/support/ring_buffer.h @@ -93,7 +93,7 @@ class RingBuffer { * \param size The number of bytes to read. 
*/ void Read(void* data, size_t size) { - CHECK_GE(bytes_available_, size); + ICHECK_GE(bytes_available_, size); size_t ncopy = std::min(size, ring_.size() - head_ptr_); memcpy(data, &ring_[0] + head_ptr_, ncopy); if (ncopy < size) { @@ -112,7 +112,7 @@ class RingBuffer { template size_t ReadWithCallback(FSend fsend, size_t max_nbytes) { size_t size = std::min(max_nbytes, bytes_available_); - CHECK_NE(size, 0U); + ICHECK_NE(size, 0U); size_t ncopy = std::min(size, ring_.size() - head_ptr_); size_t nsend = fsend(&ring_[0] + head_ptr_, ncopy); bytes_available_ -= nsend; diff --git a/src/support/socket.h b/src/support/socket.h index 571b1503072a..16fba6b58e3d 100644 --- a/src/support/socket.h +++ b/src/support/socket.h @@ -49,7 +49,7 @@ using ssize_t = int; #include #include #endif -#include +#include #include #include @@ -75,7 +75,7 @@ namespace support { inline std::string GetHostName() { std::string buf; buf.resize(256); - CHECK_NE(gethostname(&buf[0], 256), -1); + ICHECK_NE(gethostname(&buf[0], 256), -1); return std::string(buf.c_str()); } @@ -117,7 +117,7 @@ struct SockAddr { size_t sep = url.find(","); std::string host = url.substr(2, sep - 3); std::string port = url.substr(sep + 1, url.length() - 1); - CHECK(ValidateIP(host)) << "Url address is not valid " << url; + ICHECK(ValidateIP(host)) << "Url address is not valid " << url; if (host == "localhost") { host = "127.0.0.1"; } @@ -137,7 +137,7 @@ struct SockAddr { hints.ai_socktype = SOCK_STREAM; addrinfo* res = nullptr; int sig = getaddrinfo(host, nullptr, &hints, &res); - CHECK(sig == 0 && res != nullptr) << "cannot obtain address of " << host; + ICHECK(sig == 0 && res != nullptr) << "cannot obtain address of " << host; switch (res->ai_family) { case AF_INET: { sockaddr_in* addr4 = reinterpret_cast(&addr); @@ -152,7 +152,7 @@ struct SockAddr { addr6->sin6_family = AF_INET6; } break; default: - CHECK(false) << "cannot decode address"; + ICHECK(false) << "cannot decode address"; } freeaddrinfo(res); } @@ -177,7 +177,7 @@ struct SockAddr { const in_addr& addr4 = reinterpret_cast(&addr)->sin_addr; sinx_addr = reinterpret_cast(&addr4); } else { - CHECK(false) << "illegal address"; + ICHECK(false) << "illegal address"; } #ifdef _WIN32 @@ -187,7 +187,7 @@ struct SockAddr { const char* s = inet_ntop(addr.ss_family, sinx_addr, &buf[0], static_cast(buf.length())); #endif - CHECK(s != nullptr) << "cannot decode address"; + ICHECK(s != nullptr) << "cannot decode address"; std::ostringstream os; os << s << ":" << port(); return os.str(); @@ -526,8 +526,8 @@ class TCPSocket : public Socket { */ void SendBytes(std::string data) { int datalen = data.length(); - CHECK_EQ(SendAll(&datalen, sizeof(datalen)), sizeof(datalen)); - CHECK_EQ(SendAll(data.c_str(), datalen), datalen); + ICHECK_EQ(SendAll(&datalen, sizeof(datalen)), sizeof(datalen)); + ICHECK_EQ(SendAll(data.c_str(), datalen), datalen); } /*! * \brief Receive the data to remote. 
@@ -535,10 +535,10 @@ class TCPSocket : public Socket { */ std::string RecvBytes() { int datalen = 0; - CHECK_EQ(RecvAll(&datalen, sizeof(datalen)), sizeof(datalen)); + ICHECK_EQ(RecvAll(&datalen, sizeof(datalen)), sizeof(datalen)); std::string data; data.resize(datalen); - CHECK_EQ(RecvAll(&data[0], datalen), datalen); + ICHECK_EQ(RecvAll(&data[0], datalen), datalen); return data; } }; diff --git a/src/target/build_common.h b/src/target/build_common.h index 9d92697aa319..1816c3ac2650 100644 --- a/src/target/build_common.h +++ b/src/target/build_common.h @@ -44,7 +44,7 @@ inline std::unordered_map ExtractFuncInfo(co std::unordered_map fmap; for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; + ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; auto f = Downcast(kv.second); runtime::FunctionInfo info; diff --git a/src/target/codegen.cc b/src/target/codegen.cc index 47603e404635..18aa954787ce 100644 --- a/src/target/codegen.cc +++ b/src/target/codegen.cc @@ -55,7 +55,7 @@ runtime::Module Build(IRModule mod, Target target) { } // the build function. const PackedFunc* bf = runtime::Registry::Get(build_f_name); - CHECK(bf != nullptr) << build_f_name << " is not enabled"; + ICHECK(bf != nullptr) << build_f_name << " is not enabled"; return (*bf)(mod, target); } @@ -233,7 +233,7 @@ runtime::Module PackImportsToLLVM(const runtime::Module& mod, bool system_lib, std::string codegen_f_name = "codegen.codegen_blob"; // the codegen function. const PackedFunc* codegen_f = runtime::Registry::Get(codegen_f_name); - CHECK(codegen_f != nullptr) << "codegen.codegen_blob is not presented."; + ICHECK(codegen_f != nullptr) << "codegen.codegen_blob is not presented."; return (*codegen_f)(blob_byte_array, system_lib, target_triple); } diff --git a/src/target/datatype/registry.cc b/src/target/datatype/registry.cc index c84f917d5c3e..e7807798741d 100644 --- a/src/target/datatype/registry.cc +++ b/src/target/datatype/registry.cc @@ -49,20 +49,20 @@ Registry* Registry::Global() { } void Registry::Register(const std::string& type_name, uint8_t type_code) { - CHECK(type_code >= DataType::kCustomBegin) + ICHECK(type_code >= DataType::kCustomBegin) << "Please choose a type code >= DataType::kCustomBegin for custom types"; code_to_name_[type_code] = type_name; name_to_code_[type_name] = type_code; } uint8_t Registry::GetTypeCode(const std::string& type_name) { - CHECK(name_to_code_.find(type_name) != name_to_code_.end()) + ICHECK(name_to_code_.find(type_name) != name_to_code_.end()) << "Type name " << type_name << " not registered"; return name_to_code_[type_name]; } std::string Registry::GetTypeName(uint8_t type_code) { - CHECK(code_to_name_.find(type_code) != code_to_name_.end()) + ICHECK(code_to_name_.find(type_code) != code_to_name_.end()) << "Type code " << static_cast(type_code) << " not registered"; return code_to_name_[type_code]; } diff --git a/src/target/generic_func.cc b/src/target/generic_func.cc index b5842eebc9e3..16e5a5f9cdc6 100644 --- a/src/target/generic_func.cc +++ b/src/target/generic_func.cc @@ -68,7 +68,7 @@ void GenericFunc::RegisterGenericFunc(GenericFunc func, const std::string& name) Manager* m = Manager::Global(); std::lock_guard(m->mutex); auto it = m->fmap.find(name); - CHECK(it == m->fmap.end()) << "GenericFunc already registered " << name; + ICHECK(it == m->fmap.end()) << "GenericFunc already registered " << name; func->name_ = name; m->fmap[name] = func; } @@ -76,7 +76,7 @@ void 
GenericFunc::RegisterGenericFunc(GenericFunc func, const std::string& name) GenericFunc& GenericFunc::set_default(const PackedFunc value, bool allow_override) { auto node = static_cast(operator->()); if (!allow_override) { - CHECK(node->generic_func_ == nullptr) + ICHECK(node->generic_func_ == nullptr) << "Generic function already registered for " << node->name_; } node->generic_func_ = value; @@ -88,7 +88,7 @@ GenericFunc& GenericFunc::register_func(const std::vector& tags, for (auto& t : tags) { if (!allow_override) { auto iter = (*this)->dispatch_dict_.find(t); - CHECK(iter == (*this)->dispatch_dict_.end()) + ICHECK(iter == (*this)->dispatch_dict_.end()) << "Tag " << t << " already registered for schedule factory " << (*this)->name_; } (*this)->dispatch_dict_[t] = value; @@ -112,7 +112,7 @@ void GenericFunc::CallPacked(TVMArgs args, TVMRetValue* ret) const { } if (func == nullptr) { - CHECK(node->generic_func_ != nullptr) << "No generic function registered for " << node->name_; + ICHECK(node->generic_func_ != nullptr) << "No generic function registered for " << node->name_; func = node->generic_func_; } diff --git a/src/target/intrin_rule.cc b/src/target/intrin_rule.cc index fa0ee38d8130..0808d237fc28 100644 --- a/src/target/intrin_rule.cc +++ b/src/target/intrin_rule.cc @@ -81,7 +81,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.rsqrt") .set_body([](const TVMArgs& args, TVMRetValue* rv) { PrimExpr e = args[0]; const CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); auto one = make_const(call->args[0].dtype(), 1); *rv = one / sqrt(call->args[0]); @@ -93,7 +93,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.sigmoid") .set_body([](const TVMArgs& args, TVMRetValue* rv) { PrimExpr e = args[0]; const CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); auto one = make_const(call->args[0].dtype(), 1); *rv = one / (one + exp(-call->args[0])); @@ -103,7 +103,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.isfinite") .set_body([](const TVMArgs& args, TVMRetValue* rv) { PrimExpr e = args[0]; const CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); *rv = isfinite(call->args[0]); }); @@ -111,7 +111,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.isinf") .set_body([](const TVMArgs& args, TVMRetValue* rv) { PrimExpr e = args[0]; const CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); *rv = isinf(call->args[0]); }); @@ -121,7 +121,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.q_multiply_shift") PrimExpr e = args[0]; const tir::CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); PrimExpr x = call->args[0]; PrimExpr y = call->args[1]; @@ -129,8 +129,8 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.q_multiply_shift") PrimExpr s = call->args[3]; // Only int32 types are supported (any number of lanes is allowed) - CHECK(y.dtype().code() == DLDataTypeCode::kDLInt && y.dtype().bits() == 32); - CHECK(s.dtype().code() == DLDataTypeCode::kDLInt && s.dtype().bits() == 32); + ICHECK(y.dtype().code() == DLDataTypeCode::kDLInt && y.dtype().bits() == 32); + ICHECK(s.dtype().code() == DLDataTypeCode::kDLInt && s.dtype().bits() == 32); DataType hp_dtype = DataType::Int(64, x.dtype().lanes()); DataType lp_dtype = DataType::Int(32, x.dtype().lanes()); diff --git a/src/target/intrin_rule.h b/src/target/intrin_rule.h index 359c5b9580b5..69196e1b2c39 100644 --- a/src/target/intrin_rule.h +++ b/src/target/intrin_rule.h @@ -58,13 +58,13 @@ template inline void 
DispatchPureExtern(const TVMArgs& args, TVMRetValue* rv) { PrimExpr e = args[0]; const CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); // Use string based dispatch to extern for backward compact // TODO(tvm-team) replace once the new dispatching system is inplace. const OpNode* op = call->op.as(); - CHECK(op != nullptr); + ICHECK(op != nullptr); std::string name = op->name; - CHECK_EQ(name.substr(0, 4), "tir."); + ICHECK_EQ(name.substr(0, 4), "tir."); name = T()(call->dtype, name.substr(4)); if (name.length() != 0) { diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 1f6eedde0b21..2890c1ce3e56 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -70,11 +70,11 @@ class CodeGenAMDGPU : public CodeGenLLVM { } void VisitStmt_(const AllocateNode* op) final { - CHECK(!is_zero(op->condition)); + ICHECK(!is_zero(op->condition)); llvm::Value* buf = nullptr; int32_t constant_size = op->constant_allocation_size(); - CHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation in GPU"; + ICHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation in GPU"; StorageInfo& info = alloc_storage_info_[op->buffer_var.get()]; if (constant_size % 4 == 0 && info.alignment == 0) { @@ -99,7 +99,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { } buf = alloca; } else { - CHECK(info.scope.rank == runtime::StorageRank::kShared) + ICHECK(info.scope.rank == runtime::StorageRank::kShared) << "Can only allocate shared or local memory inside kernel"; // Shared memory: address space == 3 const unsigned shared_address_space = 3; @@ -120,7 +120,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { buf = builder_->CreatePointerCast( buf, DTypeToLLVMType(op->dtype)->getPointerTo(buf->getType()->getPointerAddressSpace())); - CHECK(!var_map_.count(op->buffer_var.get())); + ICHECK(!var_map_.count(op->buffer_var.get())); var_map_[op->buffer_var.get()] = buf; this->VisitStmt(op->body); } @@ -144,7 +144,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { LOG(FATAL) << "unknown workitem idx"; } } else { - CHECK_EQ(ts.rank, 0); + ICHECK_EQ(ts.rank, 0); switch (ts.dim_index) { case 0: intrin_id = ::llvm::Intrinsic::amdgcn_workgroup_id_x; @@ -207,7 +207,7 @@ runtime::Module BuildAMDGPU(IRModule mod, Target target) { cg->Init("TVMAMDGPUModule", tm.get(), ctx.get(), false, false, false); for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; + ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; auto f = Downcast(kv.second); cg->AddFunction(f); } @@ -249,13 +249,13 @@ runtime::Module BuildAMDGPU(IRModule mod, Target target) { llvm::legacy::PassManager pass; #if TVM_LLVM_VERSION <= 60 - CHECK(tm->addPassesToEmitFile(pass, destObj, llvm::TargetMachine::CGFT_ObjectFile) == 0) + ICHECK(tm->addPassesToEmitFile(pass, destObj, llvm::TargetMachine::CGFT_ObjectFile) == 0) << "Cannot emit target CGFT_ObjectFile"; #elif TVM_LLVM_VERSION <= 90 - CHECK(tm->addPassesToEmitFile(pass, destObj, nullptr, llvm::TargetMachine::CGFT_ObjectFile) == 0) + ICHECK(tm->addPassesToEmitFile(pass, destObj, nullptr, llvm::TargetMachine::CGFT_ObjectFile) == 0) << "Cannot emit target CGFT_ObjectFile"; #else - CHECK(tm->addPassesToEmitFile(pass, destObj, nullptr, llvm::CGFT_ObjectFile) == 0) + ICHECK(tm->addPassesToEmitFile(pass, destObj, nullptr, llvm::CGFT_ObjectFile) == 0) << "Cannot emit target CGFT_ObjectFile"; #endif pass.run(*mObj); @@ -263,21 +263,21 @@ 
runtime::Module BuildAMDGPU(IRModule mod, Target target) { llvm::legacy::PassManager passAsm; #if TVM_LLVM_VERSION <= 60 - CHECK(tm->addPassesToEmitFile(passAsm, destAsm, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + ICHECK(tm->addPassesToEmitFile(passAsm, destAsm, llvm::TargetMachine::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_AssemblyFile"; #elif TVM_LLVM_VERSION <= 90 - CHECK(tm->addPassesToEmitFile(passAsm, destAsm, nullptr, - llvm::TargetMachine::CGFT_AssemblyFile) == 0) + ICHECK(tm->addPassesToEmitFile(passAsm, destAsm, nullptr, + llvm::TargetMachine::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_AssemblyFile"; #else - CHECK(tm->addPassesToEmitFile(passAsm, destAsm, nullptr, llvm::CGFT_AssemblyFile) == 0) + ICHECK(tm->addPassesToEmitFile(passAsm, destAsm, nullptr, llvm::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_AssemblyFile"; #endif passAsm.run(*mAsm); std::string assembly(dataAsm.begin(), dataAsm.end()); const auto* f = tvm::runtime::Registry::Get("tvm_callback_rocm_link"); - CHECK(f != nullptr) << "Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm"; + ICHECK(f != nullptr) << "Require tvm_callback_rocm_link to exist, do import tvm.contrib.rocm"; TVMByteArray arr; arr.data = &obj[0]; diff --git a/src/target/llvm/codegen_arm.cc b/src/target/llvm/codegen_arm.cc index 5e5a94b50064..06f1dfeb1a2d 100644 --- a/src/target/llvm/codegen_arm.cc +++ b/src/target/llvm/codegen_arm.cc @@ -89,7 +89,7 @@ PrimExpr CodeGenARM::ARMPopcount(const CallNode* call) { PrimExpr input8 = reinterpret(uint8_type, e); // Popcount 8bit->8bit const CallNode* c0 = input8.as(); - CHECK(c0 != nullptr); + ICHECK(c0 != nullptr); Array vcnt8_args; vcnt8_args.push_back(IntImm(DataType::UInt(32), ctpop_id)); vcnt8_args.push_back(IntImm(DataType::UInt(32), 1)); diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index d15c6151edc5..fea5f8036678 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -119,13 +119,13 @@ void CodeGenCPU::AddFunction(const PrimFunc& f) { CodeGenLLVM::AddFunction(f); if (f_tvm_register_system_symbol_ != nullptr) { auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) + ICHECK(global_symbol.defined()) << "CodeGenLLVM: Expect PrimFunc to have the global_symbol attribute"; export_system_symbols_.emplace_back( std::make_pair(global_symbol.value().operator std::string(), function_)); } else if (target_c_runtime_) { auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) + ICHECK(global_symbol.defined()) << "CodeGenLLVM: Expect PrimFunc to have the global_symbol attribute"; registry_functions_.emplace_back( std::make_pair(global_symbol.value().operator std::string(), function_)); @@ -136,7 +136,7 @@ void CodeGenCPU::AddFunction(const PrimFunc& f) { // Following Glow |DebugInfo::generateFunctionDebugInfo|, https://git.io/fjadv void CodeGenCPU::AddDebugInformation(llvm::Function* function) { #if TVM_LLVM_VERSION >= 50 && TVM_LLVM_VERSION < 70 - CHECK(!function->getSubprogram()); + ICHECK(!function->getSubprogram()); llvm::SmallVector paramTys; llvm::DIType* returnTy = getDebugType(builder_.get(), dbg_info_->di_builder_.get(), function->getReturnType()); @@ -159,9 +159,9 @@ void CodeGenCPU::AddDebugInformation(llvm::Function* function) { true, 0 /* line number */, llvm::DINode::FlagPrototyped, true /* isOptimized */); #endif - CHECK(DIFunction); + ICHECK(DIFunction); function->setSubprogram(DIFunction); - 
CHECK_EQ(function->getSubprogram(), DIFunction); + ICHECK_EQ(function->getSubprogram(), DIFunction); IRBuilder builder(&function->getEntryBlock()); if (!function->getEntryBlock().empty()) { @@ -223,7 +223,7 @@ llvm::DIType* CodeGenCPU::getDebugType(IRBuilder* builder, llvm::DIBuilder* di_b void CodeGenCPU::AddMainFunction(const std::string& entry_func_name) { llvm::Function* f = module_->getFunction(entry_func_name); - CHECK(f) << "Function " << entry_func_name << "does not in module"; + ICHECK(f) << "Function " << entry_func_name << "does not in module"; llvm::Type* type = llvm::ArrayType::get(t_char_, entry_func_name.length() + 1); llvm::GlobalVariable* global = new llvm::GlobalVariable(*module_, type, true, llvm::GlobalValue::WeakAnyLinkage, nullptr, @@ -258,7 +258,7 @@ llvm::Value* CodeGenCPU::CreateStructRefPtr(DataType t, llvm::Value* buf, llvm:: if (buf->getType() == t_void_p_) { buf = builder_->CreatePointerCast(buf, t_tvm_array_->getPointerTo()); } else { - CHECK_EQ(buf->getType(), t_tvm_array_->getPointerTo()); + ICHECK_EQ(buf->getType(), t_tvm_array_->getPointerTo()); } } switch (kind) { @@ -296,8 +296,8 @@ llvm::Value* CodeGenCPU::CreateStructRefPtr(DataType t, llvm::Value* buf, llvm:: return builder_->CreateInBoundsGEP(buf, {index, ConstInt32(1), ConstInt32(0)}); } case builtin::kTVMValueContent: { - CHECK_EQ(t.lanes(), 1); - CHECK(t.is_handle() || t.bits() == 64); + ICHECK_EQ(t.lanes(), 1); + ICHECK(t.is_handle() || t.bits() == 64); if (t.is_int()) { buf = builder_->CreatePointerCast(buf, t_int64_->getPointerTo()); return builder_->CreateInBoundsGEP(buf, index); @@ -305,7 +305,7 @@ llvm::Value* CodeGenCPU::CreateStructRefPtr(DataType t, llvm::Value* buf, llvm:: buf = builder_->CreatePointerCast(buf, t_float64_->getPointerTo()); return builder_->CreateInBoundsGEP(buf, index); } else { - CHECK(t.is_handle()); + ICHECK(t.is_handle()); buf = builder_->CreatePointerCast(buf, t_tvm_value_->getPointerTo()); buf = builder_->CreateInBoundsGEP(buf, index); return builder_->CreatePointerCast(buf, t_void_p_->getPointerTo()); @@ -377,7 +377,7 @@ llvm::GlobalVariable* CodeGenCPU::InitContextPtr(llvm::Type* p_type, std::string } llvm::Value* CodeGenCPU::GetContextPtr(llvm::GlobalVariable* gv) { - CHECK(gv != nullptr); + ICHECK(gv != nullptr); #if TVM_LLVM_VERSION >= 110 llvm::LoadInst* faddr = builder_->CreateAlignedLoad(gv, llvm::Align(gv->getAlignment())); #else @@ -496,7 +496,7 @@ llvm::Value* CodeGenCPU::PackClosureData(const Array& vfields, uint64_t* nu std::vector fields; for (Var v : vfields) { auto it = var_map_.find(v.get()); - CHECK(it != var_map_.end()); + ICHECK(it != var_map_.end()); fields.push_back(it->second->getType()); } llvm::StructType* tcdata = llvm::StructType::create(fields); @@ -563,7 +563,7 @@ void CodeGenCPU::CreateParallelLaunch(const Stmt& body, int num_task) { std::swap(var_map_, new_vmap); std::swap(parallel_env_, par_env); std::swap(function_, f); - CHECK_NE(par_env.parallel_loop_count, 0) << "Cannot find parallel loop within parallel launch"; + ICHECK_NE(par_env.parallel_loop_count, 0) << "Cannot find parallel loop within parallel launch"; builder_->SetInsertPoint(par_launch_end); } @@ -606,7 +606,7 @@ void CodeGenCPU::CreateStaticInit(const std::string& init_fname, const Stmt& bod // setup new variable map, swap it with current var context. 
std::unordered_map new_vmap; UnpackClosureData(cdata, vfields, &new_vmap); - CHECK(parallel_env_.penv == nullptr); + ICHECK(parallel_env_.penv == nullptr); std::swap(function_, f); std::swap(var_map_, new_vmap); this->VisitStmt(body); @@ -697,7 +697,7 @@ llvm::BasicBlock* CodeGenCPU::MakeCallPacked(const Array& args, llvm:: llvm::Value* handle = GetPackedFuncHandle(func_name); // call the function int64_t nargs = end - begin; - CHECK_GE(nargs, 0); + ICHECK_GE(nargs, 0); llvm::Value* stack_value = MakeValue(args[1]); llvm::Value* stack_tcode = MakeValue(args[2]); llvm::Value* arg_value = builder_->CreateInBoundsGEP( @@ -726,7 +726,7 @@ llvm::BasicBlock* CodeGenCPU::MakeCallPacked(const Array& args, llvm:: } llvm::Value* CodeGenCPU::CreateCallPacked(const CallNode* op) { - CHECK_EQ(op->args.size(), 5U); + ICHECK_EQ(op->args.size(), 5U); llvm::Value* rvalue = nullptr; llvm::Value* ret_tcode = nullptr; MakeCallPacked(op->args, &rvalue, &ret_tcode, op->dtype, op->args[3].as()->value, @@ -736,7 +736,7 @@ llvm::Value* CodeGenCPU::CreateCallPacked(const CallNode* op) { llvm::Value* CodeGenCPU::CreateCallTracePacked(const CallNode* op) { using llvm::BasicBlock; - CHECK_EQ(op->args.size(), 6U); + ICHECK_EQ(op->args.size(), 6U); llvm::Value* rvalue = nullptr; llvm::Value* ret_tcode = nullptr; BasicBlock* end_block = @@ -793,7 +793,7 @@ llvm::Value* CodeGenCPU::RuntimeTVMParallelBarrier() { void CodeGenCPU::AddStartupFunction() { if (registry_functions_.size() != 0) { - CHECK(is_system_lib_) << "Loading of --system-lib modules is yet to be defined for C runtime"; + ICHECK(is_system_lib_) << "Loading of --system-lib modules is yet to be defined for C runtime"; std::vector symbols; std::vector funcs; for (auto sym : registry_functions_) { @@ -861,7 +861,7 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const CallNode* op) { builder_->SetInsertPoint(new_bb); return ConstInt32(-1); } else if (op->op.same_as(builtin::tvm_struct_get())) { - CHECK_EQ(op->args.size(), 3U); + ICHECK_EQ(op->args.size(), 3U); int kind = op->args[2].as()->value; llvm::Value* ref = this->CreateStructRefPtr(op->dtype, MakeValue(op->args[0]), MakeValue(op->args[1]), kind); @@ -871,23 +871,23 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const CallNode* op) { return builder_->CreateLoad(ref); } } else if (op->op.same_as(builtin::tvm_struct_set())) { - CHECK_EQ(op->args.size(), 4U); + ICHECK_EQ(op->args.size(), 4U); int kind = op->args[2].as()->value; llvm::Value* value = MakeValue(op->args[3]); llvm::Value* ref = this->CreateStructRefPtr(op->args[3].dtype(), MakeValue(op->args[0]), MakeValue(op->args[1]), kind); - CHECK(kind != builtin::kArrAddr); + ICHECK(kind != builtin::kArrAddr); if (value->getType()->isPointerTy()) { value = builder_->CreatePointerCast(value, ref->getType()->getPointerElementType()); } builder_->CreateStore(value, ref); return ConstInt32(0); } else if (op->op.same_as(builtin::tvm_stack_alloca())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); const std::string& type = op->args[0].as()->value; return WithFunctionEntry([&]() -> llvm::AllocaInst* { const int64_t* pval = as_const_int(op->args[1]); - CHECK(pval) << "require stack alloca to contain constant value"; + ICHECK(pval) << "require stack alloca to contain constant value"; llvm::Value* num = ConstInt32(pval[0]); if (type == "shape") { return builder_->CreateAlloca(t_tvm_shape_index_, num); @@ -941,15 +941,15 @@ void CodeGenCPU::VisitStmt_(const AttrStmtNode* op) { this->CreateComputeScope(op); } else if (tir::attr::IsPragmaKey(op->attr_key)) { 
if (op->attr_key == "pragma_parallel_stride_pattern") { - CHECK(parallel_env_.penv != nullptr) + ICHECK(parallel_env_.penv != nullptr) << "Pragma parallel_stride_pattern only valid in parallel launch"; parallel_env_.stride_pattern = true; this->VisitStmt(op->body); } else if (op->attr_key == "pragma_parallel_launch_point") { CreateParallelLaunch(op->body, 0); } else if (op->attr_key == "pragma_parallel_barrier_when_finish") { - CHECK(parallel_env_.penv != nullptr) << "Cannot run barrier without parallel environment"; - CHECK(!parallel_env_.in_parallel_loop) + ICHECK(parallel_env_.penv != nullptr) << "Cannot run barrier without parallel environment"; + ICHECK(!parallel_env_.in_parallel_loop) << "Cannot not place within parallel loop as the workload may differ, " << " place it between parallel and parallel_launch_point"; this->VisitStmt(op->body); @@ -962,7 +962,7 @@ void CodeGenCPU::VisitStmt_(const AttrStmtNode* op) { builder_->CreateCall(bar_callee, {MakeValue(parallel_env_.task_id), parallel_env_.penv}); } else if (op->attr_key == tir::attr::pragma_import_llvm) { const StringImmNode* value = op->value.as(); - CHECK(value != nullptr); + ICHECK(value != nullptr); this->HandleImport(value->value); this->VisitStmt(op->body); } else { @@ -975,7 +975,7 @@ void CodeGenCPU::VisitStmt_(const AttrStmtNode* op) { } void CodeGenCPU::VisitStmt_(const ForNode* op) { - CHECK(is_zero(op->min)); + ICHECK(is_zero(op->min)); if (op->for_type == ForType::Serial || op->for_type == ForType::Unrolled) { CodeGenLLVM::VisitStmt_(op); } else if (op->for_type == ForType::Parallel) { @@ -984,13 +984,13 @@ void CodeGenCPU::VisitStmt_(const ForNode* op) { For(op->loop_var, op->min, op->extent, op->for_type, op->device_api, op->body), 0); } else { // already in parallel env. - CHECK(parallel_env_.task_id.defined()); - CHECK(parallel_env_.num_task.defined()); - CHECK(parallel_env_.penv != nullptr); + ICHECK(parallel_env_.task_id.defined()); + ICHECK(parallel_env_.num_task.defined()); + ICHECK(parallel_env_.penv != nullptr); DataType t = op->extent.dtype(); PrimExpr num_task = cast(t, parallel_env_.num_task); PrimExpr task_id = cast(t, parallel_env_.task_id); - CHECK(!parallel_env_.in_parallel_loop) + ICHECK(!parallel_env_.in_parallel_loop) << "Nested parallel loop is not supported by threadpool, try fuse them instead"; parallel_env_.in_parallel_loop = true; if (parallel_env_.stride_pattern) { diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index a7e96c95e07f..c1af2a366a6b 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -48,7 +48,7 @@ namespace codegen { static std::string get_name(const PrimFunc& f) { auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) + ICHECK(global_symbol.defined()) << "CodeGenLLVM: Expect PrimFunc to have the global_symbol attribute"; return std::string(global_symbol.value()); } @@ -139,9 +139,9 @@ void CodeGenHexagon::InitTarget(llvm::TargetMachine* tm) { if (len_end != npos) { int hvx_bytes = 0; len_begin += std::strlen(hvx_length_feature); - CHECK(!fs.substr(len_begin, len_end - len_begin).getAsInteger(10, hvx_bytes)) + ICHECK(!fs.substr(len_begin, len_end - len_begin).getAsInteger(10, hvx_bytes)) << "invalid HVX length in feature string: " << fs.str(); - CHECK(hvx_bytes == 64 || hvx_bytes == 128) + ICHECK(hvx_bytes == 64 || hvx_bytes == 128) << "invalid HVX vector length: " << hvx_bytes << ", should be 64 or 128"; native_vector_bits_ = hvx_bytes * 8; } @@ -249,7 +249,7 @@ 
llvm::GlobalVariable* CodeGenHexagon::InitContextPtr(llvm::Type* p_type, std::st } llvm::Value* CodeGenHexagon::GetContextPtr(llvm::GlobalVariable* gv) { - CHECK(gv != nullptr); + ICHECK(gv != nullptr); #if TVM_LLVM_VERSION >= 110 llvm::LoadInst* faddr = builder_->CreateAlignedLoad(gv, llvm::Align(gv->getAlignment())); #else @@ -305,7 +305,7 @@ llvm::BasicBlock* CodeGenHexagon::MakeCallPacked(const Array& args, ll llvm::Value* handle = GetPackedFuncHandle(func_name); // call the function int64_t nargs = end - begin; - CHECK_GE(nargs, 0); + ICHECK_GE(nargs, 0); llvm::Value* stack_value = MakeValue(args[1]); llvm::Value* stack_tcode = MakeValue(args[2]); llvm::Value* arg_value = builder_->CreateInBoundsGEP( @@ -416,7 +416,7 @@ llvm::Value* CodeGenHexagon::CreateCallPacked(const CallNode* op) { return ConstInt32(0); } - CHECK_EQ(op->args.size(), 5U); + ICHECK_EQ(op->args.size(), 5U); llvm::Value* rvalue = nullptr; llvm::Value* ret_tcode = nullptr; MakeCallPacked(op->args, &rvalue, &ret_tcode, op->dtype, op->args[3].as()->value, @@ -426,7 +426,7 @@ llvm::Value* CodeGenHexagon::CreateCallPacked(const CallNode* op) { llvm::Value* CodeGenHexagon::CreateCallTracePacked(const CallNode* op) { using llvm::BasicBlock; - CHECK_EQ(op->args.size(), 6U); + ICHECK_EQ(op->args.size(), 6U); llvm::Value* rvalue = nullptr; llvm::Value* ret_tcode = nullptr; BasicBlock* end_block = @@ -506,7 +506,7 @@ llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) { } else if (op->op.same_as(builtin::tvm_call_trace_packed_lowered())) { return CreateCallTracePacked(op); } else if (op->op.same_as(builtin::tvm_struct_get())) { - CHECK_EQ(op->args.size(), 3); + ICHECK_EQ(op->args.size(), 3); int kind = op->args[2].as()->value; llvm::Value* ref = CreateStructRefPtr(op->dtype, MakeValue(op->args[0]), MakeValue(op->args[1]), kind); @@ -515,9 +515,9 @@ llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) { } return builder_->CreateLoad(ref); } else if (op->op.same_as(builtin::tvm_struct_set())) { - CHECK_EQ(op->args.size(), 4); + ICHECK_EQ(op->args.size(), 4); int kind = op->args[2].as()->value; - CHECK(kind != builtin::kArrAddr); + ICHECK(kind != builtin::kArrAddr); llvm::Value* ref = CreateStructRefPtr(op->args[3].dtype(), MakeValue(op->args[0]), MakeValue(op->args[1]), kind); llvm::Value* value = MakeValue(op->args[3]); @@ -527,7 +527,7 @@ llvm::Value* CodeGenHexagon::CreateIntrinsic(const CallNode* op) { builder_->CreateStore(value, ref); return ConstInt32(0); } else if (op->op.same_as(builtin::tvm_stack_alloca())) { - CHECK_EQ(op->args.size(), 2); + ICHECK_EQ(op->args.size(), 2); const std::string& name = op->args[0].as()->value; llvm::Value* size = ConstInt32(op->args[1].as()->value); return builder_->CreateAlloca(types_for_alloca_.at(name), size); @@ -559,7 +559,7 @@ llvm::Value* CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::Value* buf, ll if (buf->getType() == t_void_p_) { buf = builder_->CreatePointerCast(buf, t_tvm_array_->getPointerTo()); } else { - CHECK_EQ(buf->getType(), t_tvm_array_->getPointerTo()); + ICHECK_EQ(buf->getType(), t_tvm_array_->getPointerTo()); } /* The following "kinds" are accessing the members of DLTensor: typedef struct { @@ -605,8 +605,8 @@ llvm::Value* CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::Value* buf, ll TVMContext v_ctx; } TVMValue; */ - CHECK_EQ(t.lanes(), 1); - CHECK(t.is_handle() || t.bits() == 64); + ICHECK_EQ(t.lanes(), 1); + ICHECK(t.is_handle() || t.bits() == 64); if (t.is_int()) { buf = builder_->CreatePointerCast(buf, 
t_int64_->getPointerTo()); return builder_->CreateInBoundsGEP(buf, index); @@ -614,7 +614,7 @@ llvm::Value* CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::Value* buf, ll buf = builder_->CreatePointerCast(buf, t_float64_->getPointerTo()); return builder_->CreateInBoundsGEP(buf, index); } else { - CHECK(t.is_handle()); + ICHECK(t.is_handle()); buf = builder_->CreatePointerCast(buf, t_tvm_value_->getPointerTo()); buf = builder_->CreateInBoundsGEP(buf, index); return builder_->CreatePointerCast(buf, t_void_p_->getPointerTo()); @@ -708,7 +708,7 @@ runtime::Module BuildHexagon(IRModule mod, Target target) { std::unique_ptr ctx(new llvm::LLVMContext()); cg->Init("TVMHexagonModule", tm.get(), ctx.get(), false, false, false); for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; + ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; auto f = Downcast(kv.second); cg->AddFunction(f); } @@ -740,7 +740,7 @@ runtime::Module BuildHexagon(IRModule mod, Target target) { llvm::raw_svector_ostream os(ss); std::unique_ptr cm = CloneModule(m); legacy::PassManager pass; - CHECK(tm->addPassesToEmitFile(pass, os, nullptr, ft) == 0) << "Cannot emit target code"; + ICHECK(tm->addPassesToEmitFile(pass, os, nullptr, ft) == 0) << "Cannot emit target code"; pass.run(*cm.get()); out.assign(ss.c_str(), ss.size()); } @@ -752,13 +752,13 @@ runtime::Module BuildHexagon(IRModule mod, Target target) { llvm::SmallString<64> file_name; int fd; std::error_code ec = llvm::sys::fs::createTemporaryFile("tvm", suffix, fd, file_name); - CHECK_EQ(static_cast(ec), false) << ec.message(); + ICHECK_EQ(static_cast(ec), false) << ec.message(); llvm::raw_fd_ostream file(fd, true); file << data; - CHECK(!file.has_error()) << file.error().message(); + ICHECK(!file.has_error()) << file.error().message(); // If there is an error, execution will never get here, but return // {ec, name} anyway to allow caller to handle error conditions. - // This way the "CHECK" above can be removed with minimal effort. + // This way the "ICHECK" above can be removed with minimal effort. return std::make_pair(file.error(), std::string(file_name.c_str())); }; @@ -772,12 +772,12 @@ runtime::Module BuildHexagon(IRModule mod, Target target) { so_name += "so"; const auto* f = tvm::runtime::Registry::Get("tvm.contrib.hexagon.link_shared"); - CHECK(f != nullptr) << "tvm.contrib.hexagon.link_shared does not to exist, " - "do import tvm.contrib.hexagon"; + ICHECK(f != nullptr) << "tvm.contrib.hexagon.link_shared does not to exist, " + "do import tvm.contrib.hexagon"; Array o_names = {StringImm(o_name)}; int rc = (*f)(so_name, o_names); - CHECK(rc == 0) << "Failed to link " << so_name; + ICHECK(rc == 0) << "Failed to link " << so_name; // Move it to ExtractFuncInfo? std::set export_abi; diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 9bc56dc91458..2a7e4644571b 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -108,7 +108,7 @@ void CodeGenLLVM::InitFuncState() { void CodeGenLLVM::AddFunctionInternal(const PrimFunc& f, bool ret_void) { this->InitFuncState(); - CHECK_EQ(f->buffer_map.size(), 0U) + ICHECK_EQ(f->buffer_map.size(), 0U) << "Cannot codegen function with buffer_map, please lower them first"; std::vector param_types; @@ -126,9 +126,9 @@ void CodeGenLLVM::AddFunctionInternal(const PrimFunc& f, bool ret_void) { llvm::FunctionType::get(ret_void ? 
t_void_ : t_int_, param_types, false); auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) + ICHECK(global_symbol.defined()) << "CodeGenLLVM: Expect PrimFunc to have the global_symbol attribute"; - CHECK(module_->getFunction(static_cast(global_symbol.value())) == nullptr) + ICHECK(module_->getFunction(static_cast(global_symbol.value())) == nullptr) << "Function " << global_symbol << " already exist in module"; function_ = llvm::Function::Create(ftype, llvm::Function::ExternalLinkage, @@ -182,7 +182,7 @@ void CodeGenLLVM::AddFunctionInternal(const PrimFunc& f, bool ret_void) { std::unique_ptr CodeGenLLVM::Finish() { this->AddStartupFunction(); for (size_t i = 0; i < link_modules_.size(); ++i) { - CHECK(!llvm::Linker::linkModules(*module_, std::move(link_modules_[i]))) + ICHECK(!llvm::Linker::linkModules(*module_, std::move(link_modules_[i]))) << "Failed to link modules"; } link_modules_.clear(); @@ -302,7 +302,7 @@ unsigned CodeGenLLVM::GetGlobalAddressSpace() const { return 0; } llvm::Type* CodeGenLLVM::DTypeToLLVMType(const DataType& dtype) const { if (dtype.is_handle()) { - CHECK_EQ(dtype.lanes(), 1); + ICHECK_EQ(dtype.lanes(), 1); return t_void_p_; } if (dtype.is_void()) { @@ -489,7 +489,7 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent) { int num_elems = GetVectorNumElements(vec); if (extent == num_elems && begin == 0) return vec; - CHECK(begin >= 0 && extent <= num_elems) << "Slicing out of bound!\n"; + ICHECK(begin >= 0 && extent <= num_elems) << "Slicing out of bound!\n"; std::vector indices; indices.reserve(extent); for (int i = 0; i < extent; ++i) { @@ -519,7 +519,7 @@ llvm::Value* CodeGenLLVM::CreateVecPad(llvm::Value* vec, int target_lanes) { llvm::Value* mask = llvm::UndefValue::get(DTypeToLLVMType(DataType::Int(32, target_lanes))); int num_elems = GetVectorNumElements(vec); if (num_elems == target_lanes) return vec; - CHECK_LT(num_elems, target_lanes); + ICHECK_LT(num_elems, target_lanes); for (int i = 0; i < num_elems; ++i) { mask = builder_->CreateInsertElement(mask, ConstInt32(i), ConstInt32(i)); } @@ -578,7 +578,7 @@ void CodeGenLLVM::CreateSerialFor(llvm::Value* begin, llvm::Value* end, llvm::Va builder_->SetInsertPoint(for_begin); llvm::PHINode* loop_value = builder_->CreatePHI(begin->getType(), 2); loop_value->addIncoming(begin, pre_block); - CHECK(!var_map_.count(loop_var.get())); + ICHECK(!var_map_.count(loop_var.get())); var_map_[loop_var.get()] = loop_value; builder_->CreateCondBr(CreateLT(loop_var.dtype(), loop_value, end), for_body, for_end, md_very_likely_branch_); @@ -621,7 +621,7 @@ llvm::Value* CodeGenLLVM::CreateCast(DataType from, DataType to, llvm::Value* va } else if (from.is_uint() && to.is_float()) { return builder_->CreateUIToFP(value, target); } else { - CHECK(from.is_float() && to.is_float()); + ICHECK(from.is_float() && to.is_float()); return builder_->CreateFPCast(value, target); } } @@ -647,7 +647,7 @@ llvm::Constant* CodeGenLLVM::GetConstString(const std::string& str) { llvm::Value* CodeGenLLVM::CreateBufferPtr(DataType t, llvm::Value* buffer, llvm::Value* index) { llvm::PointerType* btype = llvm::dyn_cast(buffer->getType()); - CHECK(btype != nullptr); + ICHECK(btype != nullptr); llvm::PointerType* ptype = DTypeToLLVMType(t)->getPointerTo(btype->getAddressSpace()); if (btype != ptype) { buffer = builder_->CreatePointerCast(buffer, ptype); @@ -657,7 +657,7 @@ llvm::Value* 
CodeGenLLVM::CreateBufferPtr(DataType t, llvm::Value* buffer, llvm: llvm::Value* CodeGenLLVM::GetVarValue(const VarNode* v) const { auto it = var_map_.find(v); - CHECK(it != var_map_.end()) << "cannot find variable " << v->name_hint; + ICHECK(it != var_map_.end()) << "cannot find variable " << v->name_hint; return it->second; } @@ -747,7 +747,7 @@ llvm::Function* CodeGenLLVM::GetIntrinsicDecl(llvm::Intrinsic::ID id, llvm::Type llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { if (op->op.same_as(builtin_call_llvm_intrin_) || op->op.same_as(builtin_call_llvm_pure_intrin_)) { - CHECK_GE(op->args.size(), 2U); + ICHECK_GE(op->args.size(), 2U); llvm::Intrinsic::ID id = static_cast(Downcast(op->args[0])->value); int64_t num_signature = Downcast(op->args[1])->value; std::vector arg_value; @@ -768,8 +768,8 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { llvm::Type* return_type = (id != llvm::Intrinsic::prefetch) ? GetLLVMType(GetRef(op)) : llvm::Type::getVoidTy(*ctx_); llvm::Function* f = GetIntrinsicDecl(id, return_type, arg_type); - CHECK(f) << "Cannot find intrinsic declaration, possible type mismatch: " - << llvm::Intrinsic::getName(id, {}); + ICHECK(f) << "Cannot find intrinsic declaration, possible type mismatch: " + << llvm::Intrinsic::getName(id, {}); return builder_->CreateCall(f, arg_value); } else if (op->op.same_as(builtin::bitwise_and())) { return builder_->CreateAnd(MakeValue(op->args[0]), MakeValue(op->args[1])); @@ -791,7 +791,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { return CreateStorageSync(op); } else if (op->op.same_as(builtin::address_of())) { const LoadNode* l = op->args[0].as(); - CHECK(op->args.size() == 1 && l); + ICHECK(op->args.size() == 1 && l); const RampNode* r = l->index.as(); llvm::Value* ptr; unsigned addrspace; @@ -809,13 +809,13 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { } else if (op->op.same_as(builtin::isnullptr())) { return builder_->CreateIsNull(MakeValue(op->args[0])); } else if (op->op.same_as(builtin::large_uint_imm())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); uint64_t low = static_cast(Downcast(op->args[0])->value); uint64_t high = static_cast(Downcast(op->args[1])->value); uint64_t val = (high << 32U) | low; return llvm::ConstantInt::get(DTypeToLLVMType(op->dtype), val); } else if (op->op.same_as(builtin::if_then_else())) { - CHECK_EQ(op->args[0].dtype().lanes(), 1) << "if_then_else can only take scalar condition"; + ICHECK_EQ(op->args[0].dtype().lanes(), 1) << "if_then_else can only take scalar condition"; using llvm::BasicBlock; BasicBlock* then_block = BasicBlock::Create(*ctx_, "if_then", function_); BasicBlock* else_block = BasicBlock::Create(*ctx_, "if_else", function_); @@ -913,7 +913,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const StringImmNode* op) { return GetConstS return builder_->Create##Op(a, b); \ } \ } else { \ - CHECK(t.is_float()); \ + ICHECK(t.is_float()); \ return builder_->CreateF##Op(a, b); \ } \ } \ @@ -932,7 +932,7 @@ DEFINE_CODEGEN_BINARY_OP(Mul); } else if (t.is_uint()) { \ return builder_->CreateICmpU##Op(a, b); \ } else { \ - CHECK(t.is_float()); \ + ICHECK(t.is_float()); \ return builder_->CreateFCmpO##Op(a, b); \ } \ } \ @@ -953,7 +953,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const DivNode* op) { } else if (op->dtype.is_uint()) { return builder_->CreateUDiv(a, b); } else { - CHECK(op->dtype.is_float()); + ICHECK(op->dtype.is_float()); return builder_->CreateFDiv(a, b); } } @@ -966,7 +966,7 @@ llvm::Value* 
CodeGenLLVM::VisitExpr_(const ModNode* op) { } else if (op->dtype.is_uint()) { return builder_->CreateURem(a, b); } else { - CHECK(op->dtype.is_float()); + ICHECK(op->dtype.is_float()); return builder_->CreateFRem(a, b); } } @@ -1023,7 +1023,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const SelectNode* op) { llvm::Value* CodeGenLLVM::VisitExpr_(const LetNode* op) { auto it = let_binding_.find(op->var); if (it != let_binding_.end()) { - CHECK(deep_equal_(it->second->value, op->value)) + ICHECK(deep_equal_(it->second->value, op->value)) << "Let cannot bind the same var to two different values"; } else { let_binding_[op->var] = op; @@ -1057,7 +1057,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const LoadNode* op) { if (is_one(ramp->stride)) { int alignment, native_bits; GetAlignment(t, op->buffer_var.get(), ramp->base, &alignment, &native_bits); - CHECK_EQ(ramp->lanes, t.lanes()); + ICHECK_EQ(ramp->lanes, t.lanes()); llvm::Value* ptr = CreateBufferPtr(t.element_of(), buffer, MakeValue(ramp->base)); ptr = builder_->CreatePointerCast(ptr, DTypeToLLVMType(t)->getPointerTo(addrspace)); #if TVM_LLVM_VERSION >= 110 @@ -1093,7 +1093,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) { auto call_op = GetRef(ptr_op); if (op->op.same_as(builtin_call_extern_) || op->op.same_as(builtin_call_pure_extern_)) { // call extern intrinsic - CHECK_GE(op->args.size(), 1U); + ICHECK_GE(op->args.size(), 1U); auto global_symbol = Downcast(op->args[0]); return this->CreateCallExtern(GetType(GetRef(op)), global_symbol->value, op->args, true); @@ -1105,7 +1105,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) { return CreateIntrinsic(op); } } else { - CHECK(op->op.as()); + ICHECK(op->op.as()); LOG(FATAL) << "Do not yet support cross function call"; return nullptr; } @@ -1131,8 +1131,8 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const ShuffleNode* op) { std::vector idx(op->indices.size()); for (int i = 0, e = op->indices.size(); i < e; ++i) { const int64_t* val = as_const_int(op->indices[i]); - CHECK(val && *val >= 0 && *val < total_lanes) << "Shuffled indeces are suppose to be int, " - << "but get " << op->indices[i] << "\n"; + ICHECK(val && *val >= 0 && *val < total_lanes) << "Shuffled indeces are suppose to be int, " + << "but get " << op->indices[i] << "\n"; idx[i] = *val; } llvm::Value* mask = llvm::ConstantDataVector::get(builder_->getContext(), idx); @@ -1149,7 +1149,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const BroadcastNode* op) { } void CodeGenLLVM::VisitStmt_(const StoreNode* op) { - CHECK(is_one(op->predicate)); + ICHECK(is_one(op->predicate)); DataType t = op->value.dtype(); bool is_volatile = volatile_buf_.count(op->buffer_var.get()); llvm::Value* buffer = MakeValue(op->buffer_var); @@ -1175,7 +1175,7 @@ void CodeGenLLVM::VisitStmt_(const StoreNode* op) { if (is_one(ramp->stride)) { int alignment, native_bits; GetAlignment(t, op->buffer_var.get(), ramp->base, &alignment, &native_bits); - CHECK_EQ(ramp->lanes, t.lanes()); + ICHECK_EQ(ramp->lanes, t.lanes()); llvm::Value* ptr = CreateBufferPtr(t.element_of(), buffer, MakeValue(ramp->base)); ptr = builder_->CreatePointerCast(ptr, DTypeToLLVMType(t)->getPointerTo(addrspace)); #if TVM_LLVM_VERSION >= 110 @@ -1189,7 +1189,7 @@ void CodeGenLLVM::VisitStmt_(const StoreNode* op) { } } } - CHECK_GE(t.bits(), 8); + ICHECK_GE(t.bits(), 8); // scalarized store. 
int basic_align = t.bits() / 8; auto f = [&](int i, llvm::Value* index) { @@ -1207,13 +1207,13 @@ void CodeGenLLVM::VisitStmt_(const StoreNode* op) { } void CodeGenLLVM::VisitStmt_(const ForNode* op) { - CHECK(is_zero(op->min)); + ICHECK(is_zero(op->min)); analyzer_->Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent)); if (op->for_type == ForType::Unrolled) { LOG(WARNING) << "Unroll hint get ignore at CodeGenLLVM backend, " << " consider set unroll_explicit=True"; } else { - CHECK(op->for_type == ForType::Serial); + ICHECK(op->for_type == ForType::Serial); } CreateSerialFor(MakeValue(op->min), MakeValue(op->extent), llvm::ConstantInt::getSigned(GetLLVMType(op->extent), 1), op->loop_var, op->body); @@ -1243,11 +1243,11 @@ void CodeGenLLVM::VisitStmt_(const IfThenElseNode* op) { } void CodeGenLLVM::VisitStmt_(const AllocateNode* op) { - CHECK(!is_zero(op->condition)); + ICHECK(!is_zero(op->condition)); llvm::Value* buf = nullptr; int32_t constant_size = op->constant_allocation_size(); - CHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation"; + ICHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation"; StorageInfo& info = alloc_storage_info_[op->buffer_var.get()]; if (constant_size % 4 == 0 && info.alignment == 0) { info.alignment = GetTempAllocaAlignment(op->dtype, constant_size); @@ -1271,7 +1271,7 @@ void CodeGenLLVM::VisitStmt_(const AllocateNode* op) { buf = builder_->CreatePointerCast( buf, DTypeToLLVMType(op->dtype)->getPointerTo(buf->getType()->getPointerAddressSpace())); - CHECK(!var_map_.count(op->buffer_var.get())); + ICHECK(!var_map_.count(op->buffer_var.get())); var_map_[op->buffer_var.get()] = buf; this->VisitStmt(op->body); } @@ -1287,12 +1287,12 @@ void CodeGenLLVM::VisitStmt_(const AttrStmtNode* op) { } } else if (op->attr_key == tir::attr::storage_scope) { const VarNode* v = op->node.as(); - CHECK(v); + ICHECK(v); alloc_storage_info_[v].scope = runtime::StorageScope::Create(op->value.as()->value); } else if (op->attr_key == tir::attr::storage_alignment) { const VarNode* v = op->node.as(); - CHECK(v); + ICHECK(v); alloc_storage_info_[v].alignment = static_cast(op->value.as()->value); if (var_map_.count(v) && alloc_storage_info_[v].alignment > 1) { builder_->CreateAlignmentAssumption(*data_layout_, GetVarValue(v), @@ -1300,7 +1300,7 @@ void CodeGenLLVM::VisitStmt_(const AttrStmtNode* op) { } } else if (op->attr_key == tir::attr::volatile_scope) { const VarNode* v = op->node.as(); - CHECK(v); + ICHECK(v); volatile_buf_.insert(v); } this->VisitStmt(op->body); @@ -1313,7 +1313,7 @@ void CodeGenLLVM::VisitStmt_(const AssertStmtNode* op) { void CodeGenLLVM::VisitStmt_(const LetStmtNode* op) { const VarNode* v = op->var.get(); - CHECK(!var_map_.count(v)); + ICHECK(!var_map_.count(v)); if (v->dtype.is_handle()) { if (!is_restricted_) { alias_var_set_.insert(v); diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc index 601df86d10ba..22e612b11090 100644 --- a/src/target/llvm/codegen_nvptx.cc +++ b/src/target/llvm/codegen_nvptx.cc @@ -46,11 +46,11 @@ class CodeGenNVPTX : public CodeGenLLVM { } void VisitStmt_(const AllocateNode* op) final { - CHECK(!is_zero(op->condition)); + ICHECK(!is_zero(op->condition)); llvm::Value* buf = nullptr; int32_t constant_size = op->constant_allocation_size(); - CHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation in GPU"; + ICHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation in GPU"; StorageInfo& info = 
alloc_storage_info_[op->buffer_var.get()]; if (constant_size % 4 == 0 && info.alignment == 0) { info.alignment = GetTempAllocaAlignment(op->dtype, constant_size); @@ -75,7 +75,7 @@ class CodeGenNVPTX : public CodeGenLLVM { } buf = alloca; } else { - CHECK(info.scope.rank == runtime::StorageRank::kShared) + ICHECK(info.scope.rank == runtime::StorageRank::kShared) << "Can only allocate shared or local memory inside kernel"; // Shared memory: address space == 3 const unsigned shared_address_space = 3; @@ -94,7 +94,7 @@ class CodeGenNVPTX : public CodeGenLLVM { buf = builder_->CreatePointerCast( buf, DTypeToLLVMType(op->dtype)->getPointerTo(buf->getType()->getPointerAddressSpace())); - CHECK(!var_map_.count(op->buffer_var.get())); + ICHECK(!var_map_.count(op->buffer_var.get())); var_map_[op->buffer_var.get()] = buf; this->VisitStmt(op->body); } @@ -118,7 +118,7 @@ class CodeGenNVPTX : public CodeGenLLVM { LOG(FATAL) << "unknown thread idx"; } } else { - CHECK_EQ(ts.rank, 0); + ICHECK_EQ(ts.rank, 0); switch (ts.dim_index) { case 0: intrin_id = ::llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x; @@ -238,7 +238,7 @@ llvm::Value* CodeGenNVPTX::CreateIntrinsic(const CallNode* op) { int GetCUDAComputeVersion(const Target& target) { Optional mcpu = target->GetAttr("mcpu"); - CHECK(mcpu.defined()) << "InternalError: \"-mcpu\" is undefined in the NVPTX target"; + ICHECK(mcpu.defined()) << "InternalError: \"-mcpu\" is undefined in the NVPTX target"; std::string sm_version = mcpu.value(); return std::stoi(sm_version.substr(3)); } @@ -255,7 +255,7 @@ runtime::Module BuildNVPTX(IRModule mod, Target target) { cg->Init("TVMPTXModule", tm.get(), ctx.get(), false, false, false); for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; + ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; auto f = Downcast(kv.second); cg->AddFunction(f); } @@ -287,14 +287,14 @@ runtime::Module BuildNVPTX(IRModule mod, Target target) { // emit ptx llvm::legacy::PassManager pass; #if TVM_LLVM_VERSION <= 60 - CHECK(tm->addPassesToEmitFile(pass, dest_ptx, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + ICHECK(tm->addPassesToEmitFile(pass, dest_ptx, llvm::TargetMachine::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_ObjectFile"; #elif TVM_LLVM_VERSION <= 90 - CHECK(tm->addPassesToEmitFile(pass, dest_ptx, nullptr, llvm::TargetMachine::CGFT_AssemblyFile) == - 0) + ICHECK(tm->addPassesToEmitFile(pass, dest_ptx, nullptr, llvm::TargetMachine::CGFT_AssemblyFile) == + 0) << "Cannot emit target CGFT_ObjectFile"; #else - CHECK(tm->addPassesToEmitFile(pass, dest_ptx, nullptr, llvm::CGFT_AssemblyFile) == 0) + ICHECK(tm->addPassesToEmitFile(pass, dest_ptx, nullptr, llvm::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_ObjectFile"; #endif pass.run(*module); diff --git a/src/target/llvm/codegen_x86_64.cc b/src/target/llvm/codegen_x86_64.cc index a71a0226c958..c2785458a004 100644 --- a/src/target/llvm/codegen_x86_64.cc +++ b/src/target/llvm/codegen_x86_64.cc @@ -79,7 +79,7 @@ llvm::Value* CodeGenX86_64::VisitExpr_(const CastNode* op) { const auto from = op->value.dtype(); const auto to = op->dtype; if (from.is_float() && to.is_float() && from.bits() == 16 && to.bits() == 32) { - CHECK_EQ(from.lanes(), to.lanes()); + ICHECK_EQ(from.lanes(), to.lanes()); CHECK_NOTNULL(target_machine_); const auto has_avx512 = TargetHasFeature(*target_machine_, "avx512f"); @@ -128,13 +128,13 @@ llvm::Value* CodeGenX86_64::CallVectorIntrin(llvm::Intrinsic::ID id, size_t intr // 
Otherwise, we split the vector into intrin_lanes sized elements (widening where necessary), // compute each result, and then concatenate the vectors (slicing the result if necessary). - CHECK_LT(intrin_lanes, num_elems); + ICHECK_LT(intrin_lanes, num_elems); std::vector split_results; for (size_t i = 0; i < num_elems; i += intrin_lanes) { std::vector split_args; for (const auto& v : args) { if (v->getType()->isVectorTy()) { - CHECK_EQ(GetVectorNumElements(v), num_elems); + ICHECK_EQ(GetVectorNumElements(v), num_elems); split_args.push_back(CreateVecSlice(v, i, intrin_lanes)); } else { split_args.push_back(v); diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc index abf350e2208a..4c8862bbfb63 100644 --- a/src/target/llvm/intrin_rule_llvm.cc +++ b/src/target/llvm/intrin_rule_llvm.cc @@ -47,7 +47,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.exp10") using tir::make_zero; PrimExpr e = targs[0]; const tir::CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); const PrimExpr& x = call->args[0]; PrimExpr ln10 = make_const(x.dtype(), 2.302585093); PrimExpr ret = exp(x * ln10); @@ -93,7 +93,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.tanh") using tir::make_zero; PrimExpr e = targs[0]; const tir::CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); const PrimExpr& x = call->args[0]; PrimExpr one = make_const(x.dtype(), 1); PrimExpr two = make_const(x.dtype(), 2); @@ -116,7 +116,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.popcount") TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.tan").set_body([](const TVMArgs& targs, TVMRetValue* rv) { PrimExpr e = targs[0]; const tir::CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); const PrimExpr& x = call->args[0]; PrimExpr tan_x = sin(x) / cos(x); *rv = tan_x; @@ -131,7 +131,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.cosh") using tir::make_zero; PrimExpr e = targs[0]; const tir::CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); const PrimExpr& x = call->args[0]; PrimExpr two = make_const(x.dtype(), 2); PrimExpr neg_one = make_const(x.dtype(), -1); @@ -150,7 +150,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.sinh") using tir::make_zero; PrimExpr e = targs[0]; const tir::CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); const PrimExpr& x = call->args[0]; PrimExpr two = make_const(x.dtype(), 2); PrimExpr neg_one = make_const(x.dtype(), -1); diff --git a/src/target/llvm/intrin_rule_llvm.h b/src/target/llvm/intrin_rule_llvm.h index 1a6775e92e12..99463793d8de 100644 --- a/src/target/llvm/intrin_rule_llvm.h +++ b/src/target/llvm/intrin_rule_llvm.h @@ -41,7 +41,7 @@ template inline void DispatchLLVMPureIntrin(const TVMArgs& targs, TVMRetValue* rv) { PrimExpr e = targs[0]; const tir::CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); Array cargs; // intrin id. cargs.push_back(IntImm(DataType::UInt(32), id)); @@ -57,7 +57,7 @@ template inline void DispatchLLVMIntrin(const TVMArgs& targs, TVMRetValue* rv) { PrimExpr e = targs[0]; const tir::CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); Array cargs; // intrin id. 
cargs.push_back(IntImm(DataType::UInt(32), id)); diff --git a/src/target/llvm/intrin_rule_nvptx.cc b/src/target/llvm/intrin_rule_nvptx.cc index 0e332940339c..bb653e8ee5e0 100644 --- a/src/target/llvm/intrin_rule_nvptx.cc +++ b/src/target/llvm/intrin_rule_nvptx.cc @@ -36,13 +36,14 @@ inline void DispatchPureExternLibDevice(const TVMArgs& args, TVMRetValue* rv) { PrimExpr e = args[0]; using namespace tir; const CallNode* call = e.as(); - CHECK(call != nullptr); - CHECK(call->dtype.bits() == 32 || call->dtype.bits() == 64) << "Only support float32 or float64."; + ICHECK(call != nullptr); + ICHECK(call->dtype.bits() == 32 || call->dtype.bits() == 64) + << "Only support float32 or float64."; const OpNode* op = call->op.as(); - CHECK(op != nullptr); + ICHECK(op != nullptr); std::string name = op->name; - CHECK_EQ(name.substr(0, 4), "tir."); + ICHECK_EQ(name.substr(0, 4), "tir."); std::ostringstream intrinsic_name; intrinsic_name << "__nv_" << name.substr(4); diff --git a/src/target/llvm/intrin_rule_rocm.cc b/src/target/llvm/intrin_rule_rocm.cc index 22ebf9b192aa..08b32ed1b946 100644 --- a/src/target/llvm/intrin_rule_rocm.cc +++ b/src/target/llvm/intrin_rule_rocm.cc @@ -36,12 +36,12 @@ inline void DispatchPureExternOCML(const TVMArgs& args, TVMRetValue* rv) { PrimExpr e = args[0]; using namespace tir; const CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); const OpNode* op = call->op.as(); - CHECK(op != nullptr); + ICHECK(op != nullptr); std::string name = op->name; - CHECK_EQ(name.substr(0, 4), "tir."); + ICHECK_EQ(name.substr(0, 4), "tir."); std::ostringstream intrinsic_name; intrinsic_name << "__ocml_" << name.substr(4) << "_f" << call->dtype.bits(); @@ -58,10 +58,10 @@ inline void DispatchShuffle(const TVMArgs& targs, TVMRetValue* rv) { PrimExpr e_call = targs[0]; using namespace tir; const CallNode* call = e_call.as(); - CHECK(call != nullptr); - CHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size + ICHECK(call != nullptr); + ICHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size PrimExpr var = call->args[1]; - CHECK_EQ(var.dtype().bits(), 32); + ICHECK_EQ(var.dtype().bits(), 32); // get own lane in self (__lane_id) PrimExpr minus_one = tir::make_const(DataType::Int(32), -1); @@ -82,7 +82,7 @@ inline void DispatchShuffle(const TVMArgs& targs, TVMRetValue* rv) { index = self - delta; index = Select(index < (self & ~(width - 1)), self, index); } else { - CHECK(call->op.same_as(builtin::tvm_warp_shuffle_down())); + ICHECK(call->op.same_as(builtin::tvm_warp_shuffle_down())); PrimExpr delta = call->args[2]; index = self + delta; index = Select((self & (width - 1)) + delta >= width, self, index); diff --git a/src/target/llvm/llvm_common.cc b/src/target/llvm/llvm_common.cc index e8225ab5b6e4..35bfc8dc2e5b 100644 --- a/src/target/llvm/llvm_common.cc +++ b/src/target/llvm/llvm_common.cc @@ -24,7 +24,7 @@ #include "llvm_common.h" -#include +#include #include #include @@ -133,7 +133,7 @@ std::unique_ptr GetLLVMTargetMachine(const Target& target, std::string err; const llvm::Target* llvm_target = llvm::TargetRegistry::lookupTarget(target_triple, err); if (llvm_target == nullptr) { - CHECK(allow_null) << err << " target_triple=" << target_triple; + ICHECK(allow_null) << err << " target_triple=" << target_triple; return nullptr; } llvm::TargetMachine* tm = diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index a37710d5622b..569082022852 100644 --- a/src/target/llvm/llvm_module.cc +++ 
b/src/target/llvm/llvm_module.cc @@ -76,7 +76,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { if (name == runtime::symbol::tvm_module_main) { const char* entry_name = reinterpret_cast(GetGlobalAddr(runtime::symbol::tvm_module_main)); - CHECK(entry_name != nullptr) + ICHECK(entry_name != nullptr) << "Symbol " << runtime::symbol::tvm_module_main << " is not presented"; faddr = reinterpret_cast(GetFunctionAddr(entry_name)); } else { @@ -90,7 +90,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::string fmt = runtime::GetFileFormat(file_name, format); std::error_code ecode; llvm::raw_fd_ostream dest(file_name, ecode, llvm::sys::fs::F_None); - CHECK_EQ(ecode.value(), 0) << "Cannot open file: " << file_name << " " << ecode.message(); + ICHECK_EQ(ecode.value(), 0) << "Cannot open file: " << file_name << " " << ecode.message(); if (fmt == "o" || fmt == "obj") { #if TVM_LLVM_VERSION <= 60 std::unique_ptr m = llvm::CloneModule(mptr_); @@ -98,16 +98,16 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::unique_ptr m = llvm::CloneModule(*mptr_); #endif llvm::legacy::PassManager pass; - CHECK(tm_); + ICHECK(tm_); #if TVM_LLVM_VERSION <= 60 - CHECK(tm_->addPassesToEmitFile(pass, dest, llvm::TargetMachine::CGFT_ObjectFile) == 0) + ICHECK(tm_->addPassesToEmitFile(pass, dest, llvm::TargetMachine::CGFT_ObjectFile) == 0) << "Cannot emit target CGFT_ObjectFile"; #elif TVM_LLVM_VERSION <= 90 - CHECK(tm_->addPassesToEmitFile(pass, dest, nullptr, llvm::TargetMachine::CGFT_ObjectFile) == - 0) + ICHECK(tm_->addPassesToEmitFile(pass, dest, nullptr, llvm::TargetMachine::CGFT_ObjectFile) == + 0) << "Cannot emit target CGFT_ObjectFile"; #else - CHECK(tm_->addPassesToEmitFile(pass, dest, nullptr, llvm::CGFT_ObjectFile) == 0) + ICHECK(tm_->addPassesToEmitFile(pass, dest, nullptr, llvm::CGFT_ObjectFile) == 0) << "Cannot emit target CGFT_ObjectFile"; #endif pass.run(*m); @@ -118,16 +118,16 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::unique_ptr m = llvm::CloneModule(*mptr_); #endif llvm::legacy::PassManager pass; - CHECK(tm_); + ICHECK(tm_); #if TVM_LLVM_VERSION <= 60 - CHECK(tm_->addPassesToEmitFile(pass, dest, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + ICHECK(tm_->addPassesToEmitFile(pass, dest, llvm::TargetMachine::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_AssemblyFile"; #elif TVM_LLVM_VERSION <= 90 - CHECK(tm_->addPassesToEmitFile(pass, dest, nullptr, llvm::TargetMachine::CGFT_AssemblyFile) == - 0) + ICHECK(tm_->addPassesToEmitFile(pass, dest, nullptr, + llvm::TargetMachine::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_AssemblyFile"; #else - CHECK(tm_->addPassesToEmitFile(pass, dest, nullptr, llvm::CGFT_AssemblyFile) == 0) + ICHECK(tm_->addPassesToEmitFile(pass, dest, nullptr, llvm::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_AssemblyFile"; #endif pass.run(*m); @@ -163,16 +163,16 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::unique_ptr m = llvm::CloneModule(*mptr_); #endif llvm::legacy::PassManager pass; - CHECK(tm_); + ICHECK(tm_); #if TVM_LLVM_VERSION <= 60 - CHECK(tm_->addPassesToEmitFile(pass, rso, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + ICHECK(tm_->addPassesToEmitFile(pass, rso, llvm::TargetMachine::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_AssemblyFile"; #elif TVM_LLVM_VERSION <= 90 - CHECK(tm_->addPassesToEmitFile(pass, rso, nullptr, llvm::TargetMachine::CGFT_AssemblyFile) == - 0) + ICHECK(tm_->addPassesToEmitFile(pass, rso, nullptr, llvm::TargetMachine::CGFT_AssemblyFile) 
== + 0) << "Cannot emit target CGFT_AssemblyFile"; #else - CHECK(tm_->addPassesToEmitFile(pass, rso, nullptr, llvm::CGFT_AssemblyFile) == 0) + ICHECK(tm_->addPassesToEmitFile(pass, rso, nullptr, llvm::CGFT_AssemblyFile) == 0) << "Cannot emit target CGFT_AssemblyFile"; #endif pass.run(*m); @@ -180,7 +180,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { } else if (fmt == "" || fmt == "ll") { std::string type_str; llvm::raw_string_ostream rso(type_str); - CHECK(mptr_ != nullptr); + ICHECK(mptr_ != nullptr); mptr_->print(rso, nullptr); return rso.str(); } else { @@ -200,16 +200,16 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::vector funcs; std::string entry_func; for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; + ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; auto f = Downcast(kv.second); if (f->HasNonzeroAttr(tir::attr::kIsEntryFunc)) { auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()); + ICHECK(global_symbol.defined()); entry_func = global_symbol.value(); } funcs.push_back(f); } - CHECK_NE(funcs.size(), 0U); + ICHECK_NE(funcs.size(), 0U); // TODO(tqchen): remove the entry function behavior as it does not // makes sense when we start to use multiple modules. cg->Init("TVMMod", tm_.get(), ctx_.get(), system_lib, system_lib, target_c_runtime); @@ -254,7 +254,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { llvm::Metadata* tvm_target = module_->getModuleFlag("tvm_target"); if (tvm_target != nullptr) { llvm::MDString* pstr = llvm::dyn_cast(tvm_target); - CHECK(pstr != nullptr); + ICHECK(pstr != nullptr); target_metadata = pstr->getString().str(); if (!(target_metadata.length() >= 4 && target_metadata.substr(0, 4) == "llvm")) { target_metadata = "llvm " + target_metadata; @@ -311,12 +311,12 @@ class LLVMModuleNode final : public runtime::ModuleNode { << " system=" << tm_sys->getTargetTriple().str(); } llvm::DataLayout layout(tm->createDataLayout()); - CHECK(layout == mptr_->getDataLayout()) + ICHECK(layout == mptr_->getDataLayout()) << "Data layout mismatch between module(" << mptr_->getDataLayout().getStringRepresentation() << ")" << " and ExecutionEngine (" << layout.getStringRepresentation() << ")"; ee_ = builder.create(tm.release()); - CHECK(ee_ != nullptr) << "Failed to initialize jit engine for " << mptr_->getTargetTriple(); + ICHECK(ee_ != nullptr) << "Failed to initialize jit engine for " << mptr_->getTargetTriple(); ee_->runStaticConstructorsDestructors(false); if (void** ctx_addr = diff --git a/src/target/opt/build_cuda_on.cc b/src/target/opt/build_cuda_on.cc index 780829c256ce..1a0f08920fb6 100644 --- a/src/target/opt/build_cuda_on.cc +++ b/src/target/opt/build_cuda_on.cc @@ -109,7 +109,7 @@ std::string NVRTCCompile(const std::string& code, bool include_path = false) { std::string log; log.resize(log_size); NVRTC_CALL(nvrtcGetProgramLog(prog, &log[0])); - CHECK_EQ(compile_res, NVRTC_SUCCESS) << log; + ICHECK_EQ(compile_res, NVRTC_SUCCESS) << log; size_t ptx_size; NVRTC_CALL(nvrtcGetPTXSize(prog, &ptx_size)); @@ -128,10 +128,10 @@ runtime::Module BuildCUDA(IRModule mod, Target target) { cg.Init(output_ssa); for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "CodeGenCUDA: Can only take PrimFunc"; + ICHECK(kv.second->IsInstance()) << "CodeGenCUDA: Can only take PrimFunc"; auto f = Downcast(kv.second); auto calling_conv = f->GetAttr(tvm::attr::kCallingConv); - CHECK(calling_conv == 
CallingConv::kDeviceKernelLaunch) + ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch) << "CodeGenCUDA: expect calling_conv equals CallingConv::kDeviceKernelLaunch"; cg.AddFunction(f); } diff --git a/src/target/source/codegen_aocl.cc b/src/target/source/codegen_aocl.cc index 00533d27c5a6..b3ed7cf32f7f 100644 --- a/src/target/source/codegen_aocl.cc +++ b/src/target/source/codegen_aocl.cc @@ -41,10 +41,10 @@ runtime::Module BuildAOCL(IRModule mod, Target target, bool emulation) { cg.Init(output_ssa); for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "CodegenOpenCL: Can only take PrimFunc"; + ICHECK(kv.second->IsInstance()) << "CodegenOpenCL: Can only take PrimFunc"; auto f = Downcast(kv.second); auto calling_conv = f->GetAttr(tvm::attr::kCallingConv); - CHECK(calling_conv == CallingConv::kDeviceKernelLaunch) + ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch) << "CodegenOpenCL: expect calling_conv equals CallingConv::kDeviceKernelLaunch"; cg.AddFunction(f); } diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 2f19d6e126ad..ca9b80564cd9 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -78,7 +78,8 @@ void CodeGenC::AddFunction(const PrimFunc& f) { ReserveKeywordsAsUnique(); auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) << "CodeGenC: Expect PrimFunc to have the global_symbol attribute"; + ICHECK(global_symbol.defined()) + << "CodeGenC: Expect PrimFunc to have the global_symbol attribute"; bool no_alias = f->HasNonzeroAttr(tir::attr::kNoAlias); this->PrintFuncPrefix(); @@ -187,7 +188,7 @@ std::string CodeGenC::GetBufferRef(DataType t, const VarNode* buffer, PrimExpr i // optimize for constant access if (auto* ptr = index.as()) { int64_t offset = ptr->value; - CHECK_EQ(offset % t.lanes(), 0) << "Find unaligned vector load to a vector type"; + ICHECK_EQ(offset % t.lanes(), 0) << "Find unaligned vector load to a vector type"; os << vid << '[' << (offset / t.lanes()) << ']'; return os.str(); } @@ -275,7 +276,7 @@ std::string CodeGenC::GetStructRef(DataType t, const PrimExpr& buffer, const Pri os << ')'; return os.str(); } else { - CHECK_LT(kind, builtin::kTVMValueKindBound_); + ICHECK_LT(kind, builtin::kTVMValueKindBound_); std::ostringstream os; os << "(((TVMValue*)"; this->PrintExpr(buffer, os); @@ -305,7 +306,7 @@ void CodeGenC::RegisterHandleType(const VarNode* buf_var, DataType t) { if (it == handle_data_type_.end()) { handle_data_type_[buf_var] = t; } else { - CHECK(it->second == t) << "conflicting buf var type"; + ICHECK(it->second == t) << "conflicting buf var type"; } } @@ -346,11 +347,11 @@ void CodeGenC::PrintStorageSync(const CallNode* op) { // NOLINT(*) } void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) { // NOLINT(*) - CHECK_EQ(scope, "global"); + ICHECK_EQ(scope, "global"); } void CodeGenC::PrintType(DataType t, std::ostream& os) { // NOLINT(*) - CHECK_EQ(t.lanes(), 1) << "do not yet support vector types"; + ICHECK_EQ(t.lanes(), 1) << "do not yet support vector types"; if (t.is_handle()) { os << "void*"; return; @@ -491,7 +492,7 @@ inline void PrintBinaryIntrinsic(const CallNode* op, const char* opstr, std::ostream& os, // NOLINT(*) CodeGenC* p) { if (op->dtype.lanes() == 1) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); os << '('; p->PrintExpr(op->args[0], os); os << opstr; @@ -576,7 +577,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT(*) auto call_op = 
GetRef(ptr_op); if (op->op.same_as(builtin_call_extern_) || op->op.same_as(builtin_call_pure_extern_)) { - CHECK_GE(op->args.size(), 1U); + ICHECK_GE(op->args.size(), 1U); auto func = Downcast(op->args[0]); this->PrintCallExtern(GetType(GetRef(op)), func->value, op->args, true, os); } else if (op_attr_global_symbol_.count(call_op)) { @@ -586,7 +587,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT(*) } else if (op->op.same_as(builtin::bitwise_and())) { PrintBinaryIntrinsic(op, " & ", os, this); } else if (op->op.same_as(builtin::large_uint_imm())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); uint64_t low = static_cast(Downcast(op->args[0])->value); uint64_t high = static_cast(Downcast(op->args[1])->value); uint64_t val = (high << 32U) | low; @@ -596,7 +597,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT(*) } else if (op->op.same_as(builtin::bitwise_or())) { PrintBinaryIntrinsic(op, " | ", os, this); } else if (op->op.same_as(builtin::bitwise_not())) { - CHECK_EQ(op->args.size(), 1U); + ICHECK_EQ(op->args.size(), 1U); os << "(~"; this->PrintExpr(op->args[0], os); os << ')'; @@ -614,7 +615,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT(*) os << ")"; } else if (op->op.same_as(builtin::address_of())) { const LoadNode* l = op->args[0].as(); - CHECK(op->args.size() == 1 && l); + ICHECK(op->args.size() == 1 && l); os << "(("; this->PrintType(l->dtype.element_of(), os); os << " *)" << this->GetVarID(l->buffer_var.get()) << " + " @@ -625,10 +626,10 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT(*) } os << "))"; } else if (op->op.same_as(builtin::tvm_struct_get())) { - CHECK_EQ(op->args.size(), 3U); + ICHECK_EQ(op->args.size(), 3U); os << GetStructRef(op->dtype, op->args[0], op->args[1], op->args[2].as()->value); } else if (op->op.same_as(builtin::isnullptr())) { - CHECK_EQ(op->args.size(), 1U); + ICHECK_EQ(op->args.size(), 1U); os << "("; this->PrintExpr(op->args[0], os); os << " == NULL)"; @@ -649,7 +650,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT(*) LOG(FATAL) << "Unresolved call " << op->op; } } else { - CHECK(op->op.as()); + ICHECK(op->op.as()); LOG(FATAL) << "Do not yet support cross function call"; } } @@ -678,7 +679,7 @@ void CodeGenC::VisitExpr_(const LoadNode* op, std::ostream& os) { // NOLINT(*) std::string ref = GetBufferRef(op->dtype, op->buffer_var.get(), op->index); HandleVolatileLoads(ref, op, os); } else { - CHECK(is_one(op->predicate)) << "predicated load is not supported"; + ICHECK(is_one(op->predicate)) << "predicated load is not supported"; arith::PVar base; if (arith::ramp(base, 1, op->dtype.lanes()).Match(op->index)) { @@ -722,7 +723,7 @@ void CodeGenC::VisitStmt_(const StoreNode* op) { this->PrintIndent(); stream << ref << " = " << value << ";\n"; } else { - CHECK(is_one(op->predicate)) << "Predicated store is not supported"; + ICHECK(is_one(op->predicate)) << "Predicated store is not supported"; arith::PVar base; // The assignment below introduces side-effect, and the resulting value cannot @@ -767,7 +768,7 @@ void CodeGenC::VisitStmt_(const StoreNode* op) { void CodeGenC::VisitExpr_(const LetNode* op, std::ostream& os) { // NOLINT(*) auto it = let_binding_.find(op->var); if (it != let_binding_.end()) { - CHECK(deep_equal_(it->second->value, op->value)) + ICHECK(deep_equal_(it->second->value, op->value)) << "Let cannot bind the same var to two different values"; } else { 
let_binding_[op->var] = op; @@ -779,7 +780,7 @@ void CodeGenC::VisitExpr_(const LetNode* op, std::ostream& os) { // NOLINT(*) void CodeGenC::VisitExpr_(const RampNode* op, std::ostream& os) { // NOLINT(*) // constraint of current logic - CHECK_EQ(op->base.dtype(), DataType::Int(32)); + ICHECK_EQ(op->base.dtype(), DataType::Int(32)); os << "((int" << op->lanes << ")("; for (int i = 0; i < op->lanes; i++) { os << "(" << PrintExpr(op->base) << ")" @@ -810,7 +811,7 @@ void CodeGenC::VisitExpr_(const SelectNode* op, std::ostream& os) { // NOLINT(* void CodeGenC::VisitStmt_(const LetStmtNode* op) { std::string value = PrintExpr(op->value); if (print_ssa_form_) { - CHECK(!var_idmap_.count(op->var.get())); + ICHECK(!var_idmap_.count(op->var.get())); var_idmap_[op->var.get()] = value; } else { PrintIndent(); @@ -828,12 +829,12 @@ void CodeGenC::VisitStmt_(const LetStmtNode* op) { } void CodeGenC::VisitStmt_(const AllocateNode* op) { - CHECK(!is_zero(op->condition)); + ICHECK(!is_zero(op->condition)); std::string vid = AllocVarID(op->buffer_var.get()); this->PrintIndent(); int32_t constant_size = op->constant_allocation_size(); - CHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation for now"; + ICHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation for now"; const VarNode* buffer = op->buffer_var.as(); std::string scope = alloc_storage_scope_.at(buffer); PrintStorageScope(scope, stream); @@ -854,15 +855,15 @@ void CodeGenC::VisitStmt_(const AttrStmtNode* op) { } } else if (op->attr_key == tir::attr::storage_scope) { const VarNode* v = op->node.as(); - CHECK(v); + ICHECK(v); alloc_storage_scope_[v] = op->value.as()->value; } else if (op->attr_key == tir::attr::volatile_scope) { const VarNode* v = op->node.as(); - CHECK(v); + ICHECK(v); volatile_buf_.insert(v); } else if (op->attr_key == tir::attr::pragma_import_c) { const StringImmNode* value = op->value.as(); - CHECK(value != nullptr); + ICHECK(value != nullptr); decl_stream << value->value; } this->PrintStmt(op->body); @@ -873,7 +874,7 @@ void CodeGenC::VisitStmt_(const AssertStmtNode* op) { PrintIndent(); if (const auto* str = op->message.as()) { // GLOG style check - stream << "CHECK(" << cond << ") << \"" << str->value << "\";\n"; + stream << "ICHECK(" << cond << ") << \"" << str->value << "\";\n"; } else { stream << "assert(" << cond << ");\n"; } @@ -884,7 +885,7 @@ void CodeGenC::VisitStmt_(const ForNode* op) { std::string extent = PrintExpr(op->extent); PrintIndent(); std::string vid = AllocVarID(op->loop_var.get()); - CHECK(is_zero(op->min)); + ICHECK(is_zero(op->min)); stream << "for ("; PrintType(op->loop_var.dtype(), stream); stream << ' ' << vid << " = 0; " << vid << " < " << extent << "; ++" << vid << ") {\n"; @@ -932,7 +933,7 @@ void CodeGenC::VisitStmt_(const EvaluateNode* op) { this->PrintStorageSync(call); return; } else if (call->op.same_as(builtin::tvm_struct_set())) { - CHECK_EQ(call->args.size(), 4); + ICHECK_EQ(call->args.size(), 4); std::string value = PrintExpr(call->args[3]); std::string ref = GetStructRef(call->args[3].dtype(), call->args[0], call->args[1], call->args[2].as()->value); @@ -949,7 +950,7 @@ void CodeGenC::VisitStmt_(const EvaluateNode* op) { } void CodeGenC::PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os) { - CHECK_GT(t.lanes(), 1); + ICHECK_GT(t.lanes(), 1); if (t.bits() == 8 && (t.is_int() || t.is_uint())) { if (i != 0) { os << "|"; diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 
dc93c31e7024..310dab41215b 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -49,7 +49,7 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts) { void CodeGenCHost::AddFunction(const PrimFunc& f) { auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) + ICHECK(global_symbol.defined()) << "CodeGenCHost: Expect PrimFunc to have the global_symbol attribute"; function_names_.emplace_back(global_symbol.value()); @@ -71,7 +71,7 @@ void CodeGenCHost::PrintFinalReturn() { // NOLINT(*) void CodeGenCHost::PrintType(DataType t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { - CHECK_EQ(lanes, 1) << "does not support vector types"; + ICHECK_EQ(lanes, 1) << "does not support vector types"; os << "void*"; return; } @@ -192,7 +192,7 @@ void CodeGenCHost::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT std::string stack_name = GetUniqueName("stack"); const std::string& type = op->args[0].as()->value; const IntImmNode* num = op->args[1].as(); - CHECK(num != nullptr); + ICHECK(num != nullptr); static_assert(alignof(TVMValue) % alignof(DLTensor) == 0, "invariant"); size_t unit = sizeof(TVMValue); size_t size = 0; @@ -212,18 +212,18 @@ void CodeGenCHost::VisitExpr_(const CallNode* op, std::ostream& os) { // NOLINT os << stack_name; } else if (op->op.same_as(builtin::tvm_call_packed_lowered())) { const StringImmNode* s = op->args[0].as(); - CHECK(s != nullptr) << "tvm_call_packed_lowered expects first argument as function name"; + ICHECK(s != nullptr) << "tvm_call_packed_lowered expects first argument as function name"; int64_t begin = op->args[3].as()->value; int64_t end = op->args[4].as()->value; int64_t num_args = end - begin; - CHECK_GE(num_args, 0); + ICHECK_GE(num_args, 0); std::string func_name = s->value; // NOTE: cannot rely on GetUnique for global decl_stream declarations // because it is reset between AddFunction(). std::string packed_func_name = func_name + "_packed"; if (declared_globals_.insert(packed_func_name).second) { // Still reserve the name among unique names. 
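// The check below reserves the packed name; more broadly, every hunk in this
// patch performs the same migration from CHECK to ICHECK, TVM's assert for
// internal invariants whose failure indicates a compiler bug rather than bad
// user input. A minimal, hypothetical sketch of the idiom (TVM's real macros
// live in the logging headers and are more elaborate; InternalChecker and
// ICHECK_SKETCH are stand-in names, not TVM's):
#include <cstdlib>
#include <iostream>
#include <sstream>

class InternalChecker {
 public:
  InternalChecker(const char* file, int line) { os_ << file << ':' << line << ": InternalError: "; }
  std::ostringstream& stream() { return os_; }
  ~InternalChecker() {
    // Fires when the temporary dies at the end of the failing statement.
    std::cerr << os_.str() << "\n(internal invariant violated; please report a bug)\n";
    std::abort();
  }
 private:
  std::ostringstream os_;
};

#define ICHECK_SKETCH(cond) \
  if (!(cond)) InternalChecker(__FILE__, __LINE__).stream() << "Check failed: " #cond ": "

int main() {
  int lanes = 2;
  ICHECK_SKETCH(lanes > 0) << "never printed";                     // passes silently
  ICHECK_SKETCH(lanes == 1) << "do not yet support vector types";  // aborts with message
}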
- CHECK(GetUniqueName(packed_func_name) == packed_func_name) + ICHECK(GetUniqueName(packed_func_name) == packed_func_name) << "Expected name " << packed_func_name << " to not be taken"; decl_stream << "static void* " << packed_func_name << " = NULL;\n"; } @@ -307,13 +307,13 @@ runtime::Module BuildCHost(IRModule mod, Target target) { cg.Init(output_ssa, emit_asserts); for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "CodegenCHost: Can only take PrimFunc"; + ICHECK(kv.second->IsInstance()) << "CodegenCHost: Can only take PrimFunc"; auto f = Downcast(kv.second); cg.AddFunction(f); } if (target->GetAttr("system-lib").value_or(Bool(false))) { - CHECK_EQ(target->GetAttr("runtime").value_or(""), "c") + ICHECK_EQ(target->GetAttr("runtime").value_or(""), "c") << "c target only supports generating C runtime SystemLibs"; cg.GenerateFuncRegistry(); cg.GenerateCrtSystemLib(); diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index d57efa007272..51fcbb633de7 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -41,7 +41,7 @@ void CodeGenCUDA::Init(bool output_ssa) { CodeGenC::Init(output_ssa); vid_global_barrier_state_ = GetUniqueName(runtime::symbol::tvm_global_barrier_state); vid_global_barrier_expect_ = GetUniqueName("__barrier_expect"); - CHECK_EQ(vid_global_barrier_state_, runtime::symbol::tvm_global_barrier_state); + ICHECK_EQ(vid_global_barrier_state_, runtime::symbol::tvm_global_barrier_state); } void CodeGenCUDA::PrintFuncPrefix() { stream << "extern \"C\" __global__ void"; } @@ -83,7 +83,7 @@ std::string CodeGenCUDA::Finish() { } void CodeGenCUDA::VisitStmt_(const tir::ForNode* op) { - CHECK(is_const_int(op->min, 0)); + ICHECK(is_const_int(op->min, 0)); if (op->for_type == tir::ForType::Unrolled) { PrintIndent(); stream << "#pragma unroll\n"; @@ -92,14 +92,14 @@ void CodeGenCUDA::VisitStmt_(const tir::ForNode* op) { } void CodeGenCUDA::BindThreadIndex(const IterVar& iv) { - CHECK(!var_idmap_.count(iv->var.get())); + ICHECK(!var_idmap_.count(iv->var.get())); var_idmap_[iv->var.get()] = CastFromTo(iv->thread_tag, DataType::UInt(32), iv->var.dtype()); } void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { - CHECK_EQ(lanes, 1) << "do not yet support vector types"; + ICHECK_EQ(lanes, 1) << "do not yet support vector types"; os << "void*"; return; } @@ -120,7 +120,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) // h4.z is emitted as *(half2*)(&(u2.y)).x // h4.w is emitted as *(half2*)(&(u2.y)).y // - CHECK_EQ(lanes % 2, 0) << "only support even lane for half type"; + ICHECK_EQ(lanes % 2, 0) << "only support even lane for half type"; os << "uint" << lanes / 2; } else { fail = true; @@ -308,7 +308,7 @@ void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i, } static const char access[] = {'x', 'y', 'z', 'w'}; - CHECK(i >= 0 && i < (t.is_float16() ? 8 : 4)); + ICHECK(i >= 0 && i < (t.is_float16() ? 8 : 4)); if ((t.is_int()) && t.bits() == 8) { if (t.lanes() == 2 || t.lanes() == 3) { os << vec << "." << access[i % t.lanes()]; @@ -332,7 +332,7 @@ void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DataType t, int i, const std::string& value) { this->PrintIndent(); static const char access[] = {'x', 'y', 'z', 'w'}; - CHECK(i >= 0 && i < (t.is_float16() ? 8 : 4)); + ICHECK(i >= 0 && i < (t.is_float16() ? 
8 : 4)); if (t.bits() == 8 && (t.is_int() || t.is_uint())) { if (t.lanes() == 2 || t.lanes() == 3) { stream << vec << '.' << access[i % t.lanes()] << "=" @@ -394,8 +394,8 @@ void CodeGenCUDA::PrintStorageSync(const CallNode* op) { } void CodeGenCUDA::PrintStorageScope(const std::string& scope, std::ostream& os) { // NOLINT(*) - CHECK_NE(scope, "global") << "Cannot allocate global memory when targeting CUDA. You must pass " - "all global arrays as input instead"; + ICHECK_NE(scope, "global") << "Cannot allocate global memory when targeting CUDA. You must pass " + "all global arrays as input instead"; if (scope == "shared") { os << "__shared__ "; } @@ -404,7 +404,7 @@ void CodeGenCUDA::PrintStorageScope(const std::string& scope, std::ostream& os) void CodeGenCUDA::VisitExpr_(const CastNode* op, std::ostream& os) { DataType from_ty = op->value.dtype(); DataType target_ty = op->dtype; - CHECK_EQ(target_ty.lanes(), from_ty.lanes()); + ICHECK_EQ(target_ty.lanes(), from_ty.lanes()); // Emit simple C-style type conversion. if (from_ty.is_scalar()) return CodeGenC::VisitExpr_(op, os); @@ -496,7 +496,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) { if (op->op.same_as(builtin::tvm_fill_fragment())) { need_mma_h_ = true; - CHECK_EQ(op->args.size(), 6U); + ICHECK_EQ(op->args.size(), 6U); os << "nvcuda::wmma::fill_fragment("; this->PrintExpr(op->args[0], os); os << "["; @@ -506,7 +506,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) { os << ")"; } else if (op->op.same_as(builtin::tvm_load_matrix_sync())) { need_mma_h_ = true; - CHECK_EQ(op->args.size(), 8U); + ICHECK_EQ(op->args.size(), 8U); os << "nvcuda::wmma::load_matrix_sync("; this->PrintExpr(op->args[0], os); os << "["; @@ -518,7 +518,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) { os << ")"; } else if (op->op.same_as(builtin::tvm_store_matrix_sync())) { need_mma_h_ = true; - CHECK_EQ(op->args.size(), 8U); + ICHECK_EQ(op->args.size(), 8U); os << "nvcuda::wmma::store_matrix_sync("; this->PrintExpr(op->args[5], os); os << ", "; @@ -535,7 +535,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) { os << ")"; } else if (op->op.same_as(builtin::tvm_mma_sync())) { need_mma_h_ = true; - CHECK_EQ(op->args.size(), 8U); + ICHECK_EQ(op->args.size(), 8U); os << "nvcuda::wmma::mma_sync("; for (int i = 0; i < 4; ++i) { this->PrintExpr(op->args[i * 2], os); @@ -545,7 +545,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) { } } else if (op->op.same_as(builtin::tvm_bmma_sync())) { need_mma_h_ = true; - CHECK_EQ(op->args.size(), 8U); + ICHECK_EQ(op->args.size(), 8U); os << "nvcuda::wmma::bmma_sync("; for (int i = 0; i < 4; ++i) { this->PrintExpr(op->args[i * 2], os); @@ -572,24 +572,24 @@ void CodeGenCUDA::VisitStmt_(const AttrStmtNode* op) { } void CodeGenCUDA::VisitStmt_(const AllocateNode* op) { - CHECK(!is_zero(op->condition)); + ICHECK(!is_zero(op->condition)); std::string vid = AllocVarID(op->buffer_var.get()); this->PrintIndent(); int32_t constant_size = op->constant_allocation_size(); - CHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation for now"; + ICHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation for now"; const VarNode* buffer = op->buffer_var.as(); std::string scope = alloc_storage_scope_.at(buffer); if (scope.find("wmma.") == 0) { if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") { - CHECK(op->dtype == DataType::Float(16) || op->dtype == DataType::Int(8) || - op->dtype == 
DataType::UInt(8) || op->dtype == DataType::Int(4) || - op->dtype == DataType::UInt(4) || op->dtype == DataType::Int(1)) + ICHECK(op->dtype == DataType::Float(16) || op->dtype == DataType::Int(8) || + op->dtype == DataType::UInt(8) || op->dtype == DataType::Int(4) || + op->dtype == DataType::UInt(4) || op->dtype == DataType::Int(1)) << "Matrix_a and matrix_b only support half or char or unsigned char " << "or uint4 or int4 or int1 type for now"; } else { - CHECK(op->dtype == DataType::Float(16) || op->dtype == DataType::Float(32) || - op->dtype == DataType::Int(32)) + ICHECK(op->dtype == DataType::Float(16) || op->dtype == DataType::Float(32) || + op->dtype == DataType::Int(32)) << "Accumulator only support half, float and int type for now"; } constant_size = GetWmmaFragmentSize(scope, buffer, constant_size); @@ -640,7 +640,7 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) { // NO if ((op->dtype.is_int() || op->dtype.is_uint()) && op->dtype.bits() == 8 && op->lanes == 4) { // make_int8x4 const int64_t* p = as_const_int(op->value); - CHECK(p); + ICHECK(p); int64_t v = *p & 0xFF; v = (v << 24) | (v << 16) | (v << 8) | v; if (op->dtype.is_uint()) { @@ -678,7 +678,7 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) { // NO void CodeGenCUDA::VisitExpr_(const ShuffleNode* op, std::ostream& os) { std::vector to_shuffle(op->vectors.size()); for (int i = 0, e = op->vectors.size(); i < e; ++i) { - CHECK(op->vectors[i].dtype().lanes() == 1) << "Only scalars can be shuffled in CUDA!"; + ICHECK(op->vectors[i].dtype().lanes() == 1) << "Only scalars can be shuffled in CUDA!"; to_shuffle[i] = PrintExpr(op->vectors[i]); } os << "make_"; @@ -686,7 +686,7 @@ void CodeGenCUDA::VisitExpr_(const ShuffleNode* op, std::ostream& os) { os << '('; for (int i = 0, e = op->indices.size(); i < e; ++i) { const int64_t* val = as_const_int(op->indices[i]); - CHECK(val && *val >= 0 && (int)*val < (int)to_shuffle.size()); + ICHECK(val && *val >= 0 && (int)*val < (int)to_shuffle.size()); if (i != 0) os << ", "; os << to_shuffle[*val]; } @@ -701,8 +701,8 @@ void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream& os) { } // Codegen vector condition case by serializing the select op. - CHECK(op->false_value->dtype == op->dtype && op->true_value->dtype == op->dtype && - op->dtype.lanes() == op->condition.dtype().lanes()); + ICHECK(op->false_value->dtype == op->dtype && op->true_value->dtype == op->dtype && + op->dtype.lanes() == op->condition.dtype().lanes()); std::string r_var = GetUniqueName("_"); this->PrintIndent(); @@ -846,7 +846,7 @@ void CodeGenCUDA::HandleVolatileLoads(const std::string& value, const LoadNode* void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os) { - CHECK_GT(t.lanes(), 1); + ICHECK_GT(t.lanes(), 1); if (t.bits() == 8 && (t.is_int() || t.is_uint())) { if (!(t.lanes() == 2 || t.lanes() == 3)) { if (i != 0) { diff --git a/src/target/source/codegen_metal.cc b/src/target/source/codegen_metal.cc index fb235d2d785d..7b69e8fbb903 100644 --- a/src/target/source/codegen_metal.cc +++ b/src/target/source/codegen_metal.cc @@ -59,7 +59,8 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) { // add to alloc buffer type. auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) << "CodeGenC: Expect PrimFunc to have the global_symbol attribute"; + ICHECK(global_symbol.defined()) + << "CodeGenC: Expect PrimFunc to have the global_symbol attribute"; // Function header. 
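// The statement that follows prints a Metal compute-kernel signature. A
// standalone toy version of that emission is sketched below; EmitMetalHeader
// and the fixed "device float*" parameter type are illustrative assumptions,
// not what CodeGenMetal actually derives per argument.
#include <iostream>
#include <string>
#include <vector>

std::string EmitMetalHeader(const std::string& symbol, const std::vector<std::string>& buffers) {
  std::string s = "kernel void " + symbol + "(";
  for (size_t i = 0; i < buffers.size(); ++i) {
    if (i != 0) s += ", ";
    // Each buffer argument is bound to an explicit [[ buffer(i) ]] slot.
    s += "device float* " + buffers[i] + " [[ buffer(" + std::to_string(i) + ") ]]";
  }
  return s + ")";
}

int main() {
  // kernel void myadd_kernel0(device float* A [[ buffer(0) ]], ...)
  std::cout << EmitMetalHeader("myadd_kernel0", {"A", "B", "C"}) << '\n';
}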
this->stream << "kernel void " << static_cast(global_symbol.value()) << "("; @@ -97,7 +98,7 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) { decl_stream << "struct " << arg_buf_type << " {\n"; for (size_t i = num_buffer; i < f->params.size(); ++i) { Var v = f->params[i]; - CHECK(!v.dtype().is_handle()); + ICHECK(!v.dtype().is_handle()); std::string vid = AllocVarID(v.get()); std::ostringstream vref; if (v.dtype().bits() == 32) { @@ -116,8 +117,8 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) { decl_stream << "};\n\n"; } // Setup the thread group info. - CHECK_EQ(GetUniqueName("threadIdx"), "threadIdx"); - CHECK_EQ(GetUniqueName("blockIdx"), "blockIdx"); + ICHECK_EQ(GetUniqueName("threadIdx"), "threadIdx"); + ICHECK_EQ(GetUniqueName("blockIdx"), "blockIdx"); int work_dim = 0; auto thread_axis = f->GetAttr>(tir::attr::kDeviceThreadAxis).value(); @@ -136,7 +137,7 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) { } // bind thread axis for (IterVar iv : thread_axis) { - CHECK(!var_idmap_.count(iv->var.get())); + ICHECK(!var_idmap_.count(iv->var.get())); std::string vname = iv->thread_tag; if (work_dim <= 1) { vname = vname.substr(0, iv->thread_tag.length() - 2); @@ -154,7 +155,7 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) { } void CodeGenMetal::BindThreadIndex(const IterVar& iv) { - CHECK(!var_idmap_.count(iv->var.get())); + ICHECK(!var_idmap_.count(iv->var.get())); var_idmap_[iv->var.get()] = CastFromTo(iv->thread_tag, DataType::UInt(thread_index_bits_), iv->var.dtype()); } @@ -162,7 +163,7 @@ void CodeGenMetal::BindThreadIndex(const IterVar& iv) { void CodeGenMetal::PrintType(DataType t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { - CHECK_EQ(lanes, 1) << "do not yet support vector types"; + ICHECK_EQ(lanes, 1) << "do not yet support vector types"; os << "void*"; return; } @@ -289,10 +290,10 @@ runtime::Module BuildMetal(IRModule mod, Target target) { cg.Init(output_ssa); for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "CodeGenMetal: Can only take PrimFunc"; + ICHECK(kv.second->IsInstance()) << "CodeGenMetal: Can only take PrimFunc"; auto f = Downcast(kv.second); auto calling_conv = f->GetAttr(tvm::attr::kCallingConv); - CHECK(calling_conv == CallingConv::kDeviceKernelLaunch) + ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch) << "CodeGenMetal: expect calling_conv equals CallingConv::kDeviceKernelLaunch"; cg.AddFunction(f); } diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 10cc007c4572..0f79df37701c 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -79,7 +79,7 @@ std::string CodeGenOpenCL::Finish() { } void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) { - CHECK(!var_idmap_.count(iv->var.get())); + ICHECK(!var_idmap_.count(iv->var.get())); runtime::ThreadScope ts = runtime::ThreadScope::Create(iv->thread_tag); std::ostringstream os; if (ts.rank == 1) { @@ -93,7 +93,7 @@ void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) { void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { - CHECK_EQ(lanes, 1) << "do not yet support vector types"; + ICHECK_EQ(lanes, 1) << "do not yet support vector types"; os << "void*"; return; } @@ -233,7 +233,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { if (op->op.same_as(builtin::address_of())) { // Overload tvm_address_of to add storage scope (e.g. __global). 
const LoadNode* load = op->args[0].as(); - CHECK(op->args.size() == 1 && load); + ICHECK(op->args.size() == 1 && load); os << "(("; auto it = alloc_storage_scope_.find(load->buffer_var.get()); if (it != alloc_storage_scope_.end()) { @@ -287,10 +287,10 @@ runtime::Module BuildOpenCL(IRModule mod, Target target) { cg.Init(output_ssa); for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "CodeGenOpenCL: Can only take PrimFunc"; + ICHECK(kv.second->IsInstance()) << "CodeGenOpenCL: Can only take PrimFunc"; auto f = Downcast(kv.second); auto calling_conv = f->GetAttr(tvm::attr::kCallingConv); - CHECK(calling_conv == CallingConv::kDeviceKernelLaunch) + ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch) << "CodeGenOpenCL: expect calling_conv equals CallingConv::kDeviceKernelLaunch"; cg.AddFunction(f); } diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc index 9b2f0345864f..9f0cf9a70b61 100644 --- a/src/target/source/codegen_source_base.cc +++ b/src/target/source/codegen_source_base.cc @@ -70,7 +70,7 @@ std::string CodeGenSourceBase::SSAGetID(std::string src, DataType t) { } std::string CodeGenSourceBase::AllocVarID(const tir::VarNode* v) { - CHECK(!var_idmap_.count(v)) << "Need input to be in SSA form dup " << v->name_hint; + ICHECK(!var_idmap_.count(v)) << "Need input to be in SSA form dup " << v->name_hint; std::string key = v->name_hint; std::string vid = GetUniqueName(key); var_idmap_[v] = vid; @@ -79,7 +79,7 @@ std::string CodeGenSourceBase::AllocVarID(const tir::VarNode* v) { std::string CodeGenSourceBase::GetVarID(const tir::VarNode* v) const { auto it = var_idmap_.find(v); - CHECK(it != var_idmap_.end()) << "Find undefined Variable " << v->name_hint; + ICHECK(it != var_idmap_.end()) << "Find undefined Variable " << v->name_hint; return it->second; } @@ -97,7 +97,7 @@ void CodeGenSourceBase::MarkConst(std::string vid) { e.scope_id = 0; ssa_assign_map_[vid] = e; } else { - CHECK_EQ(it->second.vid, vid); + ICHECK_EQ(it->second.vid, vid); } } diff --git a/src/target/source/codegen_vhls.cc b/src/target/source/codegen_vhls.cc index 9401f0682db8..9896d8b833f9 100644 --- a/src/target/source/codegen_vhls.cc +++ b/src/target/source/codegen_vhls.cc @@ -146,10 +146,10 @@ runtime::Module BuildSDAccel(IRModule mod, Target target) { cg.Init(output_ssa); for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "CodeGenVHLS: Can only take PrimFunc"; + ICHECK(kv.second->IsInstance()) << "CodeGenVHLS: Can only take PrimFunc"; auto f = Downcast(kv.second); auto calling_conv = f->GetAttr(tvm::attr::kCallingConv); - CHECK(calling_conv == CallingConv::kDeviceKernelLaunch) + ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch) << "CodeGenVLHS: expect calling_conv equals CallingConv::kDeviceKernelLaunch"; cg.AddFunction(f); } @@ -160,7 +160,7 @@ runtime::Module BuildSDAccel(IRModule mod, Target target) { Array > kernel_info; for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "CodeGenOpenCL: Can only take PrimFunc"; + ICHECK(kv.second->IsInstance()) << "CodeGenOpenCL: Can only take PrimFunc"; auto f = Downcast(kv.second); CodeGenVivadoHLS cg; cg.Init(output_ssa); @@ -171,7 +171,7 @@ runtime::Module BuildSDAccel(IRModule mod, Target target) { } auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) + ICHECK(global_symbol.defined()) << "CodeGenC: Expect PrimFunc to have the global_symbol attribute"; kernel_info.push_back({global_symbol.value(), code}); } diff --git 
a/src/target/source/intrin_rule_cuda.cc b/src/target/source/intrin_rule_cuda.cc index 9ffceb68e278..0a68736bcd05 100644 --- a/src/target/source/intrin_rule_cuda.cc +++ b/src/target/source/intrin_rule_cuda.cc @@ -102,7 +102,7 @@ struct CUDAWarpIntrinsic { } else if (orig_op.same_as(builtin::tvm_warp_shuffle_up())) { return Op::Get("tir.cuda.__shfl_up_sync"); } else { - CHECK(orig_op.same_as(builtin::tvm_warp_shuffle_down())); + ICHECK(orig_op.same_as(builtin::tvm_warp_shuffle_down())); return Op::Get("tir.cuda.__shfl_down_sync"); } } @@ -117,8 +117,8 @@ template static void DispatchCUDAShuffle(const TVMArgs& args, TVMRetValue* rv) { PrimExpr e = args[0]; const CallNode* call = e.as(); - CHECK(call != nullptr); - CHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size + ICHECK(call != nullptr); + ICHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size Array cuda_args{{call->args[0], call->args[1], call->args[2], call->args[3]}}; *rv = Call(call->dtype, T()(call->dtype, Downcast(call->op)), cuda_args); diff --git a/src/target/source/intrin_rule_opencl.cc b/src/target/source/intrin_rule_opencl.cc index 7f81e335ec8d..54da5c74ab02 100644 --- a/src/target/source/intrin_rule_opencl.cc +++ b/src/target/source/intrin_rule_opencl.cc @@ -74,10 +74,10 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.cosh").set_body(DispatchPureExtern(); - CHECK(call != nullptr); - CHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size + ICHECK(call != nullptr); + ICHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size arith::Analyzer analyzer; - CHECK(analyzer.CanProve(call->args[3] == call->args[4])) + ICHECK(analyzer.CanProve(call->args[3] == call->args[4])) << "Intel warp shuffle does not support width != warp_size"; Array opencl_args{{StringImm("intel_sub_group_shuffle"), call->args[1], call->args[2]}}; *rv = Call(call->dtype, builtin::call_pure_extern(), opencl_args); diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index e1ee1539d986..3be658aa0125 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -67,7 +67,7 @@ runtime::Module CreateMetadataModule( for (size_t i = 0; i < variables.size(); i++) { arrays.push_back(variables[i].operator std::string()); } - CHECK_EQ(sym_metadata.count(symbol), 0U) << "Found duplicated symbol: " << symbol; + ICHECK_EQ(sym_metadata.count(symbol), 0U) << "Found duplicated symbol: " << symbol; sym_metadata[symbol] = arrays; } } @@ -132,10 +132,10 @@ class CSourceModuleNode : public runtime::ModuleNode { std::string fmt = GetFileFormat(file_name, format); std::string meta_file = GetMetaFilePath(file_name); if (fmt == "cc") { - CHECK_NE(code_.length(), 0); + ICHECK_NE(code_.length(), 0); SaveBinaryToFile(file_name, code_); } else { - CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; + ICHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; } } @@ -179,7 +179,7 @@ class DeviceSourceModuleNode final : public runtime::ModuleNode { void SaveToFile(const std::string& file_name, const std::string& format) final { std::string fmt = GetFileFormat(file_name, format); - CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; + ICHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; std::string meta_file = GetMetaFilePath(file_name); SaveMetaDataToFile(meta_file, fmap_); SaveBinaryToFile(file_name, data_); diff --git a/src/target/spirv/build_vulkan.cc b/src/target/spirv/build_vulkan.cc index 1eef2f8f88e5..a0f0b76eefbd 100644 ---
a/src/target/spirv/build_vulkan.cc +++ b/src/target/spirv/build_vulkan.cc @@ -49,10 +49,10 @@ class SPIRVTools { SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES | SPV_BINARY_TO_TEXT_OPTION_INDENT, &text, &diagnostic); - CHECK_EQ(res, SPV_SUCCESS) << " line=" << diagnostic->position.line - << " column=" << diagnostic->position.column - << " index=" << diagnostic->position.index - << " error:" << diagnostic->error; + ICHECK_EQ(res, SPV_SUCCESS) << " line=" << diagnostic->position.line + << " column=" << diagnostic->position.column + << " index=" << diagnostic->position.index + << " error:" << diagnostic->error; std::string ret(text->str); spvTextDestroy(text); @@ -78,13 +78,13 @@ runtime::Module BuildSPIRV(IRModule mod, Target target, bool webgpu_restriction) CodeGenSPIRV cg; for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "CodeGenSPIRV: Can only take PrimFunc"; + ICHECK(kv.second->IsInstance()) << "CodeGenSPIRV: Can only take PrimFunc"; auto f = Downcast(kv.second); auto calling_conv = f->GetAttr(tvm::attr::kCallingConv); - CHECK(calling_conv == CallingConv::kDeviceKernelLaunch) + ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch) << "CodeGenSPIRV: expect calling_conv equals CallingConv::kDeviceKernelLaunch"; auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) + ICHECK(global_symbol.defined()) << "CodeGenSPIRV: Expect PrimFunc to have the global_symbol attribute"; std::string f_name = global_symbol.value(); @@ -95,7 +95,7 @@ runtime::Module BuildSPIRV(IRModule mod, Target target, bool webgpu_restriction) if (webgpu_restriction) { for (auto param : f->params) { - CHECK(param.dtype().is_handle()) << "WebGPU does not yet support non-buffer arguments"; + ICHECK(param.dtype().is_handle()) << "WebGPU does not yet support non-buffer arguments"; } } @@ -104,7 +104,7 @@ runtime::Module BuildSPIRV(IRModule mod, Target target, bool webgpu_restriction) arr.data = reinterpret_cast(dmlc::BeginPtr(shader.data)); arr.size = shader.data.size() * sizeof(uint32_t); std::string transformed = (*postproc)(arr); - CHECK_EQ(transformed.length() % 4U, 0U); + ICHECK_EQ(transformed.length() % 4U, 0U); shader.data.resize(transformed.size() / 4U); std::copy(transformed.begin(), transformed.end(), reinterpret_cast(dmlc::BeginPtr(shader.data))); diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc index 2a67d953f960..c3b12ab943c6 100644 --- a/src/target/spirv/codegen_spirv.cc +++ b/src/target/spirv/codegen_spirv.cc @@ -35,7 +35,7 @@ namespace codegen { std::vector CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::string& name) { this->InitFuncState(); - CHECK(f->HasNonzeroAttr(tir::attr::kNoAlias)) << "SPIRV only takes restricted memory model"; + ICHECK(f->HasNonzeroAttr(tir::attr::kNoAlias)) << "SPIRV only takes restricted memory model"; std::vector pod_args; uint32_t num_buffer = 0; @@ -44,7 +44,7 @@ std::vector CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std:: if (t.is_handle()) { if (auto* ptr = arg->type_annotation.as()) { auto* prim = ptr->element_type.as(); - CHECK(prim); + ICHECK(prim); DataType value_type = prim->dtype; spirv::Value arg_value = builder_->BufferArgument(builder_->GetSType(value_type), 0, num_buffer); @@ -98,9 +98,9 @@ spirv::Value CodeGenSPIRV::GetThreadIndex(const IterVar& iv, const PrimExpr& ext if (ts.rank == 1) { v = builder_->GetLocalID(ts.dim_index); auto* sizeptr = extent.as(); - CHECK(sizeptr) << "SPIRV only allows constant thread group size " - << " get " << extent; - 
CHECK_LT(ts.dim_index, 3); + ICHECK(sizeptr) << "SPIRV only allows constant thread group size " + << " get " << extent; + ICHECK_LT(ts.dim_index, 3); workgroup_size_[ts.dim_index] = static_cast(sizeptr->value); } else { v = builder_->GetWorkgroupID(ts.dim_index); @@ -130,7 +130,7 @@ spirv::Value CodeGenSPIRV::CreateStorageSync(const CallNode* op) { spirv::Value CodeGenSPIRV::VisitExpr_(const VarNode* op) { auto it = var_map_.find(op); - CHECK(it != var_map_.end()) << "cannot find variable " << op->name_hint; + ICHECK(it != var_map_.end()) << "cannot find variable " << op->name_hint; return it->second; } @@ -232,7 +232,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const SelectNode* op) { spirv::Value CodeGenSPIRV::VisitExpr_(const LetNode* op) { auto it = let_binding_.find(op->var); if (it != let_binding_.end()) { - CHECK(deep_equal_(it->second->value, op->value)) + ICHECK(deep_equal_(it->second->value, op->value)) << "Let cannot bind the same var to two different values"; } else { let_binding_[op->var] = op; @@ -244,7 +244,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const LetNode* op) { spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) { if (op->op.same_as(builtin::call_spirv_pure_glsl450())) { - CHECK_GE(op->args.size(), 2U); + ICHECK_GE(op->args.size(), 2U); uint32_t inst_id = static_cast(op->args[0].as()->value); std::vector values; for (size_t i = 1; i < op->args.size(); ++i) { @@ -252,31 +252,31 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) { } return builder_->CallGLSL450(builder_->GetSType(op->dtype), inst_id, values); } else if (op->op.same_as(builtin::bitwise_and())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); spirv::Value a = MakeValue(op->args[0]); spirv::Value b = MakeValue(op->args[1]); return builder_->MakeValue(spv::OpBitwiseAnd, a.stype, a, b); } else if (op->op.same_as(builtin::bitwise_xor())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); spirv::Value a = MakeValue(op->args[0]); spirv::Value b = MakeValue(op->args[1]); return builder_->MakeValue(spv::OpBitwiseXor, a.stype, a, b); } else if (op->op.same_as(builtin::bitwise_or())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); spirv::Value a = MakeValue(op->args[0]); spirv::Value b = MakeValue(op->args[1]); return builder_->MakeValue(spv::OpBitwiseOr, a.stype, a, b); } else if (op->op.same_as(builtin::bitwise_not())) { - CHECK_EQ(op->args.size(), 1U); + ICHECK_EQ(op->args.size(), 1U); spirv::Value a = MakeValue(op->args[0]); return builder_->MakeValue(spv::OpNot, a.stype, a); } else if (op->op.same_as(builtin::shift_left())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); spirv::Value a = MakeValue(op->args[0]); spirv::Value b = MakeValue(op->args[1]); return builder_->MakeValue(spv::OpShiftLeftLogical, a.stype, a, b); } else if (op->op.same_as(builtin::shift_right())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); spirv::Value a = MakeValue(op->args[0]); spirv::Value b = MakeValue(op->args[1]); if (op->args[0].dtype().is_int()) { @@ -288,7 +288,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) { return builder_->MakeValue(spv::OpBitcast, builder_->GetSType(op->dtype), MakeValue(op->args[0])); } else if (op->op.same_as(builtin::large_uint_imm())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); uint64_t low = static_cast(Downcast(op->args[0])->value); uint64_t high = static_cast(Downcast(op->args[1])->value); uint64_t val = (high << 32U) | low; @@ -296,7 +296,7 
@@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) { } else if (op->op.same_as(builtin::tvm_storage_sync())) { return this->CreateStorageSync(op); } else if (op->op.same_as(builtin::if_then_else())) { - CHECK_EQ(op->args.size(), 3U); + ICHECK_EQ(op->args.size(), 3U); spirv::Value cond = MakeValue(op->args[0]); spirv::Label then_label = builder_->NewLabel(); spirv::Label else_label = builder_->NewLabel(); @@ -352,9 +352,9 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const BroadcastNode* op) { } spirv::Value CodeGenSPIRV::VisitExpr_(const LoadNode* op) { - CHECK(is_one(op->predicate)); + ICHECK(is_one(op->predicate)); auto it = storage_info_.find(op->buffer_var.get()); - CHECK(it != storage_info_.end()); + ICHECK(it != storage_info_.end()); StorageInfo& info = it->second; if (!info.content_fixed) { info.UpdateContentType(op->dtype); @@ -369,7 +369,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const LoadNode* op) { mask |= spv::MemoryAccessVolatileMask; } if (op->dtype.lanes() == 1) { - CHECK_EQ(info.content_type, op->dtype) + ICHECK_EQ(info.content_type, op->dtype) << "Vulkan only allow one type access to the same buffer"; spirv::Value index = MakeValue(op->index); spirv::Value ptr = builder_->StructArrayAccess(ptr_type, buffer, index); @@ -387,9 +387,9 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const LoadNode* op) { } else { if (const RampNode* ramp = op->index.as()) { if (is_one(ramp->stride)) { - CHECK_EQ(ramp->lanes, op->dtype.lanes()); + ICHECK_EQ(ramp->lanes, op->dtype.lanes()); arith::ModularSet me = analyzer_->modular_set(ramp->base); - CHECK((me->coeff % ramp->lanes) == 0 && (me->base % ramp->lanes) == 0) + ICHECK((me->coeff % ramp->lanes) == 0 && (me->base % ramp->lanes) == 0) << "Only aligned vector access is allowed in SPIRV"; PrimExpr vec_index = analyzer_->Simplify(ramp->base / make_const(ramp->base.dtype(), ramp->lanes)); @@ -420,9 +420,9 @@ void CodeGenSPIRV::Scalarize(const PrimExpr& e, std::functionpredicate)); + ICHECK(is_one(op->predicate)); auto it = storage_info_.find(op->buffer_var.get()); - CHECK(it != storage_info_.end()); + ICHECK(it != storage_info_.end()); StorageInfo& info = it->second; if (!info.content_fixed) { @@ -440,7 +440,7 @@ void CodeGenSPIRV::VisitStmt_(const StoreNode* op) { } if (op->value.dtype().lanes() == 1) { - CHECK_EQ(info.content_type, op->value.dtype()) + ICHECK_EQ(info.content_type, op->value.dtype()) << "Vulkan only allow one type access to the same buffer"; spirv::Value index = MakeValue(op->index); spirv::Value ptr = builder_->StructArrayAccess(ptr_type, buffer, index); @@ -457,9 +457,9 @@ void CodeGenSPIRV::VisitStmt_(const StoreNode* op) { } else { if (const RampNode* ramp = op->index.as()) { if (is_one(ramp->stride)) { - CHECK_EQ(ramp->lanes, op->value.dtype().lanes()); + ICHECK_EQ(ramp->lanes, op->value.dtype().lanes()); arith::ModularSet me = analyzer_->modular_set(ramp->base); - CHECK((me->coeff % ramp->lanes) == 0 && (me->base % ramp->lanes) == 0) + ICHECK((me->coeff % ramp->lanes) == 0 && (me->base % ramp->lanes) == 0) << "Only aligned vector access is allowed in SPIRV"; PrimExpr vec_index = analyzer_->Simplify(ramp->base / make_const(ramp->base.dtype(), ramp->lanes)); @@ -474,7 +474,7 @@ void CodeGenSPIRV::VisitStmt_(const StoreNode* op) { } void CodeGenSPIRV::VisitStmt_(const ForNode* op) { - CHECK(is_zero(op->min)); + ICHECK(is_zero(op->min)); analyzer_->Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent)); spirv::Value init_value = MakeValue(op->min); spirv::Value extent_value = MakeValue(op->extent); @@ -544,10 
+544,10 @@ void CodeGenSPIRV::VisitStmt_(const IfThenElseNode* op) { } void CodeGenSPIRV::VisitStmt_(const AllocateNode* op) { - CHECK(!is_zero(op->condition)); - CHECK(!op->dtype.is_handle()); + ICHECK(!is_zero(op->condition)); + ICHECK(!op->dtype.is_handle()); int32_t constant_size = op->constant_allocation_size(); - CHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation in GPU"; + ICHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation in GPU"; spirv::Value buf; StorageInfo& info = storage_info_[op->buffer_var.get()]; spirv::SType etype = builder_->GetSType(op->dtype); @@ -556,15 +556,15 @@ void CodeGenSPIRV::VisitStmt_(const AllocateNode* op) { builder_->Allocate(etype, static_cast(constant_size), spv::StorageClassFunction); } else { // shared memory - CHECK(info.scope.rank == runtime::StorageRank::kShared) + ICHECK(info.scope.rank == runtime::StorageRank::kShared) << "Can only allocate shared or local memory inside kernel"; // Shared memory buf = builder_->Allocate(etype, static_cast(constant_size), spv::StorageClassWorkgroup); } - CHECK(!info.content_fixed); + ICHECK(!info.content_fixed); info.UpdateContentType(op->dtype); - CHECK(!var_map_.count(op->buffer_var.get())); + ICHECK(!var_map_.count(op->buffer_var.get())); var_map_[op->buffer_var.get()] = buf; this->VisitStmt(op->body); } @@ -580,11 +580,11 @@ void CodeGenSPIRV::VisitStmt_(const AttrStmtNode* op) { } } else if (op->attr_key == tir::attr::storage_scope) { const VarNode* v = op->node.as(); - CHECK(v); + ICHECK(v); storage_info_[v].scope = runtime::StorageScope::Create(op->value.as()->value); } else if (op->attr_key == tir::attr::volatile_scope) { const VarNode* v = op->node.as(); - CHECK(v); + ICHECK(v); storage_info_[v].is_volatile = true; } this->VisitStmt(op->body); @@ -596,8 +596,8 @@ void CodeGenSPIRV::VisitStmt_(const AssertStmtNode* op) { } void CodeGenSPIRV::VisitStmt_(const LetStmtNode* op) { - CHECK(!var_map_.count(op->var.get())); - CHECK(!op->var.dtype().is_handle()); + ICHECK(!var_map_.count(op->var.get())); + ICHECK(!op->var.dtype().is_handle()); var_map_[op->var.get()] = MakeValue(op->value); analyzer_->Bind(op->var, op->value); this->VisitStmt(op->body); diff --git a/src/target/spirv/codegen_spirv.h b/src/target/spirv/codegen_spirv.h index 9bf81095f066..be755641c8a5 100644 --- a/src/target/spirv/codegen_spirv.h +++ b/src/target/spirv/codegen_spirv.h @@ -116,7 +116,7 @@ class CodeGenSPIRV : public ExprFunctor, // Update content type if it hasn't been updated. void UpdateContentType(DataType type) { if (content_fixed) { - CHECK_EQ(type, content_type) << "Cannot use two different content type in GLSL model"; + ICHECK_EQ(type, content_type) << "Cannot use two different content type in GLSL model"; } else { this->content_type = type; content_fixed = true; diff --git a/src/target/spirv/intrin_rule_spirv.cc b/src/target/spirv/intrin_rule_spirv.cc index ea575ca83866..90b2eb2a671f 100644 --- a/src/target/spirv/intrin_rule_spirv.cc +++ b/src/target/spirv/intrin_rule_spirv.cc @@ -36,7 +36,7 @@ template inline void DispatchGLSLPureIntrin(const TVMArgs& targs, TVMRetValue* rv) { PrimExpr e = targs[0]; const tir::CallNode* call = e.as(); - CHECK(call != nullptr); + ICHECK(call != nullptr); Array cargs; // intrin id.
cargs.push_back(IntImm(DataType::UInt(32), id)); diff --git a/src/target/spirv/ir_builder.cc b/src/target/spirv/ir_builder.cc index 305464ac398b..273fc48c3e30 100644 --- a/src/target/spirv/ir_builder.cc +++ b/src/target/spirv/ir_builder.cc @@ -30,7 +30,7 @@ namespace spirv { // implementations void IRBuilder::InitHeader() { - CHECK_EQ(header_.size(), 0U); + ICHECK_EQ(header_.size(), 0U); header_.push_back(spv::MagicNumber); // Use the spirv version as indicated in the SDK. @@ -93,7 +93,7 @@ SType IRBuilder::GetSType(const DataType& dtype) { } SType IRBuilder::GetPointerType(const SType& value_type, spv::StorageClass storage_class) { - CHECK_NE(storage_class, spv::StorageClassMax); + ICHECK_NE(storage_class, spv::StorageClassMax); auto key = std::make_pair(value_type.id, storage_class); auto it = pointer_type_tbl_.find(key); if (it != pointer_type_tbl_.end()) { @@ -128,7 +128,7 @@ SType IRBuilder::GetStructArrayType(const SType& value_type, uint32_t num_elems) ib_.Begin(spv::OpTypeRuntimeArray).AddSeq(arr_type, value_type).Commit(&global_); } int nbits = value_type.type.bits() * value_type.type.lanes(); - CHECK_EQ(nbits % 8, 0); + ICHECK_EQ(nbits % 8, 0); uint32_t nbytes = static_cast(nbits) / 8; // decorate the array type. this->Decorate(spv::OpDecorate, arr_type, spv::DecorationArrayStride, nbytes); @@ -158,7 +158,7 @@ SType IRBuilder::GetStructArrayType(const SType& value_type, uint32_t num_elems) } Value IRBuilder::StructArrayAccess(const SType& res_type, Value buffer, Value index) { - CHECK(buffer.flag == kStructArrayPtr); + ICHECK(buffer.flag == kStructArrayPtr); return MakeValue(spv::OpInBoundsAccessChain, res_type, buffer, const_i32_zero_, index); } @@ -177,7 +177,7 @@ Value IRBuilder::FloatImm(const SType& dtype, double value) { uint64_t data = ptr[0]; return GetConst_(dtype, &data); } else { - CHECK_EQ(dtype.type.bits(), 16); + ICHECK_EQ(dtype.type.bits(), 16); return Cast(dtype, FloatImm(GetSType(DataType::Float(32)), value)); } } @@ -204,7 +204,7 @@ Value IRBuilder::BufferArgument(const SType& value_type, uint32_t descriptor_set } Value IRBuilder::DeclarePushConstant(const std::vector& value_types) { - CHECK_EQ(push_const_.id, 0); + ICHECK_EQ(push_const_.id, 0); SType struct_type; struct_type.id = id_counter_++; struct_type.type = DataType::Handle(); @@ -221,7 +221,7 @@ Value IRBuilder::DeclarePushConstant(const std::vector& value_types) { .Commit(&decorate_); DataType t = value_types[i].type; uint32_t nbits = t.bits() * t.lanes(); - CHECK_EQ(nbits % 8, 0); + ICHECK_EQ(nbits % 8, 0); offset += nbits / 8; } // Decorate push constants as UBO @@ -243,7 +243,7 @@ Value IRBuilder::GetPushConstant(Value ptr_push_const, const SType& v_type, uint Value IRBuilder::NewFunction() { return NewValue(t_void_func_, kFunction); } void IRBuilder::CommitKernelFunction(const Value& func, const std::string& name) { - CHECK_EQ(func.flag, kFunction); + ICHECK_EQ(func.flag, kFunction); ib_.Begin(spv::OpEntryPoint).AddSeq(spv::ExecutionModelGLCompute, func, name); if (workgroup_id_.id != 0) { ib_.Add(workgroup_id_); @@ -255,7 +255,7 @@ void IRBuilder::CommitKernelFunction(const Value& func, const std::string& name) } void IRBuilder::StartFunction(const Value& func) { - CHECK_EQ(func.flag, kFunction); + ICHECK_EQ(func.flag, kFunction); // add function declaration to the header. 
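// The ib_.Begin(spv::OpFunction) call that follows emits the standard SPIR-V
// function skeleton: OpFunction (result type, result id, function control,
// function type), a leading OpLabel, and eventually OpFunctionEnd. A toy
// encoder of that skeleton, assuming the spec's word layout (high half word
// count, low half opcode) and opcode numbers (OpFunction=54, OpFunctionEnd=56,
// OpLabel=248); the ids below are made up.
#include <cstdint>
#include <cstdio>
#include <vector>

uint32_t Word(uint16_t word_count, uint16_t opcode) {
  return (static_cast<uint32_t>(word_count) << 16) | opcode;
}

int main() {
  std::vector<uint32_t> func;
  func.push_back(Word(5, 54));   // OpFunction %void None %void_fn -> %fn
  func.push_back(1);             // result type id (%void)
  func.push_back(4);             // result id (%fn)
  func.push_back(0);             // function control: None
  func.push_back(2);             // function type id (%void_fn)
  func.push_back(Word(2, 248));  // OpLabel -> %entry
  func.push_back(5);             // result id (%entry)
  func.push_back(Word(1, 56));   // OpFunctionEnd
  std::printf("%zu words\n", func.size());  // 8 words
}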
ib_.Begin(spv::OpFunction).AddSeq(t_void_, func, 0, t_void_func_).Commit(&func_header_); @@ -265,7 +265,7 @@ void IRBuilder::StartFunction(const Value& func) { } void IRBuilder::SetLocalSize(const Value& func, uint32_t local_size[3]) { - CHECK_EQ(func.flag, kFunction); + ICHECK_EQ(func.flag, kFunction); ib_.Begin(spv::OpExecutionMode) .AddSeq(func, spv::ExecutionModeLocalSize, local_size[0], local_size[1], local_size[2]) .Commit(&exec_mode_); @@ -273,7 +273,7 @@ void IRBuilder::SetLocalSize(const Value& func, uint32_t local_size[3]) { Value IRBuilder::Allocate(const SType& value_type, uint32_t num_elems, spv::StorageClass storage_class) { - CHECK_NE(num_elems, 0U); + ICHECK_NE(num_elems, 0U); SType sarr_type = GetStructArrayType(value_type, num_elems); SType ptr_type = GetPointerType(sarr_type, storage_class); Value val = NewValue(ptr_type, kStructArrayPtr); @@ -322,7 +322,7 @@ Value IRBuilder::GetConst_(const SType& dtype, const uint64_t* pvalue) { if (it != const_tbl_.end()) { return it->second; } - CHECK_LE(dtype.type.bits(), 64); + ICHECK_LE(dtype.type.bits(), 64); Value ret = NewValue(dtype, kConstant); if (dtype.type == DataType::UInt(1)) { // bool types. @@ -357,7 +357,7 @@ SType IRBuilder::DeclareType(const DataType& dtype) { t.id = id_counter_++; t.type = dtype; if (dtype.bits() == 1) { - CHECK(dtype.is_uint()); + ICHECK(dtype.is_uint()); ib_.Begin(spv::OpTypeBool).Add(t).Commit(&global_); } else if (dtype.is_int()) { ib_.Begin(spv::OpTypeInt).AddSeq(t, dtype.bits(), 1).Commit(&global_); @@ -390,7 +390,7 @@ PhiValue IRBuilder::MakePhi(const SType& out_type, uint32_t num_incoming) { phi.stype = out_type; phi.flag = kNormal; phi.instr = ib_.Commit(&function_); - CHECK_EQ(phi.instr.WordCount(), 2 * num_incoming + 3); + ICHECK_EQ(phi.instr.WordCount(), 2 * num_incoming + 3); return phi; } @@ -410,7 +410,7 @@ Value IRBuilder::Concat(const std::vector& vec) { DataType etype = vec[0].stype.type; int lanes = etype.lanes(); for (size_t i = 1; i < vec.size(); ++i) { - CHECK_EQ(etype, vec[i].stype.type.element_of()) + ICHECK_EQ(etype, vec[i].stype.type.element_of()) << "Cannot concat vector of different element type"; lanes += vec[i].stype.type.lanes(); is_const = is_const && (vec[i].flag == kConstant); @@ -435,11 +435,11 @@ Value IRBuilder::Concat(const std::vector& vec) { } Value IRBuilder::Cast(const SType& dst_type, spirv::Value value) { - CHECK_NE(value.stype.id, 0U); + ICHECK_NE(value.stype.id, 0U); if (value.stype.id == dst_type.id) return value; const tvm::DataType& from = value.stype.type; const tvm::DataType& to = dst_type.type; - CHECK_EQ(from.lanes(), to.lanes()); + ICHECK_EQ(from.lanes(), to.lanes()); if (from == DataType::Bool()) { if (to.is_int()) { return Select(value, IntImm(dst_type, 1), IntImm(dst_type, 0)); @@ -493,24 +493,24 @@ Value IRBuilder::Cast(const SType& dst_type, spirv::Value value) { #define DEFINE_BUILDER_BINARY_USIGN_OP(_OpName, _Op) \ Value IRBuilder::_OpName(Value a, Value b) { \ - CHECK_EQ(a.stype.id, b.stype.id); \ + ICHECK_EQ(a.stype.id, b.stype.id); \ if (a.stype.type.is_int() || a.stype.type.is_uint()) { \ return MakeValue(spv::OpI##_Op, a.stype, a, b); \ } else { \ - CHECK(a.stype.type.is_float()); \ + ICHECK(a.stype.type.is_float()); \ return MakeValue(spv::OpF##_Op, a.stype, a, b); \ } \ } #define DEFINE_BUILDER_BINARY_SIGN_OP(_OpName, _Op) \ Value IRBuilder::_OpName(Value a, Value b) { \ - CHECK_EQ(a.stype.id, b.stype.id); \ + ICHECK_EQ(a.stype.id, b.stype.id); \ if (a.stype.type.is_int()) { \ return MakeValue(spv::OpS##_Op, a.stype, a, b); \ } 
else if (a.stype.type.is_uint()) { \ return MakeValue(spv::OpU##_Op, a.stype, a, b); \ } else { \ - CHECK(a.stype.type.is_float()); \ + ICHECK(a.stype.type.is_float()); \ return MakeValue(spv::OpF##_Op, a.stype, a, b); \ } \ } @@ -521,28 +521,28 @@ DEFINE_BUILDER_BINARY_USIGN_OP(Mul, Mul); DEFINE_BUILDER_BINARY_SIGN_OP(Div, Div); Value IRBuilder::Mod(Value a, Value b) { - CHECK_EQ(a.stype.id, b.stype.id); + ICHECK_EQ(a.stype.id, b.stype.id); if (a.stype.type.is_int()) { return MakeValue(spv::OpSRem, a.stype, a, b); } else if (a.stype.type.is_uint()) { return MakeValue(spv::OpUMod, a.stype, a, b); } else { - CHECK(a.stype.type.is_float()); + ICHECK(a.stype.type.is_float()); return MakeValue(spv::OpFRem, a.stype, a, b); } } #define DEFINE_BUILDER_CMP_OP(_OpName, _Op) \ Value IRBuilder::_OpName(Value a, Value b) { \ - CHECK_EQ(a.stype.id, b.stype.id); \ - CHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes()); \ + ICHECK_EQ(a.stype.id, b.stype.id); \ + ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes()); \ const auto& bool_type = this->GetSType(DataType::UInt(1).with_lanes(a.stype.type.lanes())); \ if (a.stype.type.is_int()) { \ return MakeValue(spv::OpS##_Op, bool_type, a, b); \ } else if (a.stype.type.is_uint()) { \ return MakeValue(spv::OpU##_Op, bool_type, a, b); \ } else { \ - CHECK(a.stype.type.is_float()); \ + ICHECK(a.stype.type.is_float()); \ return MakeValue(spv::OpFOrd##_Op, bool_type, a, b); \ } \ } @@ -554,13 +554,13 @@ DEFINE_BUILDER_CMP_OP(GE, GreaterThanEqual); #define DEFINE_BUILDER_CMP_UOP(_OpName, _Op) \ Value IRBuilder::_OpName(Value a, Value b) { \ - CHECK_EQ(a.stype.id, b.stype.id); \ - CHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes()); \ + ICHECK_EQ(a.stype.id, b.stype.id); \ + ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes()); \ const auto& bool_type = this->GetSType(DataType::UInt(1).with_lanes(a.stype.type.lanes())); \ if (a.stype.type.is_int() || a.stype.type.is_uint()) { \ return MakeValue(spv::OpI##_Op, bool_type, a, b); \ } else { \ - CHECK(a.stype.type.is_float()); \ + ICHECK(a.stype.type.is_float()); \ return MakeValue(spv::OpFOrd##_Op, bool_type, a, b); \ } \ } @@ -569,8 +569,8 @@ DEFINE_BUILDER_CMP_UOP(EQ, Equal); DEFINE_BUILDER_CMP_UOP(NE, NotEqual); Value IRBuilder::Select(Value cond, Value a, Value b) { - CHECK_EQ(a.stype.id, b.stype.id); - CHECK_EQ(cond.stype.type.element_of(), DataType::UInt(1)); + ICHECK_EQ(a.stype.id, b.stype.id); + ICHECK_EQ(cond.stype.type.element_of(), DataType::UInt(1)); return MakeValue(spv::OpSelect, a.stype, cond, a, b); } diff --git a/src/target/spirv/ir_builder.h b/src/target/spirv/ir_builder.h index c52f92fd7c20..8a08048e1955 100644 --- a/src/target/spirv/ir_builder.h +++ b/src/target/spirv/ir_builder.h @@ -93,7 +93,7 @@ class Instr { * \return reference to idx-th word. */ uint32_t& operator[](uint32_t idx) { - CHECK_LT(idx, word_count_); + ICHECK_LT(idx, word_count_); return (*data_)[begin_ + idx]; } @@ -122,7 +122,7 @@ struct PhiValue : public Value { * \param parent The parent label. 
*/ void SetIncoming(uint32_t index, const Value& value, const Label& parent) { - CHECK_EQ(this->stype.id, value.stype.id); + ICHECK_EQ(this->stype.id, value.stype.id); instr[3 + index * 2] = value.id; instr[3 + index * 2 + 1] = parent.id; } @@ -152,7 +152,7 @@ class InstrBuilder { */ InstrBuilder& Begin(spv::Op op) { // NOLINT(*); // finish previous build - CHECK_EQ(data_.size(), 0U); + ICHECK_EQ(data_.size(), 0U); op_ = op; data_.push_back(0); return *this; diff --git a/src/target/stackvm/codegen_stackvm.cc b/src/target/stackvm/codegen_stackvm.cc index ac3ba78fa4d5..0dd96e07ed96 100644 --- a/src/target/stackvm/codegen_stackvm.cc +++ b/src/target/stackvm/codegen_stackvm.cc @@ -75,12 +75,12 @@ StackVM::StructFieldKind MapFieldKind(int64_t kind) { } StackVM CodeGenStackVM::Compile(const PrimFunc& f) { - CHECK_EQ(f->buffer_map.size(), 0U) + ICHECK_EQ(f->buffer_map.size(), 0U) << "Cannot codegen function with buffer_map, please lower them first"; for (size_t i = 0; i < f->params.size(); ++i) { Var v = f->params[i]; int vid = AllocVarID(v.get()); - CHECK_EQ(static_cast(vid), i); + ICHECK_EQ(static_cast(vid), i); } this->Push(f->body); vm_.InitCache(); @@ -101,7 +101,7 @@ void CodeGenStackVM::PushOp(StackVM::OpCode opcode) { } void CodeGenStackVM::SetOperand(int64_t operand_index, int64_t operand) { - CHECK(operand >= std::numeric_limits::min() && operand <= std::numeric_limits::max()); + ICHECK(operand >= std::numeric_limits::min() && operand <= std::numeric_limits::max()); vm_.code.at(operand_index).v_int = static_cast(operand); } @@ -125,9 +125,9 @@ int CodeGenStackVM::GetStrID(const std::string& key) { } int CodeGenStackVM::AllocVarID(const VarNode* v) { - CHECK(!var_idmap_.count(v)); + ICHECK(!var_idmap_.count(v)); int vid = static_cast(vm_.heap_size); - CHECK_EQ(vm_.heap_size, var_idmap_.size()); + ICHECK_EQ(vm_.heap_size, var_idmap_.size()); vm_.heap_id_name.push_back(v->name_hint); ++vm_.heap_size; var_idmap_[v] = vid; @@ -136,7 +136,7 @@ int CodeGenStackVM::AllocVarID(const VarNode* v) { int CodeGenStackVM::GetVarID(const VarNode* v) const { auto it = var_idmap_.find(v); - CHECK(it != var_idmap_.end()) << "Find undefined Variable " << v->name_hint; + ICHECK(it != var_idmap_.end()) << "Find undefined Variable " << v->name_hint; return it->second; } @@ -177,7 +177,7 @@ void CodeGenStackVM::VisitStmt_(const AllocateNode* op) { void CodeGenStackVM::VisitExpr_(const CallNode* op) { if (op->op.same_as(builtin::address_of())) { const LoadNode* l = op->args[0].as(); - CHECK(op->args.size() == 1 && l); + ICHECK(op->args.size() == 1 && l); this->PushOp(StackVM::LOAD_HEAP, GetVarID(l->buffer_var.get())); this->Push(l->index); this->PushOp(StackVM::PUSH_I64, l->dtype.element_of().bytes()); @@ -186,11 +186,11 @@ void CodeGenStackVM::VisitExpr_(const CallNode* op) { } else if (op->op.same_as(builtin::reinterpret())) { this->Push(op->args[0]); } else if (op->op.same_as(builtin::tvm_struct_get())) { - CHECK_EQ(op->args.size(), 3U); + ICHECK_EQ(op->args.size(), 3U); int kind = op->args[2].as()->value; this->Push(op->args[0]); const IntImmNode* index = op->args[1].as(); - CHECK(index != nullptr); + ICHECK(index != nullptr); StackVM::Code code; code.op_code = StackVM::TVM_STRUCT_GET; vm_.code.push_back(code); @@ -199,9 +199,9 @@ void CodeGenStackVM::VisitExpr_(const CallNode* op) { code.v_int = MapFieldKind(kind); vm_.code.push_back(code); } else if (op->op.same_as(builtin::tvm_call_packed_lowered())) { - CHECK_GE(op->args.size(), 5U); + ICHECK_GE(op->args.size(), 5U); const StringImmNode* s = 
op->args[0].as(); - CHECK(s != nullptr) << "tvm_call_global expect first argument as function name"; + ICHECK(s != nullptr) << "tvm_call_global expect first argument as function name"; this->Push(op->args[1]); this->Push(op->args[2]); int begin = op->args[3].as()->value; @@ -228,10 +228,10 @@ void CodeGenStackVM::VisitExpr_(const CallNode* op) { code.v_int = end; vm_.code.push_back(code); } else if (op->op.same_as(builtin::tvm_stack_alloca())) { - CHECK_EQ(op->args.size(), 2U); + ICHECK_EQ(op->args.size(), 2U); const std::string& type = op->args[0].as()->value; const IntImmNode* num = op->args[1].as(); - CHECK(num != nullptr); + ICHECK(num != nullptr); static_assert(alignof(TVMValue) % alignof(DLTensor) == 0, "invariant"); // static_assert(alignof(TVMValue) % alignof(tvm_index_t) == 0, "invariant"); size_t unit = sizeof(TVMValue); @@ -251,7 +251,7 @@ void CodeGenStackVM::VisitExpr_(const CallNode* op) { vm_.stack_size += size; this->PushOp(StackVM::TVM_STACK_ALLOCA_BY_8BYTE, static_cast(size)); } else if (op->op.same_as(backend_alloc_workspace_op_)) { - CHECK_EQ(op->args.size(), 5U); + ICHECK_EQ(op->args.size(), 5U); this->Push(op->args[0]); this->Push(op->args[1]); this->Push(op->args[2]); @@ -259,7 +259,7 @@ void CodeGenStackVM::VisitExpr_(const CallNode* op) { this->Push(op->args[4]); this->PushOp(StackVM::TVM_DEVICE_ALLOCA); } else if (op->op.same_as(backend_free_workspace_op_)) { - CHECK_EQ(op->args.size(), 3U); + ICHECK_EQ(op->args.size(), 3U); this->Push(op->args[0]); this->Push(op->args[1]); this->Push(op->args[2]); @@ -267,7 +267,7 @@ void CodeGenStackVM::VisitExpr_(const CallNode* op) { } else if (op->op.same_as(builtin::tvm_throw_last_error())) { this->PushOp(StackVM::TVM_THROW_LAST_ERROR); } else if (op->op.same_as(builtin::isnullptr())) { - CHECK_EQ(op->args.size(), 1U); + ICHECK_EQ(op->args.size(), 1U); this->Push(op->args[0]); this->PushOp(StackVM::PUSH_I64, 0); this->PushOp(StackVM::EQ_HANDLE); @@ -305,8 +305,8 @@ void CodeGenStackVM::VisitExpr_(const StringImmNode* op) { } void CodeGenStackVM::VisitExpr_(const IntImmNode* op) { - CHECK(op->value >= std::numeric_limits::min() && - op->value <= std::numeric_limits::max()) + ICHECK(op->value >= std::numeric_limits::min() && + op->value <= std::numeric_limits::max()) << "Int constant exceed bound"; this->PushOp(StackVM::PUSH_I64, static_cast(op->value)); } @@ -399,7 +399,7 @@ void CodeGenStackVM::VisitExpr_(const NotNode* op) { } void CodeGenStackVM::VisitStmt_(const ForNode* op) { - CHECK(is_zero(op->min)); + ICHECK(is_zero(op->min)); int vid = this->AllocVarID(op->loop_var.get()); this->PushOp(StackVM::PUSH_I64, 0); int64_t loop_head = this->GetPC(); @@ -432,11 +432,11 @@ void CodeGenStackVM::VisitStmt_(const EvaluateNode* ev) { if (is_const_int(ev->value)) return; const CallNode* op = ev->value.as(); if (op && op->op.same_as(builtin::tvm_struct_set())) { - CHECK_EQ(op->args.size(), 4U); + ICHECK_EQ(op->args.size(), 4U); this->Push(op->args[0]); this->Push(op->args[3]); const IntImmNode* index = op->args[1].as(); - CHECK(index != nullptr); + ICHECK(index != nullptr); StackVM::Code code; code.op_code = StackVM::TVM_STRUCT_SET; vm_.code.push_back(code); @@ -515,14 +515,14 @@ runtime::Module BuildStackVM(IRModule mod, Target target) { std::string entry_func; for (auto kv : mod->functions) { - CHECK(kv.second->IsInstance()) << "CodeGenStackVM: Can only take PrimFunc"; + ICHECK(kv.second->IsInstance()) << "CodeGenStackVM: Can only take PrimFunc"; auto f = Downcast(kv.second); auto global_symbol = 
f->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) + ICHECK(global_symbol.defined()) << "CodeGenStackVM: Expect PrimFunc to have the global_symbol attribute"; std::string f_name = global_symbol.value(); StackVM vm = codegen::CodeGenStackVM().Compile(f); - CHECK(!fmap.count(f_name)) << "Function name " << f_name << "already exist in list"; + ICHECK(!fmap.count(f_name)) << "Function name " << f_name << "already exist in list"; fmap[f_name] = std::move(vm); if (f->HasNonzeroAttr(tir::attr::kIsEntryFunc)) { diff --git a/src/target/tag.cc b/src/target/tag.cc index 3e47e456691a..8198435a9494 100644 --- a/src/target/tag.cc +++ b/src/target/tag.cc @@ -60,7 +60,7 @@ Map TargetTag::ListTags() { Target TargetTag::AddTag(String name, Map config, bool override) { TargetTagRegEntry& tag = TargetTagRegEntry::RegisterOrGet(name).set_name(); - CHECK(override || tag.tag_->config.empty()) + ICHECK(override || tag.tag_->config.empty()) << "Tag \"" << name << "\" has been previously defined as: " << tag.tag_->config; tag.set_config(config); return Target(config); diff --git a/src/target/target.cc b/src/target/target.cc index 052824249392..e44a15c3ff59 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -421,8 +421,8 @@ void Target::EnterWithScope() { void Target::ExitWithScope() { TVMTargetThreadLocalEntry* entry = TVMTargetThreadLocalStore::Get(); - CHECK(!entry->context_stack.empty()); - CHECK(entry->context_stack.top().same_as(*this)); + ICHECK(!entry->context_stack.empty()); + ICHECK(entry->context_stack.top().same_as(*this)); entry->context_stack.pop(); } @@ -431,7 +431,7 @@ Target Target::Current(bool allow_not_defined) { if (entry->context_stack.size() > 0) { return entry->context_stack.top(); } - CHECK(allow_not_defined) + ICHECK(allow_not_defined) << "Target context required. Please set it by constructing a TargetContext"; return Target(); @@ -473,8 +473,8 @@ ObjectPtr TargetInternal::FromString(const String& tag_or_config_or_targ ObjectPtr TargetInternal::FromConfigString(const String& config_str) { const auto* loader = tvm::runtime::Registry::Get("target._load_config_dict"); - CHECK(loader) << "AttributeError: \"target._load_config_dict\" is not registered. Please check " - "if the python module is properly loaded"; + ICHECK(loader) << "AttributeError: \"target._load_config_dict\" is not registered. 
Please check " + "if the python module is properly loaded"; Optional> config = (*loader)(config_str); if (!config.defined()) { throw dmlc::Error(": Cannot load config dict with python JSON loader"); diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index b5d2bf7ceb85..017ba396f861 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -122,7 +122,7 @@ void CheckOrSetAttr(Map* attrs, const String& name, const Str attrs->Set(name, value); } else { const auto* str = (*iter).second.as(); - CHECK(str != nullptr && GetRef(str) == value) + ICHECK(str != nullptr && GetRef(str) == value) << "ValueError: Expects \"" << name << "\" to be \"" << value << "\", but gets: " << (*iter).second; } @@ -143,7 +143,7 @@ Map UpdateNVPTXAttrs(Map attrs) { // If -mcpu has been specified, validate the correctness String mcpu = Downcast(attrs.at("mcpu")); arch = ExtractIntWithPrefix(mcpu, "sm_"); - CHECK(arch != -1) << "ValueError: NVPTX target gets an invalid CUDA arch: -mcpu=" << mcpu; + ICHECK(arch != -1) << "ValueError: NVPTX target gets an invalid CUDA arch: -mcpu=" << mcpu; } else { // Use the compute version of the first CUDA GPU instead TVMRetValue version; @@ -170,7 +170,7 @@ Map UpdateROCmAttrs(Map attrs) { if (attrs.count("mcpu")) { String mcpu = Downcast(attrs.at("mcpu")); arch = ExtractIntWithPrefix(mcpu, "gfx"); - CHECK(arch != -1) << "ValueError: ROCm target gets an invalid GFX version: -mcpu=" << mcpu; + ICHECK(arch != -1) << "ValueError: ROCm target gets an invalid GFX version: -mcpu=" << mcpu; } else { TVMRetValue val; if (!DetectDeviceFlag({kDLROCM, 0}, runtime::kGcnArch, &val)) { diff --git a/src/te/autodiff/ad_simplify.cc b/src/te/autodiff/ad_simplify.cc index 81df5c9d6b42..cc0e82066171 100644 --- a/src/te/autodiff/ad_simplify.cc +++ b/src/te/autodiff/ad_simplify.cc @@ -97,8 +97,8 @@ Array IterVarsFromMap(const Array& vars, const Map& vr IterVarType iter_type = kDataPar, std::string thread_tag = "") { Array res; for (const Var& v : vars) { - CHECK(vranges.count(v)) << "A range for the variable " << v << " was not provided in map " - << vranges; + ICHECK(vranges.count(v)) << "A range for the variable " << v << " was not provided in map " + << vranges; res.push_back(IterVar(vranges[v], v, iter_type, thread_tag)); } return res; @@ -478,7 +478,7 @@ class FactorOutAtomicFormulasFunctor // and a non-atomic residual. Atomic formulas are consts, calls, variables and comparisons (a <= b, // etc), i.e. formulas which are not logical operators (||, &&, !) on the top level. 
FactorOutAtomicFormulasResult FactorOutAtomicFormulas(const PrimExpr& e) { - CHECK(e.dtype().is_bool()); + ICHECK(e.dtype().is_bool()); return FactorOutAtomicFormulasFunctor().VisitExpr(e); } @@ -494,7 +494,7 @@ inline PrimExpr ModImpl(PrimExpr a, PrimExpr b, DivMode mode) { if (mode == kTruncDiv) { return truncmod(a, b); } else { - CHECK_EQ(mode, kFloorDiv); + ICHECK_EQ(mode, kFloorDiv); return floormod(a, b); } } @@ -503,7 +503,7 @@ inline PrimExpr DivImpl(PrimExpr a, PrimExpr b, DivMode mode) { if (mode == kTruncDiv) { return truncdiv(a, b); } else { - CHECK_EQ(mode, kFloorDiv); + ICHECK_EQ(mode, kFloorDiv); return floordiv(a, b); } } @@ -817,7 +817,7 @@ PrimExpr SimplifyReductionDomain(const PrimExpr& expr, const Map& ou // Extract from cond an implication of cond not containing vars std::pair ImplicationNotContainingVars( const PrimExpr& cond, const std::unordered_set& vars) { - CHECK(cond.dtype().is_bool()) << "The type of cond must be bool"; + ICHECK(cond.dtype().is_bool()) << "The type of cond must be bool"; // TODO(sgrechanik-h): NOTs could be pushed down using De Morgan laws // before running this function but this case didn't seem to be important enough. if (const AndNode* op = cond.as()) { @@ -938,7 +938,7 @@ class RemoveRedundantInequalitiesMutator : public ExprMutator { virtual PrimExpr VisitExpr_(const ReduceNode* op) { Array known_with_axes = known_; - CHECK(op->init.empty()) << "Derivative of Reduction with initialization is not implemented"; + ICHECK(op->init.empty()) << "Derivative of Reduction with initialization is not implemented"; for (const PrimExpr& axis_cond : IterVarsToInequalities(op->axis)) { known_with_axes.push_back(axis_cond); } @@ -1011,7 +1011,7 @@ PrimExpr TrySimplifyCompute(const PrimExpr& expr, const PrimExpr& cond, Array used_res_variables; for (const Var& var : res->dst->variables) { if (ExprUseVar(new_expr, var)) { - CHECK(res->dst->ranges.count(var)) << "Range of " << var << " cannot be inferred."; + ICHECK(res->dst->ranges.count(var)) << "Range of " << var << " cannot be inferred."; used_res_variables.push_back(var); } } @@ -1031,7 +1031,7 @@ PrimExpr TrySimplifyCompute(const PrimExpr& expr, const PrimExpr& cond, // Compute volumes before and after PrimExpr old_volume = make_const(DataType::Int(64), 1); for (const Var& var : outer_axis) { - CHECK(vranges.count(var)) << "Range of " << var << " was not provided."; + ICHECK(vranges.count(var)) << "Range of " << var << " was not provided."; old_volume = old_volume * vranges[var]->extent; } @@ -1069,7 +1069,7 @@ class ReductionAsTensorAccessMutator : public ExprMutator { ReductionAsTensorAccessMutator new_mutator(Concat(IterVarsToVars(op->axis), outer_axis_), Merge(vranges_, IterVarsToMap(op->axis)), name_); - CHECK(op->init.empty()) << "Derivative of Reduction with initialization is not implemented"; + ICHECK(op->init.empty()) << "Derivative of Reduction with initialization is not implemented"; Array new_source; for (const PrimExpr& src : op->source) { new_source.push_back(new_mutator(src)); @@ -1152,7 +1152,7 @@ PrimExpr RemoveJacobianAndLiftNonzeroCondImpl(const PrimExpr& expr_orig, const A PrimExpr expr = analyzer.Simplify(expr_orig, kSimplifyRewriteCanonicalRewrite); if (const ReduceNode* red = expr.as()) { - CHECK(red->init.empty()) << "Derivative of Reduction with initialization is not implemented"; + ICHECK(red->init.empty()) << "Derivative of Reduction with initialization is not implemented"; // TODO(sgrechanik-h): There are some other operations which behave like sum bool is_sum = 
IsSumCombiner(red->combiner, vranges); if (is_sum || CanFactorZeroFromCombiner(red->combiner, red->value_index, vranges)) { diff --git a/src/te/autodiff/jacobian.cc b/src/te/autodiff/jacobian.cc index ba03ba08febd..7104424957af 100644 --- a/src/te/autodiff/jacobian.cc +++ b/src/te/autodiff/jacobian.cc @@ -82,7 +82,7 @@ class JacobianMutator : public ExprMutator { auto tensor = Downcast(op->producer); if (input_.get() && tensor == input_) { // Tensor(indices) - CHECK_EQ(indices_.size(), op->indices.size()); + ICHECK_EQ(indices_.size(), op->indices.size()); PrimExpr condition = const_true(); for (size_t i = 0; i < input_.ndim(); ++i) { condition = And(condition, EQ(indices_[i], op->indices[i])); @@ -181,7 +181,8 @@ class JacobianMutator : public ExprMutator { PrimExpr expr_with_new_axes = te::CloneReduction(GetRef(op)); const ReduceNode* new_op = expr_with_new_axes.as(); - CHECK(new_op->init.empty()) << "Derivative of Reduction with initialization is not implemented"; + ICHECK(new_op->init.empty()) + << "Derivative of Reduction with initialization is not implemented"; // New lhs and rhs variables of the new combiner consist of // variables representing derivatives (which are later derived from new_op->source) @@ -303,7 +304,7 @@ PrimExpr Jacobian(const PrimExpr& expr, const Tensor& input, const Arrayop.as(); - CHECK(op) << "Derivative of this operation is not implemented: " << output->op; + ICHECK(op) << "Derivative of this operation is not implemented: " << output->op; bool is_input_tensor = false; for (const Tensor& child : op->InputTensors()) { if (input == child) { @@ -311,8 +312,8 @@ Tensor Jacobian(const Tensor& output, const Tensor& input) { break; } } - CHECK(is_input_tensor) << "Jacobian is called on a pair of tensors such that the output " - << "does not directly depend on the input."; + ICHECK(is_input_tensor) << "Jacobian is called on a pair of tensors such that the output " + << "does not directly depend on the input."; // We have to clone the iteration axes because otherwise the original expression // cannot be used together with the derivative (it will lead to errors during lowering) diff --git a/src/te/operation/compute_op.cc b/src/te/operation/compute_op.cc index 64995761524b..3b225760d75d 100644 --- a/src/te/operation/compute_op.cc +++ b/src/te/operation/compute_op.cc @@ -74,12 +74,12 @@ Array BaseComputeOpNode::root_iter_vars() const { } DataType ComputeOpNode::output_dtype(size_t idx) const { - CHECK_LT(idx, num_outputs()); + ICHECK_LT(idx, num_outputs()); return body[idx].dtype(); } Array BaseComputeOpNode::output_shape(size_t idx) const { - CHECK_LT(idx, num_outputs()); + ICHECK_LT(idx, num_outputs()); // for now, all outputs of a BaseComputeOp have the same shape Array shape; for (const auto& ivar : this->axis) { @@ -170,7 +170,7 @@ Array ComputeOpNode::InputTensors() const { Operation ComputeOpNode::ReplaceInputs(const Operation& self, const std::unordered_map& rmap) const { - CHECK_EQ(self.operator->(), this); + ICHECK_EQ(self.operator->(), this); VerifyComputeOp(this); Array arr; if (this->body[0]->IsInstance()) { @@ -202,7 +202,7 @@ Operation ComputeOpNode::ReplaceInputs(const Operation& self, void ComputeOpNode::PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer, const std::unordered_map& dom_map, std::unordered_map* out_dom_map) const { - CHECK_EQ(self.operator->(), this); + ICHECK_EQ(self.operator->(), this); auto fvisit = [&dom_map, out_dom_map, analyzer](const ObjectRef& n) { if (auto* pload = n.as()) { Tensor t = Downcast(pload->producer); @@ 
-245,15 +245,15 @@ void ComputeOpNode::PropBoundToInputs(const Operation& self, arith::Analyzer* an void BaseComputeOpNode::GatherBound(const Operation& self, const std::unordered_map& tensor_dom, std::unordered_map* out_dom_map) const { - CHECK_EQ(self.operator->(), this); + ICHECK_EQ(self.operator->(), this); const TensorDom& tdom = tensor_dom.at(self.output(0)); for (size_t i = 0; i < this->axis.size(); ++i) { Range r = arith::Union(tdom.data.at(i)).CoverRange(this->axis[i]->dom); - CHECK(!out_dom_map->count(this->axis[i])); + ICHECK(!out_dom_map->count(this->axis[i])); (*out_dom_map)[this->axis[i]] = r; } for (size_t i = 0; i < this->reduce_axis.size(); ++i) { - CHECK(!out_dom_map->count(this->reduce_axis[i])); + ICHECK(!out_dom_map->count(this->reduce_axis[i])); (*out_dom_map)[this->reduce_axis[i]] = this->reduce_axis[i]->dom; } } @@ -261,7 +261,7 @@ void BaseComputeOpNode::GatherBound(const Operation& self, Stmt BaseComputeOpNode::BuildRealize(const Stage& stage, const std::unordered_map& realize_map, const Stmt& body) const { - CHECK_EQ(stage->op.get(), this); + ICHECK_EQ(stage->op.get(), this); Region bounds; for (IterVar iv : this->axis) { bounds.push_back(realize_map.at(iv)); @@ -301,9 +301,9 @@ void MakeReduction(const ComputeOpNode* op, const Array& tensors, Stmt* size_t size = op->body.size(); const ReduceNode* reduce = op->body[0].as(); - CHECK(reduce); + ICHECK(reduce); const CommReducerNode* combiner = reduce->combiner.as(); - CHECK(combiner); + ICHECK(combiner); Array lhs; for (size_t i = 0; i < size; ++i) { lhs.push_back(tensors[i](args)); @@ -405,11 +405,11 @@ ComputeType DetectComputeType(const ComputeOpNode* self, const Stage& stage) { ++normal_red; } } else { - CHECK_EQ(thread_red, 0) << "Cross thread reduce cannot swap with normal data axis"; + ICHECK_EQ(thread_red, 0) << "Cross thread reduce cannot swap with normal data axis"; } } if (tensorize != 0) { - CHECK(thread_red == 0) << "Cannot mix cross thread reduction with Tensorize"; + ICHECK(thread_red == 0) << "Cannot mix cross thread reduction with Tensorize"; return ComputeType::kTensorize; } if (thread_red != 0) { @@ -423,7 +423,7 @@ ComputeType DetectComputeType(const ComputeOpNode* self, const Stage& stage) { Stmt ComputeOpNode::BuildProvide(const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) const { - CHECK_EQ(stage->op.operator->(), this); + ICHECK_EQ(stage->op.operator->(), this); ComputeType ctype = DetectComputeType(this, stage); if (ctype == ComputeType::kCrossThreadReduction) { // specially handle cross thread reduction. @@ -438,7 +438,7 @@ Stmt ComputeOpNode::BuildProvide(const Stage& stage, ComputeLoopNest ComputeLoopNest::Create(const BaseComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) { - CHECK_EQ(stage->op.operator->(), self); + ICHECK_EQ(stage->op.operator->(), self); ComputeLoopNest ret; // make main loop nest ret.main_nest = MakeLoopNest(stage, dom_map, 0, false, std::unordered_set(), @@ -489,7 +489,7 @@ ComputeLoopNest ComputeLoopNest::Create(const BaseComputeOpNode* self, const Sta e = likely(e); } } else { - CHECK_EQ(ret.main_nest.size(), stage->leaf_iter_vars.size() + 1); + ICHECK_EQ(ret.main_nest.size(), stage->leaf_iter_vars.size() + 1); ret.num_common_loop = stage->leaf_iter_vars.size(); } // copy elison here. 
@@ -524,12 +524,12 @@ class ComputeVerifier final : protected tir::ExprVisitor { for (const PrimExpr e : compute_->body) { // Check for consistency of top level reductions const tir::ReduceNode* reduce = e.as(); - CHECK((reduce && reduce_) || (!reduce && !reduce_)) << "All ComputeOp should be consistent " - << "with being Reduce operation or not."; + ICHECK((reduce && reduce_) || (!reduce && !reduce_)) << "All ComputeOp should be consistent " + << "with being Reduce operation or not."; if (reduce && reduce_) { - CHECK(ReduceEqual(reduce, reduce_)) << "The Reduce inputs of ComputeOp should " - << "have the same attribute except value_index"; + ICHECK(ReduceEqual(reduce, reduce_)) << "The Reduce inputs of ComputeOp should " + << "have the same attribute except value_index"; } level_ = 0; @@ -548,8 +548,8 @@ class ComputeVerifier final : protected tir::ExprVisitor { void VisitExpr_(const tir::ReduceNode* op) final { // Check for non top level reductions - CHECK(0 == level_) << "Reductions are only allowed at the top level of compute. " - << "Please create another tensor for further composition."; + ICHECK(0 == level_) << "Reductions are only allowed at the top level of compute. " + << "Please create another tensor for further composition."; } //@} @@ -581,7 +581,7 @@ Stmt TransformUpdate(const Stage& stage, const std::unordered_mapiter_type == kCommReduce) { auto vit = dom_map.find(iv); - CHECK(vit != dom_map.end()); + ICHECK(vit != dom_map.end()); const Range& vrange = vit->second; conds.push_back(likely(iv->var > vrange->min)); banned.insert(iv->var.get()); diff --git a/src/te/operation/cross_thread_reduction.cc b/src/te/operation/cross_thread_reduction.cc index 6aba9ab500b6..b0fb9b667558 100644 --- a/src/te/operation/cross_thread_reduction.cc +++ b/src/te/operation/cross_thread_reduction.cc @@ -92,12 +92,13 @@ Stmt MakeCrossThreadReduction(const ComputeOpNode* self, const Stage& stage, debug_keep_trivial_loop); size_t size = self->body.size(); - CHECK_GT(size, 0); + ICHECK_GT(size, 0); std::vector reduces(size); for (size_t i = 0; i < size; ++i) { const ReduceNode* reduce = self->body[i].as(); - CHECK(reduce); - CHECK(reduce->init.empty()) << "Cannot perform cross_thread_reduction for reductions with init"; + ICHECK(reduce); + ICHECK(reduce->init.empty()) + << "Cannot perform cross_thread_reduction for reductions with init"; reduces[i] = reduce; } @@ -140,7 +141,7 @@ Stmt MakeCrossThreadReduction(const ComputeOpNode* self, const Stage& stage, normal_init.reserve(size); normal_update.resize(size); const CommReducerNode* combiner = reduces[0]->combiner.as(); - CHECK(combiner); + ICHECK(combiner); Array lhs; for (size_t i = 0; i < size; ++i) { DataType t = reduces[i]->dtype; diff --git a/src/te/operation/extern_op.cc b/src/te/operation/extern_op.cc index 2afdd4a93c7e..1c9a3cb336ae 100644 --- a/src/te/operation/extern_op.cc +++ b/src/te/operation/extern_op.cc @@ -60,14 +60,14 @@ ExternOp::ExternOp(std::string name, std::string tag, Map att n->name = std::move(name); n->tag = std::move(tag); n->attrs = std::move(attrs); - CHECK_EQ(inputs.size(), input_placeholders.size()); + ICHECK_EQ(inputs.size(), input_placeholders.size()); for (size_t i = 0; i < inputs.size(); ++i) { - CHECK_EQ(inputs[i]->dtype, input_placeholders[i]->dtype); - CHECK_EQ(inputs[i]->shape.size(), input_placeholders[i]->shape.size()); + ICHECK_EQ(inputs[i]->dtype, input_placeholders[i]->dtype); + ICHECK_EQ(inputs[i]->shape.size(), input_placeholders[i]->shape.size()); for (size_t dim = 0; dim < inputs[i]->shape.size(); ++dim) 
{ - CHECK(inputs[i]->shape[dim].same_as(input_placeholders[i]->shape[dim])); + ICHECK(inputs[i]->shape[dim].same_as(input_placeholders[i]->shape[dim])); } - CHECK_EQ(input_placeholders[i]->strides.size(), 0U); + ICHECK_EQ(input_placeholders[i]->strides.size(), 0U); } n->inputs = std::move(inputs); n->input_placeholders = std::move(input_placeholders); @@ -87,7 +87,7 @@ Array ExternOpNode::InputTensors() const { return inputs; } Operation ExternOpNode::ReplaceInputs(const Operation& self, const std::unordered_map& rmap) const { - CHECK_EQ(self.operator->(), this); + ICHECK_EQ(self.operator->(), this); auto n = make_object(*this); n->body = ReplaceTensor(this->body, rmap); for (size_t i = 0; i < n->inputs.size(); ++i) { @@ -125,7 +125,7 @@ void ExternOpNode::GatherBound(const Operation& self, Stmt ExternOpNode::BuildRealize(const Stage& stage, const std::unordered_map& realize_map, const Stmt& body) const { - CHECK_EQ(stage->op.get(), this); + ICHECK_EQ(stage->op.get(), this); Stmt realize_body = body; for (int k = 0; k < num_outputs(); ++k) { Tensor t = stage->op.output(k); @@ -141,7 +141,7 @@ Stmt ExternOpNode::BuildRealize(const Stage& stage, Stmt ExternOpNode::BuildProvide(const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) const { - CHECK_EQ(stage->op.operator->(), this); + ICHECK_EQ(stage->op.operator->(), this); Stmt ret = AttrStmt(make_zero(DataType::Int(32)), tir::attr::extern_scope, 0, this->body); auto f_push_bind = [&ret](Buffer buffer, Tensor tensor) { Array bind_spec; diff --git a/src/te/operation/hybrid_op.cc b/src/te/operation/hybrid_op.cc index 98270e9a2952..94e06d206ddb 100644 --- a/src/te/operation/hybrid_op.cc +++ b/src/te/operation/hybrid_op.cc @@ -101,7 +101,7 @@ Array HybridOpNode::InputTensors() const { Operation HybridOpNode::ReplaceInputs(const Operation& self, const std::unordered_map& rmap) const { - CHECK_EQ(self.operator->(), this); + ICHECK_EQ(self.operator->(), this); auto n = make_object(*this); n->body = te::ReplaceTensor(this->body, rmap); for (size_t i = 0; i < n->inputs.size(); ++i) { @@ -137,7 +137,7 @@ void HybridOpNode::GatherBound(const Operation& self, const std::unordered_map& tensor_dom, std::unordered_map* out_dom_map) const { for (auto iter_var : axis) { - CHECK(!out_dom_map->count(iter_var)); + ICHECK(!out_dom_map->count(iter_var)); out_dom_map->operator[](iter_var) = iter_var->dom; } } @@ -146,7 +146,7 @@ Stmt HybridOpNode::BuildRealize(const Stage& stage, const std::unordered_map& realize_map, const Stmt& body) const { // TODO(@were): Add attribute inject here and remove it from hybrid parser. 
- CHECK_EQ(stage->op.get(), this); + ICHECK_EQ(stage->op.get(), this); Stmt realize_body = body; for (int k = 0; k < num_outputs(); ++k) { Tensor t = stage->op.output(k); @@ -162,7 +162,7 @@ Stmt HybridOpNode::BuildRealize(const Stage& stage, Stmt HybridOpNode::BuildProvide(const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) const { - CHECK_EQ(stage->op.operator->(), this); + ICHECK_EQ(stage->op.operator->(), this); Stmt ret = AttrStmt(make_zero(DataType::Int(32)), tir::attr::extern_scope, 0, this->body); std::unordered_map rmap; for (int i = 0; i < this->num_outputs(); ++i) { @@ -213,14 +213,14 @@ Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_mapparent->var.get(); auto& inner_ = split->inner; - CHECK(dom_map.count(inner_)); + ICHECK(dom_map.count(inner_)); auto& inner_dom = dom_map.find(inner_)->second; - CHECK(is_const_int(inner_dom->min, 0)); + ICHECK(is_const_int(inner_dom->min, 0)); auto& outer_ = split->outer; - CHECK(dom_map.count(outer_)); + ICHECK(dom_map.count(outer_)); auto& outer_dom = dom_map.find(outer_)->second; - CHECK(is_const_int(outer_dom->min, 0)); + ICHECK(is_const_int(outer_dom->min, 0)); inner = IterVar(inner_dom, inner_->var, inner_->iter_type); outer = IterVar(outer_dom, outer_->var, outer_->iter_type); @@ -264,7 +264,7 @@ Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_maploop_var.get() == inner) { - CHECK(under_outer); + ICHECK(under_outer); std::unordered_map rmap; rmap[op->loop_var.get()] = indexmod(parent, op->extent); extent = op->extent; @@ -295,11 +295,11 @@ Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_map()) { LoopSpliter Spliter(split, dom_map); stmt = Spliter(stmt); - CHECK(Spliter.splitted); + ICHECK(Spliter.splitted); } else if (const FuseNode* fuse = rel.as()) { LoopFuser Fuser(fuse); stmt = Fuser(stmt); - CHECK(Fuser.fused); + ICHECK(Fuser.fused); } } @@ -322,8 +322,8 @@ Stmt ApplyLoopAnnotations(const Stage& stage, const std::unordered_mapbind_thread.defined()) { const auto& iter_var = attr->bind_thread; if (iter_var->dom.defined()) { - CHECK(is_const_int(iter_var->dom->min, 0)); - CHECK(expr_equal(iter_var->dom->extent, op->extent)) + ICHECK(is_const_int(iter_var->dom->min, 0)); + ICHECK(expr_equal(iter_var->dom->extent, op->extent)) << "Thread extent and loop extent mismatch!\n"; } std::unordered_map rmap; @@ -361,7 +361,7 @@ Stmt ApplyLoopAnnotations(const Stage& stage, const std::unordered_map }); std::reverse(current_order.begin(), current_order.end()); auto& required_ord = stage->leaf_iter_vars; - CHECK_EQ(current_order.size(), required_ord.size()) << "Cannot reorder the loops!"; + ICHECK_EQ(current_order.size(), required_ord.size()) << "Cannot reorder the loops!"; std::unordered_map reorder; bool need_reorder = false; for (size_t i = 0; i < current_order.size(); ++i) { auto& current = current_order[i]; const IterVar& iter_var = required_ord[i]; const IterVar& required = rebased.count(iter_var) ? 
rebased.find(iter_var)->second : iter_var; - CHECK(required->dom.defined() || dom_map.count(required)) << required << "\n"; + ICHECK(required->dom.defined() || dom_map.count(required)) << required << "\n"; reorder[current] = required; if (current != required->var.get()) { need_reorder = true; @@ -404,7 +404,7 @@ Stmt ApplyLoopOrder(const Stage& stage, const std::unordered_map Stmt VisitStmt_(const ForNode* op) final { // Reorder from in to out Stmt body_ = this->VisitStmt(op->body); - CHECK(reorder.count(op->loop_var.get())); + ICHECK(reorder.count(op->loop_var.get())); auto target = reorder.find(op->loop_var.get())->second; if (body_.same_as(op->body) && op->loop_var.get() == target->var.get()) return GetRef(op); @@ -431,8 +431,8 @@ Stmt ApplySchedule(const Stage& stage, const std::unordered_map& for (auto rel : stage->relations) { if (const auto* rebase = rel.as()) { rebased[rebase->rebased] = rebase->parent; - CHECK(rebase->parent->dom.defined()); - CHECK(dom_map.count(rebase->rebased)); + ICHECK(rebase->parent->dom.defined()); + ICHECK(dom_map.count(rebase->rebased)); } } stmt = ApplyLoopShapes(stage, dom_map, stmt); diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc index 80f7fe2b4e41..f1991c181e67 100644 --- a/src/te/operation/op_utils.cc +++ b/src/te/operation/op_utils.cc @@ -100,7 +100,7 @@ std::vector > MakeLoopNest(const Stage& stage, default: LOG(FATAL) << "Unknown iter type" << it_attr->iter_type << " in the iter_var_attrs"; } - CHECK_EQ(it_attr->pragma_keys.size(), it_attr->pragma_values.size()); + ICHECK_EQ(it_attr->pragma_keys.size(), it_attr->pragma_values.size()); for (size_t k = 0; k < it_attr->pragma_keys.size(); ++k) { const std::string& pkey = it_attr->pragma_keys[k].as()->value; PrimExpr pvalue = it_attr->pragma_values[k]; @@ -125,8 +125,8 @@ std::vector > MakeLoopNest(const Stage& stage, nest[i + 1].emplace_back(LetStmt(var, new_value, no_op)); } if (it_attr.defined() && it_attr->prefetch_data.size() != 0) { - CHECK(!is_one(dom->extent)) << "Cannot prefetch on trivial loop with extent=1"; - CHECK_EQ(it_attr->prefetch_data.size(), it_attr->prefetch_offset.size()); + ICHECK(!is_one(dom->extent)) << "Cannot prefetch on trivial loop with extent=1"; + ICHECK_EQ(it_attr->prefetch_data.size(), it_attr->prefetch_offset.size()); for (size_t j = 0; j < it_attr->prefetch_data.size(); ++j) { nest[i + 1].emplace_back(AttrStmt(it_attr->prefetch_data[j], tir::attr::prefetch_scope, it_attr->prefetch_offset[j], no_op)); @@ -135,23 +135,23 @@ std::vector > MakeLoopNest(const Stage& stage, } else if (bind_iv->thread_tag == "vthread" || bind_iv->thread_tag == "cthread") { // virtual thread // Always restrict threaded IterVar to starts from 0. - CHECK(is_zero(dom->min)); - CHECK(is_positive_const(dom->extent)); + ICHECK(is_zero(dom->min)); + ICHECK(is_positive_const(dom->extent)); // annotate the extent of the IterVar nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::virtual_thread, dom->extent, no_op)); value_map[iv] = var; } else if (bind_iv->thread_tag == "pipeline") { // pipeline marker. - CHECK(is_zero(dom->min)); - CHECK(is_one(dom->extent)); + ICHECK(is_zero(dom->min)); + ICHECK(is_one(dom->extent)); // annotate the extent of the IterVar nest[i + 1].emplace_back( AttrStmt(bind_iv, tir::attr::pipeline_exec_scope, dom->extent, no_op)); value_map[iv] = dom->min; } else { // Always restrict threaded IterVar to starts from 0. 
- CHECK(is_zero(dom->min)) << "Itervar " << iv << " must start at zero, but it starts at " - << dom->min; + ICHECK(is_zero(dom->min)) << "Itervar " << iv << " must start at zero, but it starts at " + << dom->min; // annotate the extent of the IterVar nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::thread_extent, dom->extent, no_op)); if (!debug_keep_trivial_loop && is_one(dom->extent)) { @@ -205,7 +205,7 @@ class TensorReplacer : public tir::StmtExprMutator { PrimExpr VisitExpr_(const tir::ProducerLoadNode* op) final { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - CHECK(op != nullptr); + ICHECK(op != nullptr); Tensor t = Downcast(op->producer); auto it = vmap_.find(t); diff --git a/src/te/operation/placeholder_op.cc b/src/te/operation/placeholder_op.cc index 5b7ede314e49..c51e53e16cd1 100644 --- a/src/te/operation/placeholder_op.cc +++ b/src/te/operation/placeholder_op.cc @@ -41,12 +41,12 @@ int PlaceholderOpNode::num_outputs() const { return 1; } Array PlaceholderOpNode::root_iter_vars() const { return {}; } DataType PlaceholderOpNode::output_dtype(size_t i) const { - CHECK_EQ(i, 0U); + ICHECK_EQ(i, 0U); return dtype; } Array PlaceholderOpNode::output_shape(size_t i) const { - CHECK_EQ(i, 0U); + ICHECK_EQ(i, 0U); return shape; } diff --git a/src/te/operation/scan_op.cc b/src/te/operation/scan_op.cc index 726714580b78..a555e86097b7 100644 --- a/src/te/operation/scan_op.cc +++ b/src/te/operation/scan_op.cc @@ -51,7 +51,7 @@ Array ScanOpNode::root_iter_vars() const { DataType ScanOpNode::output_dtype(size_t i) const { return update[i]->dtype; } Array ScanOpNode::output_shape(size_t i) const { - CHECK_LT(i, state_placeholder.size()); + ICHECK_LT(i, state_placeholder.size()); return state_placeholder[i]->shape; } @@ -62,27 +62,27 @@ ScanOp::ScanOp(std::string name, std::string tag, Map attrs, attrs = Map(); } auto n = make_object(); - CHECK_EQ(init.size(), update.size()); - CHECK_EQ(init.size(), state_placeholder.size()); + ICHECK_EQ(init.size(), update.size()); + ICHECK_EQ(init.size(), state_placeholder.size()); arith::Analyzer analyzer; auto prove_equal = [&](PrimExpr lhs, PrimExpr rhs) { return is_zero(analyzer.Simplify(lhs - rhs)); }; for (size_t i = 0; i < init.size(); ++i) { - CHECK_EQ(init[i]->dtype, state_placeholder[i]->dtype); - CHECK_EQ(init[i]->dtype, update[i]->dtype); - CHECK(prove_equal(init[i]->shape[0], axis->dom->min)) + ICHECK_EQ(init[i]->dtype, state_placeholder[i]->dtype); + ICHECK_EQ(init[i]->dtype, update[i]->dtype); + ICHECK(prove_equal(init[i]->shape[0], axis->dom->min)) << "init.shape[0] need to match scan_axis.dom.min"; - CHECK(prove_equal(state_placeholder[i]->shape[0], axis->dom->min + axis->dom->extent)) + ICHECK(prove_equal(state_placeholder[i]->shape[0], axis->dom->min + axis->dom->extent)) << "state_placeholder.shape[0] need to match" << " scan_axis.dom.min + scan_axis.dom.extent"; - CHECK_EQ(state_placeholder[i].ndim(), init[i].ndim()) + ICHECK_EQ(state_placeholder[i].ndim(), init[i].ndim()) << "The dimension of init need to match state_placeholder"; - CHECK_EQ(update[i].ndim(), state_placeholder[i].ndim()) + ICHECK_EQ(update[i].ndim(), state_placeholder[i].ndim()) << "The update.ndim need to be state_placeholder.ndim - 1"; for (size_t k = 0; k < update[i].ndim(); ++k) { - CHECK(prove_equal(update[i]->shape[k], state_placeholder[i]->shape[k])); + ICHECK(prove_equal(update[i]->shape[k], state_placeholder[i]->shape[k])); if (k != 0) { // setup spatial axis std::ostringstream spatial_name; @@ -93,7 +93,7 @@ ScanOp::ScanOp(std::string 
name, std::string tag, Map attrs, } for (size_t k = 1; k < init[i].ndim(); ++k) { - CHECK(prove_equal(init[i]->shape[k], state_placeholder[i]->shape[k])); + ICHECK(prove_equal(init[i]->shape[k], state_placeholder[i]->shape[k])); } } n->name = std::move(name); @@ -141,7 +141,7 @@ Array ScanOpNode::InputTensors() const { Operation ScanOpNode::ReplaceInputs(const Operation& self, const std::unordered_map& rmap) const { - CHECK_EQ(self.operator->(), this); + ICHECK_EQ(self.operator->(), this); auto n = make_object(*this); for (size_t i = 0; i < n->init.size(); ++i) { if (rmap.count(n->init[i])) { @@ -161,7 +161,7 @@ Operation ScanOpNode::ReplaceInputs(const Operation& self, void ScanOpNode::PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer, const std::unordered_map& dom_map, std::unordered_map* out_dom_map) const { - CHECK_EQ(self.operator->(), this); + ICHECK_EQ(self.operator->(), this); for (size_t i = 0, sp_idx = 0; i < this->init.size(); ++i) { TensorDom* init_dom = nullptr; TensorDom* update_dom = nullptr; @@ -195,8 +195,8 @@ void ScanOpNode::PropBoundToInputs(const Operation& self, arith::Analyzer* analy void ScanOpNode::GatherBound(const Operation& self, const std::unordered_map& tensor_dom, std::unordered_map* out_dom_map) const { - CHECK_EQ(self.operator->(), this); - CHECK(!out_dom_map->count(this->scan_axis)); + ICHECK_EQ(self.operator->(), this); + ICHECK(!out_dom_map->count(this->scan_axis)); std::vector output(this->num_outputs()); for (size_t i = 0; i < output.size(); ++i) { output[i] = self.output(i); @@ -207,7 +207,7 @@ void ScanOpNode::GatherBound(const Operation& self, const TensorDom& d = tensor_dom.at(output[i]); time_dom.insert(time_dom.end(), d.data[0].begin(), d.data[0].end()); } - CHECK(!out_dom_map->count(this->scan_axis)); + ICHECK(!out_dom_map->count(this->scan_axis)); arith::Analyzer analyzer; Range sdom = this->scan_axis->dom; Range r = arith::Union(time_dom).CoverRange(sdom); @@ -220,8 +220,8 @@ void ScanOpNode::GatherBound(const Operation& self, const TensorDom& d = tensor_dom.at(output[i]); for (size_t k = 1; k < this->update[i]->shape.size(); ++k, ++sp_idx) { IterVar sp_ax = this->spatial_axis_[sp_idx]; - CHECK(!out_dom_map->count(sp_ax)); - CHECK(fix_pt.count(sp_ax)); + ICHECK(!out_dom_map->count(sp_ax)); + ICHECK(fix_pt.count(sp_ax)); if (fix_pt[sp_ax].as()->value) { // fix point, we can slice it. 
(*out_dom_map)[sp_ax] = arith::Union(d.data[k]).CoverRange(sp_ax->dom); @@ -236,14 +236,14 @@ void ScanOpNode::GatherBound(const Operation& self, Stmt ScanOpNode::BuildRealize(const Stage& stage, const std::unordered_map& dom_map, const Stmt& body) const { arith::Analyzer analyzer; - CHECK_EQ(stage->op.get(), this); + ICHECK_EQ(stage->op.get(), this); Range sdom = dom_map.at(this->scan_axis); Range tdom = Range::FromMinExtent(0, analyzer.Simplify(sdom->extent + sdom->min)); Stmt ret = body; size_t sp_idx = 0; for (size_t i = 0; i < update.size(); ++i) { Tensor t = stage->op.output(i); - CHECK_EQ(static_cast(t->value_index), i); + ICHECK_EQ(static_cast(t->value_index), i); Region bounds; bounds.push_back(tdom); for (size_t k = 1; k < this->update[i]->shape.size(); ++k, ++sp_idx) { @@ -257,14 +257,14 @@ Stmt ScanOpNode::BuildRealize(const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) const { - CHECK_EQ(stage->op.operator->(), this); + ICHECK_EQ(stage->op.operator->(), this); Stmt provide = AttrStmt(stage->op, tir::attr::scan_update_scope, this->scan_axis->var, Evaluate(0)); Stmt init = AttrStmt(stage->op, tir::attr::scan_init_scope, 0, Evaluate(0)); size_t begin_scan = 0; for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) { if (stage->leaf_iter_vars[i]->iter_type == kThreadIndex) { - CHECK_EQ(begin_scan, i); + ICHECK_EQ(begin_scan, i); begin_scan = i + 1; } } diff --git a/src/te/operation/tensor_compute_op.cc b/src/te/operation/tensor_compute_op.cc index ecb2e860c3e6..262e5a2b97f4 100644 --- a/src/te/operation/tensor_compute_op.cc +++ b/src/te/operation/tensor_compute_op.cc @@ -83,7 +83,7 @@ Array TensorComputeOpNode::InputTensors() const { return inputs; } Operation TensorComputeOpNode::ReplaceInputs(const Operation& self, const std::unordered_map& rmap) const { - CHECK_EQ(self.operator->(), this); + ICHECK_EQ(self.operator->(), this); auto n = make_object(*this); auto intrin = make_object(*(this->intrin.operator->())); intrin->body = ReplaceTensor(this->intrin->body, rmap); @@ -132,7 +132,7 @@ size_t TensorComputeOpNode::num_schedulable_dims() const { return schedulable_nd Stmt TensorComputeOpNode::BuildProvide(const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) const { - CHECK_EQ(stage->op.operator->(), this); + ICHECK_EQ(stage->op.operator->(), this); // Start bind data. Stmt nop = Evaluate(0); @@ -194,7 +194,7 @@ Stmt TensorComputeOpNode::BuildProvide(const Stage& stage, PrimExpr esp = sp; sp_expr.push_back(esp); } - CHECK_EQ(sp_expr.size(), user_expr.size()); + ICHECK_EQ(sp_expr.size(), user_expr.size()); // TODO(jdavies-huawei): what name should be used here? 
binder.BindArray(sp_expr, user_expr, this->name); @@ -204,8 +204,8 @@ Stmt TensorComputeOpNode::BuildProvide(const Stage& stage, if (this->reduce_axis.size() == 0) { std::vector > nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1); nest.emplace_back(MakeIfNest(n.main_predicates)); - CHECK_EQ(n.init_predicates.size(), 0U); - CHECK(this->intrin->body.defined()) + ICHECK_EQ(n.init_predicates.size(), 0U); + ICHECK(this->intrin->body.defined()) << "Normal store op for intrin " << this << " is not defined"; Stmt body = MergeNest(output_bind_nest, this->intrin->body); body = MergeNest(input_bind_nest, body); @@ -216,9 +216,9 @@ Stmt TensorComputeOpNode::BuildProvide(const Stage& stage, return ret; } else { // Need to split reduction - CHECK(this->intrin->reduce_update.defined()) << "Reduction update op is not defined"; + ICHECK(this->intrin->reduce_update.defined()) << "Reduction update op is not defined"; // Need init and update steps - CHECK_NE(this->reduce_axis.size(), 0U); + ICHECK_NE(this->reduce_axis.size(), 0U); std::vector > common(n.main_nest.begin(), n.main_nest.begin() + n.num_common_loop + 1); std::vector > update_nest(n.main_nest.begin() + n.num_common_loop + 1, @@ -243,7 +243,7 @@ Stmt TensorComputeOpNode::BuildProvide(const Stage& stage, return MergeNest(common, SeqStmt::Flatten(init, update)); } else { // When init op is not available, use body op for reset in the first iter. - CHECK(this->intrin->body.defined()) << "Normal body op is not defined"; + ICHECK(this->intrin->body.defined()) << "Normal body op is not defined"; Stmt update = TransformUpdate(stage, dom_map, n, this->intrin->body, this->intrin->reduce_update); update = MergeNest(output_bind_nest, update); diff --git a/src/te/operation/tensorize.cc b/src/te/operation/tensorize.cc index 9733cd940a9f..bfd1ec579818 100644 --- a/src/te/operation/tensorize.cc +++ b/src/te/operation/tensorize.cc @@ -50,14 +50,14 @@ size_t InferTensorizeRegion(const ComputeOpNode* self, const Stage& stage, // Loop over the leafs for (size_t i = stage->leaf_iter_vars.size(); i != 0; --i) { IterVar iv = stage->leaf_iter_vars[i - 1]; - CHECK(iv->iter_type == kDataPar || iv->iter_type == kCommReduce); + ICHECK(iv->iter_type == kDataPar || iv->iter_type == kCommReduce); auto vit = dom_map.find(iv); - CHECK(vit != dom_map.end()); + ICHECK(vit != dom_map.end()); const Range& vrange = vit->second; if (is_one(vrange->extent)) { up_state[iv] = IntSet::SinglePoint(vrange->min); } else if (found_point) { - CHECK(is_zero(vrange->min)); + ICHECK(is_zero(vrange->min)); up_state[iv] = IntSet::SinglePoint(iv->var); } else { up_state[iv] = IntSet::FromRange(vrange); @@ -66,16 +66,16 @@ size_t InferTensorizeRegion(const ComputeOpNode* self, const Stage& stage, if (iit != stage->iter_var_attrs.end()) { const IterVarAttr& attr = (*iit).second; if (!found_point) { - CHECK(!attr->bind_thread.defined()) << "Do not allow thread in tensorize scope"; + ICHECK(!attr->bind_thread.defined()) << "Do not allow thread in tensorize scope"; } if (attr->iter_type == kTensorized) { - CHECK(!found_point) << "Do not allow two tensorized point"; + ICHECK(!found_point) << "Do not allow two tensorized point"; found_point = true; loc_scope = i - 1; } } } - CHECK(found_point); + ICHECK(found_point); // Get domain of the tensorized scope. 
te::PassUpDomain(stage, dom_map, &up_state); // Get domains if inputs @@ -101,7 +101,7 @@ size_t InferTensorizeRegion(const ComputeOpNode* self, const Stage& stage, const Tensor& t = kv.first; for (size_t i = 0; i < t.ndim(); ++i) { Range r = arith::Union(kv.second.data.at(i)).CoverRange(none); - CHECK(r.defined()) << "cannot deduce region of tensorized scope for input " << t; + ICHECK(r.defined()) << "cannot deduce region of tensorized scope for input " << t; vec.push_back(std::move(r)); } (*in_region)[t] = std::move(vec); @@ -113,8 +113,8 @@ void VerifyTensorizeLoopNest(const ComputeOpNode* self, const Stage& stage, const ComputeLoopNest& n, size_t tloc) { // Veirfication step. std::unordered_set banned; - CHECK_EQ(n.main_nest.size(), stage->leaf_iter_vars.size() + 1); - CHECK(n.init_nest.size() == stage->leaf_iter_vars.size() + 1 || n.init_nest.size() == 0); + ICHECK_EQ(n.main_nest.size(), stage->leaf_iter_vars.size() + 1); + ICHECK(n.init_nest.size() == stage->leaf_iter_vars.size() + 1 || n.init_nest.size() == 0); auto f_push_banned = [&banned](const Stmt& s) { if (const ForNode* op = s.as()) { banned.insert(op->loop_var.get()); @@ -163,7 +163,7 @@ class TensorIntrinMatcher final : public StmtExprMutator { auto it = in_remap_.find(t); if (it != in_remap_.end()) { const InputEntry& e = it->second; - CHECK_EQ(op->indices.size(), e.region.size()); + ICHECK_EQ(op->indices.size(), e.region.size()); Array indices; for (size_t i = e.start; i < e.region.size(); ++i) { indices.push_back(op->indices[i] - e.region[i]->min); @@ -200,7 +200,7 @@ class TensorIntrinMatcher final : public StmtExprMutator { const std::unordered_map& out_dom, const std::unordered_map >& in_region, const TensorIntrin& intrin, Map* compute_intrin_iter_space) { - CHECK(self == stage->op.get()); + ICHECK(self == stage->op.get()); for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) { IterVar iv = stage->leaf_iter_vars[i]; @@ -214,17 +214,17 @@ class TensorIntrinMatcher final : public StmtExprMutator { // input remap. 
Array inputs = self->InputTensors(); - CHECK_EQ(inputs.size(), intrin->inputs.size()); + ICHECK_EQ(inputs.size(), intrin->inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { InputEntry e; e.tensor = intrin->inputs[i]; e.region = Array(in_region.at(inputs[i])); - CHECK_GE(e.region.size(), e.tensor.ndim()); + ICHECK_GE(e.region.size(), e.tensor.ndim()); // Enable fuzzy matching, to match [1, n, m] to [n, m] e.start = e.region.size() - e.tensor.ndim(); for (size_t j = 0; j < e.start; ++j) { auto canonical_extent = analyzer_.Simplify(e.region[j]->extent); - CHECK(is_one(canonical_extent)) + ICHECK(is_one(canonical_extent)) << "Tensorize " << intrin->name << ":" << " Input dimension mismatch with tensor intrin " << " expected shape=" << e.tensor->shape << ", given region=" << e.region; @@ -233,16 +233,16 @@ class TensorIntrinMatcher final : public StmtExprMutator { } // output remap const ComputeOpNode* intrin_compute = intrin->op.as(); - CHECK(intrin_compute) << "Only support compute intrinsic for now"; - CHECK_GE(self->axis.size(), intrin_compute->axis.size()) + ICHECK(intrin_compute) << "Only support compute intrinsic for now"; + ICHECK_GE(self->axis.size(), intrin_compute->axis.size()) << "Tensorize: Output mismatch with tensor intrin "; // Enable fuzzy matching, to match [1, n, m] to [n, m] size_t axis_start = self->axis.size() - intrin_compute->axis.size(); for (size_t i = 0; i < axis_start; ++i) { Range r = out_dom.at(self->axis[i]); - CHECK(is_one(r->extent)) << "Tensorize: Output mismatch with tensor intrin " - << " intrin-dim=" << intrin_compute->axis.size() - << ", tensorize-dim=" << self->axis.size(); + ICHECK(is_one(r->extent)) << "Tensorize: Output mismatch with tensor intrin " + << " intrin-dim=" << intrin_compute->axis.size() + << ", tensorize-dim=" << self->axis.size(); var_remap_[self->axis[i]->var.get()] = r->min; } // Assume we tensorize at regin axis i [min, min + extent) @@ -257,14 +257,14 @@ class TensorIntrinMatcher final : public StmtExprMutator { compute_intrin_iter_space->Set(target_iv->var, target_iv->dom); } // Remap reduction axis - CHECK_GE(self->reduce_axis.size(), intrin_compute->reduce_axis.size()) + ICHECK_GE(self->reduce_axis.size(), intrin_compute->reduce_axis.size()) << "Tensorize: Reduction dimension mismatch with tensor intrin"; axis_start = self->reduce_axis.size() - intrin_compute->reduce_axis.size(); for (size_t i = 0; i < axis_start; ++i) { Range r = out_dom.at(self->reduce_axis[i]); - CHECK(is_one(r->extent)) << "Tensorize: Reduction mismatch with tensor intrin " - << " intrin-dim=" << intrin_compute->reduce_axis.size() - << ", tensorize-dim=" << self->reduce_axis.size(); + ICHECK(is_one(r->extent)) << "Tensorize: Reduction mismatch with tensor intrin " + << " intrin-dim=" << intrin_compute->reduce_axis.size() + << ", tensorize-dim=" << self->reduce_axis.size(); var_remap_[self->reduce_axis[i]->var.get()] = r->min; } for (size_t i = axis_start; i < self->reduce_axis.size(); ++i) { @@ -320,8 +320,8 @@ void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage, Array body = MatchTensorizeBody(self, stage, dom_map, out_dom, in_region, intrin, &compute_intrin_iter_space); const ComputeOpNode* intrin_compute = intrin->op.as(); - CHECK(intrin_compute) << "Only support compute intrinsic for now"; - CHECK_EQ(body.size(), intrin_compute->body.size()) << "Tensorize failed: body size mismatch"; + ICHECK(intrin_compute) << "Only support compute intrinsic for now"; + ICHECK_EQ(body.size(), intrin_compute->body.size()) << "Tensorize failed: 
body size mismatch"; arith::Analyzer ana; ana.Bind(compute_intrin_iter_space); @@ -333,9 +333,9 @@ void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage, << "'s declaration " << " provided=" << lhs.dtype() << ", intrin=" << rhs.dtype(); } - CHECK(expr_equal(lhs, rhs)) << "Failed to match the compute with TensorIntrin " << intrin->name - << "'s declaration " - << " provided= " << lhs << ", intrin= " << rhs; + ICHECK(expr_equal(lhs, rhs)) << "Failed to match the compute with TensorIntrin " << intrin->name + << "'s declaration " + << " provided= " << lhs << ", intrin= " << rhs; } } @@ -346,7 +346,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, std::unordered_map > in_region; size_t tloc = InferTensorizeRegion(self, stage, dom_map, &out_dom, &in_region); TensorIntrin intrin = stage->iter_var_attrs.at(stage->leaf_iter_vars[tloc])->tensor_intrin; - CHECK(intrin.defined()); + ICHECK(intrin.defined()); ComputeLoopNest n = ComputeLoopNest::Create(self, stage, dom_map, debug_keep_trivial_loop); VerifyTensorizeLoopNest(self, stage, n, tloc); VerifyTensorizeBody(self, stage, dom_map, out_dom, in_region, intrin); @@ -354,14 +354,14 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, Stmt nop = Evaluate(0); std::vector input_bind_nest, output_bind_nest; Array inputs = self->InputTensors(); - CHECK_EQ(inputs.size(), intrin->inputs.size()) << "Tensorize failed: input size mismatch "; + ICHECK_EQ(inputs.size(), intrin->inputs.size()) << "Tensorize failed: input size mismatch "; // input binding for (size_t i = 0; i < intrin->inputs.size(); ++i) { Tensor tensor = inputs[i]; Buffer buffer = intrin->buffers[i]; Array bind_spec{buffer, tensor}; auto it = in_region.find(tensor); - CHECK(it != in_region.end()); + ICHECK(it != in_region.end()); const Array& region = it->second; Array tuple; for (const Range r : region) { @@ -374,13 +374,13 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, } // output binding const ComputeOpNode* intrin_compute = intrin->op.as(); - CHECK(intrin_compute) << "Only support compute intrinsic for now"; - CHECK_EQ(intrin->inputs.size() + intrin_compute->body.size(), intrin->buffers.size()); - CHECK_EQ(intrin_compute->body.size(), self->body.size()); + ICHECK(intrin_compute) << "Only support compute intrinsic for now"; + ICHECK_EQ(intrin->inputs.size() + intrin_compute->body.size(), intrin->buffers.size()); + ICHECK_EQ(intrin_compute->body.size(), self->body.size()); Array tuple; for (IterVar iv : self->axis) { auto it = out_dom.find(iv); - CHECK(it != out_dom.end()); + ICHECK(it != out_dom.end()); tuple.push_back(it->second->min); tuple.push_back(it->second->extent); } @@ -395,20 +395,20 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, // Check variable remap std::unordered_map vmap; tir::ArgBinder binder(&vmap); - CHECK_GE(self->reduce_axis.size(), intrin_compute->reduce_axis.size()) + ICHECK_GE(self->reduce_axis.size(), intrin_compute->reduce_axis.size()) << "Tensorization fail: reduction axis size do not match"; size_t start = self->reduce_axis.size() - intrin_compute->reduce_axis.size(); for (size_t i = 0; i < start; ++i) { IterVar iv = self->reduce_axis[i]; auto it = out_dom.find(iv); - CHECK(it != out_dom.end()); - CHECK(is_one(it->second->extent)) << "Tensorization fail: reduction axis size do not match"; + ICHECK(it != out_dom.end()); + ICHECK(is_one(it->second->extent)) << "Tensorization fail: reduction axis size do not match"; } for (size_t i = start; i < self->reduce_axis.size(); 
++i) { IterVar iv = self->reduce_axis[i]; IterVar target = intrin_compute->reduce_axis[i - start]; auto it = out_dom.find(iv); - CHECK(it != out_dom.end()); + ICHECK(it != out_dom.end()); binder.Bind(target->dom->min, make_const(iv->dom->min.dtype(), 0), "tensir_intrin.reduction.min"); binder.Bind(target->dom->extent, it->second->extent, "tensir_intrin.reduction.extent"); @@ -417,8 +417,8 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, // Do no need to split reduction std::vector > nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1); nest.emplace_back(MakeIfNest(n.main_predicates)); - CHECK_EQ(n.init_predicates.size(), 0U); - CHECK(intrin->body.defined()) << "Normal store op for intrin " << intrin << " is not defined"; + ICHECK_EQ(n.init_predicates.size(), 0U); + ICHECK(intrin->body.defined()) << "Normal store op for intrin " << intrin << " is not defined"; Stmt body = MergeNest(output_bind_nest, intrin->body); body = MergeNest(input_bind_nest, body); body = tir::Substitute(body, vmap); @@ -427,10 +427,10 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, return MergeNest(nest, body); } else { // Need to split reduction - CHECK(intrin->reduce_update.defined()) + ICHECK(intrin->reduce_update.defined()) << "Reduction update op for intrin " << intrin << " is not defined"; // Need init and update steps - CHECK_NE(self->reduce_axis.size(), 0U); + ICHECK_NE(self->reduce_axis.size(), 0U); std::vector > common(n.main_nest.begin(), n.main_nest.begin() + n.num_common_loop + 1); std::vector > update_nest(n.main_nest.begin() + n.num_common_loop + 1, @@ -455,7 +455,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, return MergeNest(common, SeqStmt::Flatten(init, update)); } else { // When init op is not available, use body op for reset in the first iter. 
- CHECK(intrin->body.defined()) << "Normal body op for intrin " << intrin << " is not defined"; + ICHECK(intrin->body.defined()) << "Normal body op for intrin " << intrin << " is not defined"; Stmt update = TransformUpdate(stage, dom_map, n, intrin->body, intrin->reduce_update); update = MergeNest(output_bind_nest, update); update = MergeNest(input_bind_nest, update); @@ -474,7 +474,7 @@ TVM_REGISTER_GLOBAL("test.op.InferTensorizeRegion").set_body([](TVMArgs args, TV Map dmap = args[1]; std::unordered_map out_dom; std::unordered_map > in_region; - CHECK(stage->op.as()); + ICHECK(stage->op.as()); InferTensorizeRegion(stage->op.as(), stage, as_unordered_map(dmap), &out_dom, &in_region); *ret = Array{Map(out_dom), Map >(in_region)}; @@ -486,7 +486,7 @@ TVM_REGISTER_GLOBAL("test.op.MatchTensorizeBody").set_body([](TVMArgs args, TVMR Map > in_region = args[2]; TensorIntrin intrin = args[3]; Map vrange; - CHECK(stage->op.as()); + ICHECK(stage->op.as()); *ret = MatchTensorizeBody(stage->op.as(), stage, {{}}, as_unordered_map(out_dom), as_unordered_map(in_region), intrin, &vrange); }); diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc index 83a1caf3c63a..12c9b5538b44 100644 --- a/src/te/schedule/bound.cc +++ b/src/te/schedule/bound.cc @@ -89,16 +89,16 @@ StorageScope InferStorageScope(const Stage& stage, const GraphContext& ctx) { void InferRootBound(const Stage& stage, const GraphContext& ctx, std::unordered_map* rmap) { - CHECK_NE(stage->attach_type, kInline) << "call schedule.normalize before scheduleops"; + ICHECK_NE(stage->attach_type, kInline) << "call schedule.normalize before scheduleops"; if (stage->attach_type == kInlinedAlready) return; if (stage->is_output) { // verify correctness. - CHECK_EQ(stage.GetAttachSpec()->attach_type, kGroupRoot) << "Output must be attached at root"; + ICHECK_EQ(stage.GetAttachSpec()->attach_type, kGroupRoot) << "Output must be attached at root"; } if (stage->is_output || stage->op.as()) { for (auto iv : stage->op->root_iter_vars()) { - CHECK(iv->dom.defined()); - CHECK(!rmap->count(iv)); + ICHECK(iv->dom.defined()); + ICHECK(!rmap->count(iv)); (*rmap)[iv] = iv->dom; } return; @@ -132,7 +132,7 @@ void InferRootBound(const Stage& stage, const GraphContext& ctx, Map relax_set; std::unordered_map up_state; bool found_attach = false; - CHECK(ctx.op2stage_.count(op.get())); + ICHECK(ctx.op2stage_.count(op.get())); const Stage& op_stage = ctx.op2stage_.at(op.get()); // Consumer nest for (size_t i = op_stage->leaf_iter_vars.size(); i != 0; --i) { @@ -141,13 +141,13 @@ void InferRootBound(const Stage& stage, const GraphContext& ctx, found_attach = true; } auto it = rmap->find(iv); - CHECK(it != rmap->end()); + ICHECK(it != rmap->end()); const Range& vrange = it->second; if (is_one(vrange->extent)) { up_state[iv] = IntSet::SinglePoint(vrange->min); } else if (!NeedRelax(iv, found_attach, ctx.bind_map, scope)) { - CHECK(is_zero(vrange->min)) << "InferBound requires every leaf iter var's min equals 0, " - << " call schedule.normalize to achieve this. "; + ICHECK(is_zero(vrange->min)) << "InferBound requires every leaf iter var's min equals 0, " + << " call schedule.normalize to achieve this. 
"; if (ctx.bind_map.count(iv)) { up_state[iv] = IntSet::SinglePoint(ctx.bind_map.at(iv)->var); } else { @@ -163,8 +163,8 @@ void InferRootBound(const Stage& stage, const GraphContext& ctx, found_attach = true; } Range vrange = rmap->at(iv); - CHECK(is_zero(vrange->min)) << "InferBound requires every leaf iter var's min equals 0, " - << "call schedule.normalize to achieve this."; + ICHECK(is_zero(vrange->min)) << "InferBound requires every leaf iter var's min equals 0, " + << "call schedule.normalize to achieve this."; if (NeedRelax(iv, found_attach, ctx.bind_map, scope)) { relax_set.Set(iv->var, IntSet::FromRange(vrange)); if (ctx.bind_map.count(iv)) { @@ -172,7 +172,7 @@ void InferRootBound(const Stage& stage, const GraphContext& ctx, } } } - CHECK(found_attach || stage_attach.size() == 0) + ICHECK(found_attach || stage_attach.size() == 0) << "Invalid Schedule, cannot find the producer " << stage->op << " along the loop nest specified by compute_at of consumer " << op; // Get the domain of the consumer @@ -218,7 +218,7 @@ Map InferBound(const Schedule& sch) { for (Stage stage : sch->stages) { for (auto kv : stage->iter_var_attrs) { if (kv.second->bind_thread.defined()) { - CHECK(!ctx.bind_map.count(kv.first)); + ICHECK(!ctx.bind_map.count(kv.first)); ctx.bind_map[kv.first] = kv.second->bind_thread; } } @@ -242,7 +242,7 @@ Map InferBound(const Schedule& sch) { // pass down to get bound of all iter vars. PassDownDomain(stage, &ret, &analyzer); for (IterVar iv : stage->env_threads) { - CHECK(iv->dom.defined()); + ICHECK(iv->dom.defined()); ret[iv] = iv->dom; } } diff --git a/src/te/schedule/graph.cc b/src/te/schedule/graph.cc index 09e899581d14..502753284da6 100644 --- a/src/te/schedule/graph.cc +++ b/src/te/schedule/graph.cc @@ -174,7 +174,7 @@ AttachPath CreateAttachPath(Schedule sch) { std::unordered_set visited; Array path; for (Stage s = stage; s.defined();) { - CHECK(!visited.count(s.get())) << "Find loop in compute_at attach group"; + ICHECK(!visited.count(s.get())) << "Find loop in compute_at attach group"; visited.insert(s.get()); Stage spec = s.GetAttachSpec(); bool start_attach; @@ -183,14 +183,14 @@ AttachPath CreateAttachPath(Schedule sch) { attach_ivar = spec->attach_ivar; s = spec->attach_stage; start_attach = false; - CHECK(attach_ivar.defined()); + ICHECK(attach_ivar.defined()); } else if (spec->attach_type == kScanUpdate) { s = spec->attach_stage; start_attach = true; } else { break; } - CHECK(s.defined()); + ICHECK(s.defined()); for (size_t i = s->leaf_iter_vars.size(); i != 0; --i) { IterVar iv = s->leaf_iter_vars[i - 1]; if (!start_attach && iv.same_as(attach_ivar)) { @@ -198,8 +198,8 @@ AttachPath CreateAttachPath(Schedule sch) { } if (start_attach) path.push_back(iv); } - CHECK(start_attach) << "Invalid Schedule: cannot find attach point " << attach_ivar - << " in the schedule of " << s->op; + ICHECK(start_attach) << "Invalid Schedule: cannot find attach point " << attach_ivar + << " in the schedule of " << s->op; } if (!ret.count(stage->op)) { ret.Set(stage->op, path); diff --git a/src/te/schedule/message_passing.cc b/src/te/schedule/message_passing.cc index 0a82673aa4b8..d45f29ebc5b6 100644 --- a/src/te/schedule/message_passing.cc +++ b/src/te/schedule/message_passing.cc @@ -40,9 +40,9 @@ void Update(std::unordered_map* p_state, const IterVar& iv, Rang } else { bool match = is_zero(it->second->min) && analyzer->CanProve(r->extent - it->second->extent == 0); - CHECK(match) << iv << " domain already inferred," - << " cannot prove their extents are the same " << 
it->second->extent << " vs " - << r->extent; + ICHECK(match) << iv << " domain already inferred," + << " cannot prove their extents are the same " << it->second->extent << " vs " + << r->extent; } } @@ -109,10 +109,10 @@ void PassDownDomain(const Stage& stage, std::unordered_map* p_st for (IterVarRelation rel : stage->relations) { if (const SplitNode* r = rel.as()) { if (!state.count(r->parent)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } - CHECK(!state.count(r->inner)); + ICHECK(!state.count(r->inner)); const Range& range_parent = state.at(r->parent); // Tighten iv's extent to min(parent_extent, factor_or_nparts), only if all of the // following conditions are met: @@ -143,7 +143,7 @@ void PassDownDomain(const Stage& stage, std::unordered_map* p_st } } else if (const FuseNode* r = rel.as()) { if (!state.count(r->outer) || !state.count(r->inner)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } const Range& range_outer = state.at(r->outer); @@ -151,7 +151,7 @@ void PassDownDomain(const Stage& stage, std::unordered_map* p_st state[r->fused] = Range::FromMinExtent(0, range_outer->extent * range_inner->extent); } else if (const RebaseNode* r = rel.as()) { if (!state.count(r->parent)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } Update(p_state, r->rebased, Range::FromMinExtent(0, state.at(r->parent)->extent), actx); @@ -164,7 +164,7 @@ void PassDownDomain(const Stage& stage, std::unordered_map* p_st // update the extents of binded threads. for (auto kv : stage->iter_var_attrs) { if (kv.second->bind_thread.defined()) { - CHECK(state.count(kv.first)); + ICHECK(state.count(kv.first)); Update(p_state, kv.second->bind_thread, state.at(kv.first), actx); } } @@ -177,7 +177,7 @@ void PassUpIndex(const Stage& stage, const Map& dom_map, IterVarRelation rel = stage->relations[i - 1]; if (const SplitNode* s = rel.as()) { if (!state.count(s->outer) || !state.count(s->inner)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } PrimExpr outer = state.at(s->outer); @@ -191,7 +191,7 @@ void PassUpIndex(const Stage& stage, const Map& dom_map, } } else if (const FuseNode* s = rel.as()) { if (!state.count(s->fused)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } PrimExpr value = state.at(s->fused); @@ -213,7 +213,7 @@ void PassUpIndex(const Stage& stage, const Map& dom_map, state[s->inner] = cast(s->inner->var.dtype(), state[s->inner]); } else if (const RebaseNode* s = rel.as()) { if (!state.count(s->rebased)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } PrimExpr value = state.at(s->rebased); @@ -237,18 +237,18 @@ void PassDownIndex(const Stage& stage, const Map& dom_map, for (IterVarRelation rel : stage->relations) { if (const SplitNode* s = rel.as()) { if (!state.count(s->parent)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } Range r = dom_map.at(s->inner); - CHECK(is_zero(r->min)); + ICHECK(is_zero(r->min)); PrimExpr parent = state.at(s->parent); PrimExpr factor = r->extent; state[s->outer] = indexdiv(parent, factor); state[s->inner] = indexmod(parent, factor); } else if (const FuseNode* s = rel.as()) { if (!state.count(s->inner) && !state.count(s->outer)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } PrimExpr factor = dom_map.at(s->inner)->extent; @@ -256,17 +256,17 @@ void PassDownIndex(const Stage& stage, const Map& dom_map, PrimExpr inner_min = dom_map.at(s->inner)->min; PrimExpr inner = state.at(s->inner); PrimExpr outer = state.at(s->outer); - CHECK(is_zero(outer_min)); - 
CHECK(is_zero(inner_min)); + ICHECK(is_zero(outer_min)); + ICHECK(is_zero(inner_min)); state[s->fused] = outer * factor + inner; } else if (const RebaseNode* s = rel.as()) { if (!state.count(s->rebased)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } PrimExpr value = state.at(s->parent); PrimExpr parent_min = dom_map.at(s->parent)->min; - CHECK(is_zero(parent_min)); + ICHECK(is_zero(parent_min)); state[s->rebased] = value; } else if (const SingletonNode* s = rel.as()) { state[s->iter] = make_zero(s->iter->var.dtype()); @@ -286,18 +286,18 @@ void PassUpDomain(const SplitNode* s, const std::unordered_map& } PrimExpr factor = dom_map.at(s->inner)->extent; PrimExpr parent_min = dom_map.at(s->parent)->min; - CHECK(outer.defined()); - CHECK(inner.defined()); - CHECK(factor.defined()); + ICHECK(outer.defined()); + ICHECK(inner.defined()); + ICHECK(factor.defined()); *parent = arith::EvalSet(s->outer->var * factor + s->inner->var + parent_min, {{s->outer, outer}, {s->inner, inner}}); } void PassUpDomain(const FuseNode* s, const std::unordered_map& dom_map, const IntSet& fused, IntSet* outer, IntSet* inner) { - CHECK(dom_map.count(s->outer)); - CHECK(dom_map.count(s->inner)); - CHECK(dom_map.count(s->fused)); + ICHECK(dom_map.count(s->outer)); + ICHECK(dom_map.count(s->inner)); + ICHECK(dom_map.count(s->fused)); arith::Analyzer ana; if (fused.MatchRange(dom_map.at(s->fused))) { @@ -342,7 +342,7 @@ void PassUpDomain(const FuseNode* s, const std::unordered_map& d void PassUpDomain(const RebaseNode* s, const std::unordered_map& dom_map, const IntSet& rebased, IntSet* parent) { - CHECK(dom_map.count(s->parent)); + ICHECK(dom_map.count(s->parent)); if (rebased.MatchRange(dom_map.at(s->rebased))) { *parent = IntSet::FromRange(dom_map.at(s->parent)); return; @@ -384,7 +384,7 @@ void PassUpBitMaskOr(const Stage& stage, std::unordered_map* p_sta IterVarRelation rel = stage->relations[i - 1]; if (const SplitNode* s = rel.as()) { if (!state.count(s->inner) && !state.count(s->outer)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } int res = 0; @@ -394,7 +394,7 @@ void PassUpBitMaskOr(const Stage& stage, std::unordered_map* p_sta state[s->parent] = res; } else if (const FuseNode* s = rel.as()) { if (!state.count(s->fused)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } if (!state.count(s->outer)) { @@ -409,7 +409,7 @@ void PassUpBitMaskOr(const Stage& stage, std::unordered_map* p_sta } } else if (const RebaseNode* s = rel.as()) { if (!state.count(s->rebased)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } if (!state.count(s->parent)) { @@ -430,7 +430,7 @@ void PassDownBitMaskOr(const Stage& stage, std::unordered_map* p_s for (IterVarRelation rel : stage->relations) { if (const SplitNode* s = rel.as()) { if (!state.count(s->parent)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } if (!state.count(s->outer)) { @@ -445,7 +445,7 @@ void PassDownBitMaskOr(const Stage& stage, std::unordered_map* p_s } } else if (const FuseNode* s = rel.as()) { if (!state.count(s->outer) && !state.count(s->inner)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } int res = 0; @@ -455,7 +455,7 @@ void PassDownBitMaskOr(const Stage& stage, std::unordered_map* p_s state[s->fused] = res; } else if (const RebaseNode* s = rel.as()) { if (!state.count(s->parent)) { - CHECK(allow_missing); + ICHECK(allow_missing); continue; } if (!state.count(s->rebased)) { @@ -561,7 +561,7 @@ std::vector MakeBoundCheck(const Stage& stage, const Mapop->root_iter_vars()) 
{ if (skip_iter.count(iv) || iv->iter_type == kOpaque) continue; Range dom = dom_map.at(iv); - CHECK(iv->dom.defined()); + ICHECK(iv->dom.defined()); if (!skip_ivar_domain && !IsRangeSame(iv->dom, dom)) { PrimExpr value = value_map.at(iv) - iv->dom->min; IntSet s = analyzer.int_set(value, iset_dmap); diff --git a/src/te/schedule/operation_inline.cc b/src/te/schedule/operation_inline.cc index 01d93c5ec8bd..8eed6e3f10fc 100644 --- a/src/te/schedule/operation_inline.cc +++ b/src/te/schedule/operation_inline.cc @@ -48,9 +48,9 @@ class OperationInliner final : public StmtExprMutator { auto tensor = Downcast(op->producer); if (tensor->op.same_as(operation_)) { - CHECK_EQ(tensor->value_index, 0); + ICHECK_EQ(tensor->value_index, 0); expr = body_; - CHECK_EQ(args_.size(), op->indices.size()); + ICHECK_EQ(args_.size(), op->indices.size()); bool has_side_effect = false; for (size_t i = 0; i < op->indices.size(); ++i) { @@ -81,7 +81,7 @@ class OperationInliner final : public StmtExprMutator { }; Stmt Inline(Stmt stmt, Operation f, Array args, PrimExpr body) { - CHECK_EQ(f->num_outputs(), 1) << "can only inline output single value operation"; + ICHECK_EQ(f->num_outputs(), 1) << "can only inline output single value operation"; Stmt ret = OperationInliner(f, args, body)(std::move(stmt)); if (ret.same_as(stmt)) return ret; return ConvertSSA(ret); diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc index 941817a5d954..6aac3b769a47 100644 --- a/src/te/schedule/schedule_dataflow_rewrite.cc +++ b/src/te/schedule/schedule_dataflow_rewrite.cc @@ -163,7 +163,7 @@ Tensor Schedule::cache_read(const Tensor& tensor, const std::string& scope, for (Operation op : readers) { Stage s = operator[](op); Operation repl_op = s->op->ReplaceInputs(s->op, vsub); - CHECK(!repl_op.same_as(s->op)) << "Cannot find " << tensor << " in the inputs of " << s->op; + ICHECK(!repl_op.same_as(s->op)) << "Cannot find " << tensor << " in the inputs of " << s->op; vmap[s->op.output(0)] = repl_op.output(0); rvmap[repl_op.output(0)] = s->op.output(0); s->op = repl_op; @@ -174,7 +174,7 @@ Tensor Schedule::cache_read(const Tensor& tensor, const std::string& scope, size_t pos = FindNodeRef(stages.GetArrayNode(), op_stage); Stage cache_stage = Stage(cache->op); cache_stage.set_scope(scope); - CHECK_LT(pos, stages.size()); + ICHECK_LT(pos, stages.size()); stages.insert(stages.begin() + pos + 1, cache_stage); (*this)->stage_map.Set(cache->op, cache_stage); // Update group @@ -212,7 +212,7 @@ void PrepareAxisMapping(Stage orig_stage, OpType* op, std::unordered_set value_map; for (IterVar iv : orig_stage->leaf_iter_vars) { if (red_axis.count(iv)) continue; - CHECK_EQ(iv->iter_type, kDataPar) << "Can only relayout with in data parallel dimensions"; + ICHECK_EQ(iv->iter_type, kDataPar) << "Can only relayout with in data parallel dimensions"; Range dom = dom_map.at(iv); IterVar new_iv = IterVar(dom, iv->var.copy_with_suffix(".c"), iv->iter_type); new_axis.push_back(new_iv); @@ -266,7 +266,7 @@ Array ReplaceOriginalOp(Schedule sch, Stage orig_stage, const std::strin size_t pos = FindNodeRef(stages.GetArrayNode(), orig_stage); Stage cache_stage = Stage(cache_op); cache_stage.set_scope(scope); - CHECK_LT(pos, stages.size()); + ICHECK_LT(pos, stages.size()); stages.insert(stages.begin() + pos, cache_stage); sch->stage_map.Set(cache_op, cache_stage); // Update group @@ -309,14 +309,14 @@ Array CacheWriteWithReLayout(Schedule sch, const Array& tensor_a if (body->IsInstance()) { const tir::ReduceNode* 
reduce_body = body.as(); if (first_reduce != nullptr) { - CHECK(ReduceEqual(reduce_body, first_reduce)); + ICHECK(ReduceEqual(reduce_body, first_reduce)); body = tir::Reduce(first_reduce->combiner, first_reduce->source, first_reduce->axis, first_reduce->condition, reduce_body->value_index, reduce_body->init); } else { first_reduce = reduce_body; } } else { - CHECK(first_reduce == nullptr) << "cannot mix reduce and other node in ONE compute bodys"; + ICHECK(first_reduce == nullptr) << "cannot mix reduce and other node in ONE compute bodys"; } body_list.push_back(body); } @@ -355,7 +355,7 @@ Array CacheWriteWithReLayoutTensor(Schedule sch, const Array& te Tensor tensor = tensor_array[0]; Stage orig_stage = sch[tensor->op]; const TensorComputeOpNode* tensor_op = orig_stage->op.as(); - CHECK_EQ(tensor_op->num_outputs(), 1) + ICHECK_EQ(tensor_op->num_outputs(), 1) << "cache write only support single output tensor_compute_op"; std::unordered_set red_axis; @@ -435,15 +435,15 @@ Array CacheWriteWithReLayoutTensor(Schedule sch, const Array& te Array Schedule::cache_write(const Array& tensor_array, const std::string& scope) { (*this)->InvalidateCache(); - CHECK(tensor_array.size() > 0) << "size of tensor_array must be greater than 0"; + ICHECK(tensor_array.size() > 0) << "size of tensor_array must be greater than 0"; Tensor tensor = tensor_array[0]; Stage orig_stage = operator[](tensor->op); const ComputeOpNode* compute = tensor->op.as(); - CHECK(static_cast(compute->num_outputs()) == tensor_array.size()) + ICHECK(static_cast(compute->num_outputs()) == tensor_array.size()) << "size of input tensor list must be same as number of stage outputs"; for (size_t i = 1; i < tensor_array.size(); i++) { Stage tmp_stage = operator[](tensor_array[i]->op); - CHECK(orig_stage.same_as(tmp_stage)) << "Input tensor list must be generated by ONE computeOp"; + ICHECK(orig_stage.same_as(tmp_stage)) << "Input tensor list must be generated by ONE computeOp"; } return CacheWriteWithReLayout(*this, tensor_array, scope); } @@ -519,11 +519,11 @@ void InjectInline(ScheduleNode* sch) { { // setup args const ComputeOpNode* compute = stage->op.as(); - CHECK(compute) << "can only inline compute op"; + ICHECK(compute) << "can only inline compute op"; for (auto iv : compute->axis) { args.push_back(iv->var); } - CHECK_EQ(compute->body.size(), 1U) << "can only inline compute op with 1 output"; + ICHECK_EQ(compute->body.size(), 1U) << "can only inline compute op with 1 output"; body = compute->body[0]; } for (size_t j = i; j < sch->stages.size(); ++j) { @@ -539,9 +539,9 @@ void InjectInline(ScheduleNode* sch) { const tir::ReduceNode* reduce = new_body[j][0].as(); for (size_t k = 1; k < new_body[j].size(); ++k) { const tir::ReduceNode* reduce_ = new_body[j][k].as(); - CHECK(reduce_); - CHECK(ReduceEqual(reduce_, reduce)) << "The Reduce inputs of ComputeOp should " - << "have the same attribute except value_index"; + ICHECK(reduce_); + ICHECK(ReduceEqual(reduce_, reduce)) << "The Reduce inputs of ComputeOp should " + << "have the same attribute except value_index"; } PrimExpr new_value = Inline(tir::Evaluate(new_body[j][0]), stage->op, args, body) .as() @@ -549,8 +549,8 @@ void InjectInline(ScheduleNode* sch) { if (!new_value.same_as(new_body[j][0])) { changed[j] = true; const tir::ReduceNode* r = new_value.as(); - CHECK(r != nullptr); - CHECK_EQ(new_body[j].size(), r->source.size()); + ICHECK(r != nullptr); + ICHECK_EQ(new_body[j].size(), r->source.size()); for (size_t k = 0; k < new_body[j].size(); ++k) { auto n = make_object(*r); 
n->value_index = static_cast(k); @@ -590,7 +590,7 @@ void InjectInline(ScheduleNode* sch) { if (new_body[i].size()) { // Logics from ReplaceDataFlow const ComputeOpNode* compute = sch->stages[i]->op.as(); - CHECK(compute); + ICHECK(compute); Operation op = s->op; if (changed[i]) { op = ComputeOp(compute->name, compute->tag, compute->attrs, compute->axis, new_body[i]); @@ -604,7 +604,7 @@ void InjectInline(ScheduleNode* sch) { } } else if (hybrid_changed[i]) { const HybridOpNode* hybrid = sch->stages[i]->op.as(); - CHECK(hybrid); + ICHECK(hybrid); Operation op = HybridOp(hybrid->name, hybrid->tag, hybrid->attrs, hybrid->inputs, hybrid->outputs, new_hybrid_body[i]); op = op->ReplaceInputs(op, repl); @@ -647,8 +647,8 @@ void LegalizeInvalidAttach(ScheduleNode* sch) { bool start_attach = false; IterVar attach_ivar = spec->attach_ivar; s = spec->attach_stage; - CHECK(attach_ivar.defined()); - CHECK(s.defined()); + ICHECK(attach_ivar.defined()); + ICHECK(s.defined()); for (size_t i = s->leaf_iter_vars.size(); i != 0; --i) { IterVar iv = s->leaf_iter_vars[i - 1]; @@ -710,14 +710,15 @@ Schedule Schedule::normalize() { Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int factor_axis) { (*this)->InvalidateCache(); using tir::ReduceNode; - CHECK_EQ(axis->iter_type, kCommReduce) << "Can only factor reduction axis"; + ICHECK_EQ(axis->iter_type, kCommReduce) << "Can only factor reduction axis"; Stage reduce_stage = operator[](tensor->op); const ComputeOpNode* compute_op = reduce_stage->op.as(); - CHECK(compute_op) << "Can only factor ComputeOp"; + ICHECK(compute_op) << "Can only factor ComputeOp"; ArrayNode* leaf_vars = reduce_stage->leaf_iter_vars.CopyOnWrite(); { size_t axis_pos = FindNodeRef(leaf_vars, axis); - CHECK_NE(axis_pos, leaf_vars->size()) << "Cannot find IterVar " << axis << " in leaf iter vars"; + ICHECK_NE(axis_pos, leaf_vars->size()) + << "Cannot find IterVar " << axis << " in leaf iter vars"; } // Find touched reduction axis. std::unordered_map touch_map; @@ -728,7 +729,7 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f std::unordered_set skip_bound_check; // Verify normal axis are not touched. for (IterVar iv : compute_op->axis) { - CHECK(!touch_map.count(iv)) << "Factor axis touches normal axis."; + ICHECK(!touch_map.count(iv)) << "Factor axis touches normal axis."; skip_bound_check.insert(iv); } // get analyzer. @@ -762,14 +763,14 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f // Get the factored op node. const int factor_axis_pos = factor_axis >= 0 ? factor_axis : static_cast(compute_op->axis.size() + 1) + factor_axis; - CHECK_LE(factor_axis_pos, compute_op->axis.size()); + ICHECK_LE(factor_axis_pos, compute_op->axis.size()); auto n = make_object(); n->name = compute_op->name + ".rf"; { // axis relacement. auto iv_node = make_object(); iv_node->dom = dom_map.at(axis); - CHECK(is_zero(iv_node->dom->min)) << "Can only factor reduction domain starting from 0"; + ICHECK(is_zero(iv_node->dom->min)) << "Can only factor reduction domain starting from 0"; iv_node->var = axis->var; iv_node->iter_type = kDataPar; @@ -787,7 +788,7 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f // predicate generation, copy not touched axis. 
int idx = tensor->value_index; const ReduceNode* reduce = compute_op->body[idx].as(); - CHECK(reduce) << "Can only rfactor non-inline reductions"; + ICHECK(reduce) << "Can only rfactor non-inline reductions"; predicates.push_back(reduce->condition); auto fand = [](PrimExpr a, PrimExpr b) { return a && b; }; @@ -799,7 +800,7 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f if (!touch_map.count(iv)) { n->reduce_axis.push_back(iv); } else { - CHECK(value_map.count(iv)); + ICHECK(value_map.count(iv)); PrimExpr index = value_map.at(iv); vsub[iv->var.get()] = index; } @@ -808,7 +809,7 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f // Copy touched axis. for (IterVar iv : reduce_stage->leaf_iter_vars) { if (touch_map.count(iv) && !iv.same_as(axis)) { - CHECK_EQ(iv->iter_type, kCommReduce); + ICHECK_EQ(iv->iter_type, kCommReduce); auto ncpy = make_object(*iv.operator->()); ncpy->dom = dom_map.at(iv); n->reduce_axis.push_back(IterVar(ncpy)); @@ -848,7 +849,7 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f size_t stage_pos = FindNodeRef(stages.GetArrayNode(), reduce_stage); Stage factor_stage = Stage(factor_op); factor_stage->relations = rels; - CHECK_LT(stage_pos, stages.size()); + ICHECK_LT(stage_pos, stages.size()); stages.insert(stages.begin() + stage_pos, factor_stage); (*this)->stage_map.Set(factor_op, factor_stage); factor_stage->group = reduce_stage->group; @@ -880,7 +881,7 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f std::unordered_map init_vsub; for (const auto& init : reduce->init) { if (init->IsInstance()) { - CHECK_EQ(compute_op->axis.size(), idx_size) + ICHECK_EQ(compute_op->axis.size(), idx_size) << "'init' should have the number of dimensions as output when using with " "rfactor"; for (int idx = 0; idx < idx_size; idx++) { diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc index a8257c07a473..8964c1013a53 100644 --- a/src/te/schedule/schedule_lang.cc +++ b/src/te/schedule/schedule_lang.cc @@ -58,8 +58,8 @@ size_t FindLeafVar(ArrayNode* all_vars, ArrayNode* leaf_vars, const IterVar& v) DataType MatchDataType(std::vector dtypes) { int max_bits = -1; for (const auto& dtype : dtypes) { - CHECK(dtype.is_int()); - CHECK(dtype.is_scalar()); + ICHECK(dtype.is_int()); + ICHECK(dtype.is_scalar()); max_bits = std::max(max_bits, dtype.bits()); } return DataType::Int(max_bits); @@ -68,8 +68,8 @@ DataType MatchDataType(std::vector dtypes) { void SplitHelper(StageNode* self, IterVar parent, PrimExpr factor, PrimExpr nparts, IterVar* p_outer, IterVar* p_inner) { // Check if split is valid. - CHECK(parent->iter_type == kDataPar || parent->iter_type == kCommReduce || - parent->iter_type == kOrdered) + ICHECK(parent->iter_type == kDataPar || parent->iter_type == kCommReduce || + parent->iter_type == kOrdered) << "Cannot split on " << IterVarType2String(parent->iter_type); IterVar outer = IterVar(Range(), parent->var.copy_with_suffix(".outer"), parent->iter_type); IterVar inner = IterVar(Range(), parent->var.copy_with_suffix(".inner"), parent->iter_type); @@ -127,7 +127,7 @@ Stage& Stage::set_scope(std::string scope) { // NOLINT(*) } Stage& Stage::compute_at(Stage parent, IterVar scope) { // NOLINT(*) - CHECK_NE((*this)->attach_type, kScanUpdate) << "Cannot specify compute_at for scan updates"; + ICHECK_NE((*this)->attach_type, kScanUpdate) << "Cannot specify compute_at for scan updates"; // Group constraint checking. 
Stage group = (*this)->group; if (group.defined()) { @@ -135,7 +135,7 @@ Stage& Stage::compute_at(Stage parent, IterVar scope) { // NOLINT(*) while (pg.defined() && !pg.same_as(group)) { pg = pg->group; } - CHECK(pg.same_as(group)) << "Can only assign compute_at to stages within the same group"; + ICHECK(pg.same_as(group)) << "Can only assign compute_at to stages within the same group"; } (*this)->attach_type = kScope; @@ -148,28 +148,28 @@ Stage& Stage::compute_at(Stage parent, IterVar scope) { // NOLINT(*) break; } } - CHECK(found) << "Cannot find the axis " << scope << " in parent's leaf_iter_vars" - << " parent=" << parent; + ICHECK(found) << "Cannot find the axis " << scope << " in parent's leaf_iter_vars" + << " parent=" << parent; return *this; } Stage& Stage::compute_inline() { // NOLINT(*) - CHECK_NE((*this)->attach_type, kScanUpdate) << "Cannot specify compute_at for scan updates"; + ICHECK_NE((*this)->attach_type, kScanUpdate) << "Cannot specify compute_at for scan updates"; (*this)->attach_type = kInline; return *this; } Stage& Stage::compute_root() { // NOLINT(*) - CHECK_NE((*this)->attach_type, kScanUpdate) << "Cannot specify compute_at for scan updates"; + ICHECK_NE((*this)->attach_type, kScanUpdate) << "Cannot specify compute_at for scan updates"; (*this)->attach_type = kGroupRoot; return *this; } Stage& Stage::bind(IterVar ivar, IterVar thread_ivar) { // NOLINT(*) StageNode* self = operator->(); - CHECK(ivar->iter_type == kDataPar || ivar->iter_type == kCommReduce) + ICHECK(ivar->iter_type == kDataPar || ivar->iter_type == kCommReduce) << "Cannot bind " << IterVarType2String(ivar->iter_type) << " to thread"; - CHECK(thread_ivar->iter_type == kThreadIndex) + ICHECK(thread_ivar->iter_type == kThreadIndex) << "Cannot rebase by " << IterVarType2String(ivar->iter_type) << ", only thread axis is allowed so far"; ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite(); @@ -193,9 +193,9 @@ Stage& Stage::bind(IterVar ivar, IterVar thread_ivar) { // NOLINT(*) Stage& Stage::env_threads(Array threads) { StageNode* self = operator->(); - CHECK(self->op.defined() && self->op.as()) + ICHECK(self->op.defined() && self->op.as()) << "env_threads is only valid for composite ops such as ScanOp"; - CHECK_EQ(self->env_threads.size(), 0U) << "Already set env_threads"; + ICHECK_EQ(self->env_threads.size(), 0U) << "Already set env_threads"; Array& leaf_vars = self->leaf_iter_vars; Array& all_vars = self->all_iter_vars; std::vector temp; @@ -228,11 +228,11 @@ Stage& Stage::split_by_nparts(IterVar parent, PrimExpr nparts, IterVar* p_outer, Stage& Stage::fuse(IterVar outer, IterVar inner, IterVar* p_target) { // NOLINT(*) StageNode* self = operator->(); - CHECK(outer->iter_type == kDataPar || outer->iter_type == kCommReduce || - outer->iter_type == kOrdered) + ICHECK(outer->iter_type == kDataPar || outer->iter_type == kCommReduce || + outer->iter_type == kOrdered) << "Cannot fuse " << IterVarType2String(outer->iter_type); - CHECK(inner->iter_type == kDataPar || inner->iter_type == kCommReduce || - inner->iter_type == kOrdered) + ICHECK(inner->iter_type == kDataPar || inner->iter_type == kCommReduce || + inner->iter_type == kOrdered) << "Cannot fuse " << IterVarType2String(inner->iter_type); IterVarType iter_type = outer->iter_type; @@ -251,7 +251,7 @@ Stage& Stage::fuse(IterVar outer, IterVar inner, IterVar* p_target) { // NOLINT std::swap(outer, inner); std::swap(pos_inner, pos_outer); } - CHECK_EQ(pos_inner, pos_outer + 1) + ICHECK_EQ(pos_inner, pos_outer + 1) << "Can only fuse iterations that are 
consecutive between each other"; self->relations.push_back(Fuse(outer, inner, fused)); all_vars.push_back(fused); @@ -288,11 +288,11 @@ Stage& Stage::reorder(const Array& order) { // NOLINT(*) std::unordered_set seen_var; StageNode* self = operator->(); for (IterVar iv : order) { - CHECK(iv->iter_type == kDataPar || iv->iter_type == kCommReduce || - iv->iter_type == kThreadIndex) + ICHECK(iv->iter_type == kDataPar || iv->iter_type == kCommReduce || + iv->iter_type == kThreadIndex) << "Cannot reorder IterVar(" << IterVarType2String(iv->iter_type) << ")"; - CHECK_EQ(seen_var.count(iv), 0) << "Same axis can not appear more than once " << iv; + ICHECK_EQ(seen_var.count(iv), 0) << "Same axis can not appear more than once " << iv; seen_var.insert(iv); } ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite(); @@ -345,9 +345,9 @@ inline void SetAttrIterType(StageNode* self, IterVar var, IterVarType iter_type) } Stage& Stage::vectorize(IterVar var) { // NOLINT(*) - CHECK(var->iter_type == kDataPar || var->iter_type == kOpaque || var->iter_type == kUnrolled || - var->iter_type == kVectorized || var->iter_type == kTensorized || - var->iter_type == kParallelized) + ICHECK(var->iter_type == kDataPar || var->iter_type == kOpaque || var->iter_type == kUnrolled || + var->iter_type == kVectorized || var->iter_type == kTensorized || + var->iter_type == kParallelized) << "Cannot vectorize on " << IterVarType2String(var->iter_type); SetAttrIterType(operator->(), var, kVectorized); return *this; @@ -418,7 +418,7 @@ Stage& Stage::storage_align(IterVar axis, int factor, int offset) { Stage& Stage::double_buffer() { StageNode* self = operator->(); - CHECK(!self->is_output) << "Cannot apply double buffer on output"; + ICHECK(!self->is_output) << "Cannot apply double buffer on output"; self->double_buffer = true; return *this; } @@ -451,23 +451,23 @@ Schedule Schedule::copy() const { } for (Stage s : n->stages) { if (s->attach_stage.defined()) { - CHECK(smap.find(s->attach_stage) != smap.end()) + ICHECK(smap.find(s->attach_stage) != smap.end()) << s->attach_stage << " not found in " << (*this); s->attach_stage = smap.at(s->attach_stage); } if (s->group.defined()) { - CHECK(smap.find(s->group) != smap.end()) << s->group << " not found in " << (*this); + ICHECK(smap.find(s->group) != smap.end()) << s->group << " not found in " << (*this); s->group = smap.at(s->group); } } for (Stage s : n->groups) { if (s->attach_stage.defined()) { - CHECK(smap.find(s->attach_stage) != smap.end()) + ICHECK(smap.find(s->attach_stage) != smap.end()) << s->attach_stage << " not found in " << (*this); s->attach_stage = smap.at(s->attach_stage); } if (s->group.defined()) { - CHECK(smap.find(s->group) != smap.end()) << s->group << " not found in " << (*this); + ICHECK(smap.find(s->group) != smap.end()) << s->group << " not found in " << (*this); s->group = smap.at(s->group); } } @@ -476,7 +476,7 @@ Schedule Schedule::copy() const { Stage Schedule::operator[](const Operation& op) { auto it = (*this)->stage_map.find(op); - CHECK(it != (*this)->stage_map.end()) + ICHECK(it != (*this)->stage_map.end()) << "Cannot find Stage for operator " << op << " in the schedule"; return (*it).second; } @@ -504,7 +504,7 @@ Array RemapTensor(ScheduleNode* self, const Array& arr) { Array ret; for (Tensor t : arr) { if (!op2stage_cache.count(t->op.get())) { - CHECK(self->stage_map.count(t->op)) << "Given tensor is not in the schedule plan"; + ICHECK(self->stage_map.count(t->op)) << "Given tensor is not in the schedule plan"; t = 
self->stage_map[t->op]->op.output(t->value_index); } ret.push_back(t); @@ -534,7 +534,7 @@ Stage Schedule::create_group(const Array& outputs, const Array& for (size_t i = 0; i < ops.size(); ++i) { Operation op = ops[i]; auto it = op2stage_cache.find(op.get()); - CHECK(it != op2stage_cache.end()); + ICHECK(it != op2stage_cache.end()); Stage op_group = it->second->group; if (i == 0) { parent_group = op_group; @@ -575,7 +575,7 @@ Stage Schedule::create_group(const Array& outputs, const Array& // Verification and remappig the subgroups. for (auto& kv : counter) { if (kv.first.same_as(parent_group)) continue; - CHECK_EQ(kv.first->num_child_stages, kv.second.count) + ICHECK_EQ(kv.first->num_child_stages, kv.second.count) << "Trying to group region that intersect with an already existed group"; if (kv.first->group.same_as(parent_group)) { Stage s = kv.first; @@ -589,7 +589,7 @@ Stage Schedule::create_group(const Array& outputs, const Array& // Remap the group of op stages. for (Operation op : ops) { auto it = op2stage_cache.find(op.get()); - CHECK(it != op2stage_cache.end()); + ICHECK(it != op2stage_cache.end()); Stage s = it->second; if (s->group.same_as(parent_group)) { s->group = gstage; @@ -602,7 +602,7 @@ Stage Schedule::create_group(const Array& outputs, const Array& // Correct the attach to keep everything in group. for (Operation op : ops) { auto it = op2stage_cache.find(op.get()); - CHECK(it != op2stage_cache.end()); + ICHECK(it != op2stage_cache.end()); Stage s = it->second; if (s->attach_type == kScope) { Stage cg = LeastCommonAncestor(s->attach_stage->group, gstage); @@ -628,7 +628,7 @@ void ScheduleNode::InitCache() { op2stage_cache_[s->op.get()] = s; } } - CHECK_EQ(op2stage_cache_.size(), stages.size()); + ICHECK_EQ(op2stage_cache_.size(), stages.size()); } bool ScheduleNode::Contain(const Operation& op) const { @@ -667,7 +667,7 @@ Schedule::Schedule(Array ops) { for (size_t i = 0; i < scan->update.size(); ++i) { Stage s = n->stage_map[scan->update[i]->op]; - CHECK(scan_group.same_as(s->group)); + ICHECK(scan_group.same_as(s->group)); } } } @@ -726,8 +726,8 @@ void SpecializedCondition::EnterWithScope() { void SpecializedCondition::ExitWithScope() { TVMSpecializationThreadLocalEntry* entry = TVMSpecializationThreadLocalStore::Get(); - CHECK(!entry->condition_stack.empty()); - CHECK(entry->condition_stack.top().same_as(*this)); + ICHECK(!entry->condition_stack.empty()); + ICHECK(entry->condition_stack.top().same_as(*this)); entry->condition_stack.pop(); } diff --git a/src/te/schedule/schedule_ops.cc b/src/te/schedule/schedule_ops.cc index a16d9bb73000..355e3c39494b 100644 --- a/src/te/schedule/schedule_ops.cc +++ b/src/te/schedule/schedule_ops.cc @@ -69,13 +69,13 @@ class InjectAttach : public StmtMutator { debug_keep_trivial_loop_(debug_keep_trivial_loop) {} Stmt VisitStmt(const Stmt& input_stmt) final { - CHECK(input_stmt.defined()); + ICHECK(input_stmt.defined()); auto stmt = StmtMutator::VisitStmt(input_stmt); const AttrStmtNode* op = stmt.as(); if (op != nullptr && op->attr_key == tir::attr::loop_scope) { if (attach_spec_->attach_type == kScope && op->node == attach_spec_->attach_ivar) { - CHECK(!found_attach) << "Find IterVar" << attach_spec_->attach_ivar - << " in multiple places in the IR"; + ICHECK(!found_attach) << "Find IterVar" << attach_spec_->attach_ivar + << " in multiple places in the IR"; found_attach = true; stmt = AttrStmt(op->node, op->attr_key, op->value, MakePipeline(stage_, dom_map_, op->body, debug_keep_trivial_loop_)); @@ -111,7 +111,7 @@ class InjectScanStep 
: public StmtMutator { debug_keep_trivial_loop_(debug_keep_trivial_loop) {} Stmt VisitStmt(const Stmt& input_stmt) final { - CHECK(input_stmt.defined()); + ICHECK(input_stmt.defined()); auto stmt = StmtMutator::VisitStmt(input_stmt); // update const AttrStmtNode* op = stmt.as(); @@ -160,14 +160,14 @@ class SchedulePostProc : public StmtExprMutator { return this->VisitStmt(op->body); } else if (op->attr_key == tir::attr::scan_update_scope) { const ScanOpNode* scan = op->node.as(); - CHECK(scan); + ICHECK(scan); var_value_[scan->scan_axis->var.get()] = op->value; return this->VisitStmt(op->body); } else if (op->attr_key == tir::attr::thread_extent) { // delete duplicated thread extent attr auto it = thread_extent_scope_.find(op->node.get()); if (it != thread_extent_scope_.end()) { - CHECK(is_zero(analyzer_.Simplify(it->second - op->value))); + ICHECK(is_zero(analyzer_.Simplify(it->second - op->value))); return this->VisitStmt(op->body); } else { thread_extent_scope_[op->node.get()] = op->value; @@ -243,7 +243,7 @@ class SchedulePostProc : public StmtExprMutator { PrimExpr VisitExpr_(const ProducerLoadNode* op) final { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - CHECK(op != nullptr); + ICHECK(op != nullptr); auto key = Downcast(op->producer); auto it = replace_buffer_.find(key); @@ -271,7 +271,7 @@ class SchedulePostProc : public StmtExprMutator { if (kv.second->bind_thread.defined()) { const Var& from = kv.first->var; const Var& to = kv.second->bind_thread->var; - CHECK(!var_value_.count(from.get())); + ICHECK(!var_value_.count(from.get())); var_value_[from.get()] = to; } } @@ -325,7 +325,8 @@ Stmt ScheduleOps(Schedule sch, Map dom_map_, bool debug_keep_tri if (!scan) continue; for (Tensor t : scan->init) { if (scan_init.count(t->op)) { - CHECK(scan_init.at(t->op).same_as(s->op)) << "Scan init tensor can only belong to one scan"; + ICHECK(scan_init.at(t->op).same_as(s->op)) + << "Scan init tensor can only belong to one scan"; } else { scan_init[t->op] = s->op; } @@ -333,44 +334,44 @@ Stmt ScheduleOps(Schedule sch, Map dom_map_, bool debug_keep_tri } // verify correctness of group. for (Stage g : sch->groups) { - CHECK(!g->op.defined()); - CHECK_EQ(g->leaf_iter_vars.size(), 0U); + ICHECK(!g->op.defined()); + ICHECK_EQ(g->leaf_iter_vars.size(), 0U); } // reverse the post DFS order. for (size_t i = sch->stages.size(); i != 0; --i) { Stage s = sch->stages[i - 1]; - CHECK_NE(s->attach_type, kInline) << "call schedule.normalize before scheduleops"; - CHECK(s->op.defined()); + ICHECK_NE(s->attach_type, kInline) << "call schedule.normalize before scheduleops"; + ICHECK(s->op.defined()); // no need to specify place holder op. if (s->op.as()) continue; // Remove grouping sugar, get the real attach spec. 
Stage attach_spec = s.GetAttachSpec(); if (scan_init.count(s->op)) { - CHECK(body.defined()); + ICHECK(body.defined()); InjectScanStep mu(s, scan_init.at(s->op), dom_map, true, debug_keep_trivial_loop); body = mu(std::move(body)); - CHECK(mu.found_attach) << "did not find attachment point for scan.init"; + ICHECK(mu.found_attach) << "did not find attachment point for scan.init"; } else if (attach_spec->attach_type == kScanUpdate) { // Handle scan update - CHECK(body.defined()); + ICHECK(body.defined()); InjectScanStep mu(s, attach_spec->attach_stage->op, dom_map, false, debug_keep_trivial_loop); body = mu(std::move(body)); - CHECK(mu.found_attach) << "did not find attachment point for scan.update"; + ICHECK(mu.found_attach) << "did not find attachment point for scan.update"; } else if (attach_spec->attach_type == kInlinedAlready) { // do nothing } else if (attach_spec->attach_type == kGroupRoot) { - CHECK(!s->group.defined()); + ICHECK(!s->group.defined()); body = MakePipeline(s, dom_map, body, debug_keep_trivial_loop); } else { - CHECK_EQ(attach_spec->attach_type, kScope); - CHECK(body.defined()); + ICHECK_EQ(attach_spec->attach_type, kScope); + ICHECK(body.defined()); InjectAttach mutator(s, attach_spec, dom_map, debug_keep_trivial_loop); body = mutator(std::move(body)); - CHECK(mutator.found_attach) << "did not find attachment point for " << s << " in " - << attach_spec->attach_stage->op << " x " - << attach_spec->attach_ivar << ", body:\n" - << body; + ICHECK(mutator.found_attach) + << "did not find attachment point for " << s << " in " << attach_spec->attach_stage->op + << " x " << attach_spec->attach_ivar << ", body:\n" + << body; } } SchedulePostProc post_proc; diff --git a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc index 7c4a3c7f6ebd..f81d72e0fe02 100644 --- a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc +++ b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc @@ -415,7 +415,7 @@ class BufferAnalyser : public StmtExprVisitor { } else if (op->attr_key == tir::attr::buffer_dim_align) { te::Tensor tensor = Downcast(op->node); const CallNode* tuple = op->value.as(); - CHECK(tuple && tuple->op.same_as(builtin::tvm_tuple())); + ICHECK(tuple && tuple->op.same_as(builtin::tvm_tuple())); auto& vinfo = dim_align_[tensor]; size_t dim = tuple->args[0].as()->value; if (dim >= vinfo.size()) { @@ -433,9 +433,9 @@ class BufferAnalyser : public StmtExprVisitor { StmtExprVisitor::VisitStmt_(op); auto key = Downcast(op->producer); auto it = buf_map_.find(key); - CHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << key->GetNameHint(); + ICHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << key->GetNameHint(); const BufferInfo& bi = it->second; - CHECK(!bi.released) << "Read a buffer that is already out of scope"; + ICHECK(!bi.released) << "Read a buffer that is already out of scope"; if (matrix_abc_.count(key->GetNameHint())) { if (bi.shape.size() < 2) { @@ -535,9 +535,9 @@ class BufferAnalyser : public StmtExprVisitor { auto tensor = Downcast(op->producer); auto it = buf_map_.find(tensor); - CHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << tensor->GetNameHint(); + ICHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << tensor->GetNameHint(); const BufferInfo& bi = it->second; - CHECK(!bi.released) << "Read a buffer that is already out of scope"; + ICHECK(!bi.released) << "Read a buffer that is already out of scope"; if 
(matrix_abc_.count(tensor->op->name)) { if (bi.shape.size() < 2) { @@ -591,7 +591,7 @@ class BufferAnalyser : public StmtExprVisitor { void VisitStmt_(const ProducerRealizeNode* op) final { auto key = Downcast(op->producer); if (buf_map_.count(key)) { - CHECK(buf_map_.at(key).external); + ICHECK(buf_map_.at(key).external); this->VisitStmt(op->body); } else { // create a buffer entry @@ -678,7 +678,7 @@ class BufferAnalyser : public StmtExprVisitor { inline Array RelIndex(Array args) const { if (bounds.size() != 0) { Array index; - CHECK_EQ(bounds.size(), args.size()); + ICHECK_EQ(bounds.size(), args.size()); for (size_t i = 0; i < bounds.size(); ++i) { index.push_back(args[i] - bounds[i]->min); } @@ -797,7 +797,7 @@ class TensorCoreIRMutator : public StmtExprMutator { for (size_t i = 0; i < op->bounds.size() - 2; ++i) { new_bounds.push_back(op->bounds[i]); } - CHECK_GE(op->bounds.size(), 2) << "Less than 2 dimensions for matrix " << key->GetNameHint(); + ICHECK_GE(op->bounds.size(), 2) << "Less than 2 dimensions for matrix " << key->GetNameHint(); new_bounds.push_back( Range::FromMinExtent(op->bounds[op->bounds.size() - 2]->min, new_extents[0])); new_bounds.push_back( @@ -818,7 +818,7 @@ class TensorCoreIRMutator : public StmtExprMutator { } auto it = matrix_abc_.find(simplify_name(node->name)); - CHECK(it != matrix_abc_.end()) << "Cannot find matrix info for " << node->name; + ICHECK(it != matrix_abc_.end()) << "Cannot find matrix info for " << node->name; auto matrix_abc = tvm::tir::StringImm("wmma." + it->second); Stmt body = this->VisitStmt(op->body); return AttrStmt(op->node, op->attr_key, matrix_abc, body); @@ -887,12 +887,12 @@ class TensorCoreIRMutator : public StmtExprMutator { } const ProducerLoadNode* value = op->value.as(); - CHECK(value != nullptr) << "Can only load fragment from a buffer"; + ICHECK(value != nullptr) << "Can only load fragment from a buffer"; auto it = strides_.find(value->producer->GetNameHint()); - CHECK(it != strides_.end()) << "Cannot find stride for " << value->producer->GetNameHint(); + ICHECK(it != strides_.end()) << "Cannot find stride for " << value->producer->GetNameHint(); auto strides = it->second; - CHECK_GE(strides.size(), 2); + ICHECK_GE(strides.size(), 2); PrimExpr stride = strides[strides.size() - 2]; // thread index unification inside a warp @@ -905,7 +905,7 @@ class TensorCoreIRMutator : public StmtExprMutator { auto pload = dst.as(); PrimExpr matrix_major; auto iter2 = matrix_major_.find(simplify_name(pload->producer->GetNameHint())); - CHECK(iter2 != matrix_major_.end()) + ICHECK(iter2 != matrix_major_.end()) << "Can not determine matrix major for " << pload->producer->GetNameHint(); if (iter2->second == "col_major") { matrix_major = StringImm("col_major"); @@ -928,9 +928,9 @@ class TensorCoreIRMutator : public StmtExprMutator { auto it3 = frag_store_.find(op); if (it3 != frag_store_.end()) { auto it = strides_.find(op->producer->GetNameHint()); - CHECK(it != strides_.end()) << "Cannot find stride for " << op->producer->GetNameHint(); + ICHECK(it != strides_.end()) << "Cannot find stride for " << op->producer->GetNameHint(); auto strides = it->second; - CHECK_GE(strides.size(), 2); + ICHECK_GE(strides.size(), 2); PrimExpr stride = strides[strides.size() - 2]; PrimExpr dst = it3->second; @@ -978,7 +978,7 @@ class TensorCoreIRMutator : public StmtExprMutator { Array get_tile_size_(const std::string& name) { auto it = matrix_abc_.find(name); auto it2 = matrix_major_.find(name); - CHECK(it != matrix_abc_.end() && it2 != matrix_major_.end()) + 
ICHECK(it != matrix_abc_.end() && it2 != matrix_major_.end()) << "Cannot find matrix info for " << name; PrimExpr size0 = make_const(DataType::Int(32), 16); PrimExpr size1 = make_const(DataType::Int(32), 16); @@ -1011,13 +1011,13 @@ class TensorCoreIRMutator : public StmtExprMutator { const std::function& call_back) { auto tensor = Downcast(pload->producer); auto it = bounds_.find(tensor); - CHECK(it != bounds_.end()); + ICHECK(it != bounds_.end()); Array min_bound; for (auto i : it->second) { min_bound.push_back(i->min); } - CHECK_GE(it->second.size(), 2); + ICHECK_GE(it->second.size(), 2); Array shape; for (size_t i = 0; i < it->second.size() - 2; ++i) { shape.push_back(it->second[i]->extent); @@ -1037,13 +1037,13 @@ class TensorCoreIRMutator : public StmtExprMutator { strides.push_back(make_const(DataType::Int(32), 1)); PrimExpr elem_offset = IntImm(DataType::Int(32), 0); - CHECK_EQ(pload->indices.size(), min_bound.size()); + ICHECK_EQ(pload->indices.size(), min_bound.size()); for (size_t i = 0; i < min_bound.size(); i++) { elem_offset = Add(elem_offset, Mul(strides[i], Sub(pload->indices[i], min_bound[i]))); } auto it2 = matrix_abc_.find(simplify_name(tensor->op->name)); - CHECK(it2 != matrix_abc_.end()) << "Cannot find matrix info for " << tensor->op->name; + ICHECK(it2 != matrix_abc_.end()) << "Cannot find matrix info for " << tensor->op->name; buffer_node->data = Var(tensor->op->name, DataType::Handle()); buffer_node->name = tensor->op->name; buffer_node->scope = "wmma." + it2->second; diff --git a/src/te/schedule/schedule_postproc_to_primfunc.cc b/src/te/schedule/schedule_postproc_to_primfunc.cc index a86ad76b0eb9..1710a91c6985 100644 --- a/src/te/schedule/schedule_postproc_to_primfunc.cc +++ b/src/te/schedule/schedule_postproc_to_primfunc.cc @@ -128,7 +128,7 @@ class TensorToBufferMapper : public StmtExprMutator { Buffer GetBuffer(const Tensor& tensor, bool allow_alloc = false) { auto it = buffer_map_.find(tensor); if (it != buffer_map_.end()) return it->second; - CHECK(allow_alloc) << "Cannot find the Realization point of tensor " << tensor; + ICHECK(allow_alloc) << "Cannot find the Realization point of tensor " << tensor; auto buffer = CreateBufferFor(tensor); buffer_map_[tensor] = buffer; @@ -156,7 +156,7 @@ PrimFunc SchedulePostProcToPrimFunc(Array arg_list, Stmt body, params.push_back(GetRef(n)); } else if (auto* n = var.as()) { te::Tensor tensor = GetRef(n); - CHECK(!extern_buffer.count(tensor)); + ICHECK(!extern_buffer.count(tensor)); tir::Buffer buffer = CreateBufferFor(tensor); tir::Var bptr(buffer->name, DataType::Handle()); diff --git a/src/te/tensor.cc b/src/te/tensor.cc index e66b9632d8a2..18d4947cdddc 100644 --- a/src/te/tensor.cc +++ b/src/te/tensor.cc @@ -46,8 +46,8 @@ PrimExpr Tensor::operator()(Array indices) const { PrimExpr Tensor::operator()(Array indices) const { if (ndim() != 0) { - CHECK_EQ(ndim(), indices.size()) << "Tensor dimension mismatch in read" - << "ndim = " << ndim() << ", indices.size=" << indices.size(); + ICHECK_EQ(ndim(), indices.size()) << "Tensor dimension mismatch in read" + << "ndim = " << ndim() << ", indices.size=" << indices.size(); } return ProducerLoad((*this), indices); diff --git a/src/tir/analysis/verify_gpu_code.cc b/src/tir/analysis/verify_gpu_code.cc index 5ef755a1b5a1..afd3c7add605 100644 --- a/src/tir/analysis/verify_gpu_code.cc +++ b/src/tir/analysis/verify_gpu_code.cc @@ -94,7 +94,7 @@ class GPUCodeVerifier : public StmtExprVisitor { Var var = op->node.as()->var; const auto* extent = op->value.as(); - CHECK(extent); + 
ICHECK(extent); std::string name = var.get()->name_hint; // record the number of threads in a block @@ -167,7 +167,7 @@ class GPUCodeVerifier : public StmtExprVisitor { void VisitStmt_(const ForNode* op) { if (op->loop_var->name_hint == "vthread.s") { const auto* extent = op->extent.as(); - CHECK(extent); + ICHECK(extent); size_t num_vthread = static_cast(extent->value); if (num_vthread > max_vthread_) { diff --git a/src/tir/analysis/verify_memory.cc b/src/tir/analysis/verify_memory.cc index 64097e1d343a..905384f29908 100644 --- a/src/tir/analysis/verify_memory.cc +++ b/src/tir/analysis/verify_memory.cc @@ -170,7 +170,7 @@ class MemoryAccessVerifier final : protected StmtExprVisitor { /// Interface of VerifyMemory pass std::vector VerifyMemory_(const PrimFunc& func) { auto target = func->GetAttr(tvm::attr::kTarget); - CHECK(target.defined()) << "LowerWarpMemory: Require the target attribute"; + ICHECK(target.defined()) << "LowerWarpMemory: Require the target attribute"; if (func->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == CallingConv::kDefault) { diff --git a/src/tir/analysis/verify_ssa.cc b/src/tir/analysis/verify_ssa.cc index 834ad09cb61a..d7ccb363c16e 100644 --- a/src/tir/analysis/verify_ssa.cc +++ b/src/tir/analysis/verify_ssa.cc @@ -148,7 +148,7 @@ Pass VerifySSA() { for (auto kv : mod->functions) { if (auto* n = kv.second.as()) { auto func = GetRef(n); - CHECK(VerifySSA(func)) << "RuntimeError: IR is not in SSA form" << func; + ICHECK(VerifySSA(func)) << "RuntimeError: IR is not in SSA form" << func; } } return mod; diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index d33f2ddf698a..08b2224e9912 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -244,10 +244,10 @@ inline PrimExpr ElemOffset(const BufferNode* n, Array index) { // Scalar case if (n->shape.size() == 0 && index.size() == 1) { auto is_int = index[0].as(); - CHECK(is_int && is_int->value == 0); + ICHECK(is_int && is_int->value == 0); base = base + index[0]; } else { - CHECK_EQ(n->shape.size(), index.size()); + ICHECK_EQ(n->shape.size(), index.size()); if (index.size() > 0) { PrimExpr offset = index[0]; for (size_t i = 1; i < index.size(); ++i) { @@ -257,7 +257,7 @@ inline PrimExpr ElemOffset(const BufferNode* n, Array index) { } } } else { - CHECK_EQ(n->strides.size(), index.size()); + ICHECK_EQ(n->strides.size(), index.size()); if (is_zero(base)) { base = MergeMulMod(&ana, index[0] * n->strides[0]); } else { @@ -285,7 +285,7 @@ inline PrimExpr BufferOffset(const BufferNode* n, Array index, DataTyp PrimExpr Buffer::vload(Array begin, DataType dtype) const { // specially handle bool, stored as DataType::Int(8) const BufferNode* n = operator->(); - CHECK(dtype.element_of() == n->dtype.element_of() && dtype.lanes() % n->dtype.lanes() == 0) + ICHECK(dtype.element_of() == n->dtype.element_of() && dtype.lanes() % n->dtype.lanes() == 0) << "Cannot load " << dtype << " from buffer of " << n->dtype; if (dtype == DataType::Bool()) { return tir::Cast(DataType::Bool(), @@ -300,7 +300,7 @@ Stmt Buffer::vstore(Array begin, PrimExpr value) const { // specially handle bool, stored as DataType::Int(8) const BufferNode* n = operator->(); DataType dtype = value.dtype(); - CHECK(dtype.element_of() == n->dtype.element_of() && dtype.lanes() % n->dtype.lanes() == 0) + ICHECK(dtype.element_of() == n->dtype.element_of() && dtype.lanes() % n->dtype.lanes() == 0) << "Cannot store " << dtype << " to buffer of " << n->dtype; if (value.dtype() == DataType::Bool()) { return tir::Store(n->data, 
tir::Cast(DataType::Int(8), value), @@ -383,7 +383,7 @@ PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lane Buffer::Buffer(Var data, DataType dtype, Array shape, Array strides, PrimExpr elem_offset, String name, String scope, int data_alignment, int offset_factor, BufferType buffer_type) { - CHECK(IsPointerType(data->type_annotation, dtype)) + ICHECK(IsPointerType(data->type_annotation, dtype)) << "Buffer data field expect to have the right pointer type annotation" << " annotation=" << data->type_annotation << ", dtype=" << dtype; @@ -428,7 +428,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) TVM_REGISTER_NODE_TYPE(BufferNode); TVM_REGISTER_GLOBAL("tir.Buffer").set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args.size(), 10); + ICHECK_EQ(args.size(), 10); auto buffer_type = args[9].operator String(); BufferType type = (buffer_type == "auto_broadcast") ? kAutoBroadcast : kDefault; *ret = diff --git a/src/tir/ir/data_layout.cc b/src/tir/ir/data_layout.cc index bc777db55dbe..da3496dba407 100644 --- a/src/tir/ir/data_layout.cc +++ b/src/tir/ir/data_layout.cc @@ -54,7 +54,7 @@ const LayoutAxis LayoutAxis::LOWER_CASE[] = { LayoutAxis('z')}; const LayoutAxis& LayoutAxis::Get(const char name) { - CHECK((name >= 'A' && name <= 'Z') || (name >= 'a' && name <= 'z')) + ICHECK((name >= 'A' && name <= 'Z') || (name >= 'a' && name <= 'z')) << "Invalid layout axis name: " << name << ". Has to be A-Z or a-z."; return (name >= 'A' && name <= 'Z') ? LayoutAxis::UPPER_CASE[name - 'A'] : LayoutAxis::LOWER_CASE[name - 'a']; @@ -62,12 +62,12 @@ const LayoutAxis& LayoutAxis::Get(const char name) { const LayoutAxis& LayoutAxis::Get(const IterVar& itvar) { const std::string axis = itvar->var.get()->name_hint; - CHECK_EQ(axis.size(), 1) << "Invalid layout axis " << axis; + ICHECK_EQ(axis.size(), 1) << "Invalid layout axis " << axis; return LayoutAxis::Get(axis[0]); } const LayoutAxis& LayoutAxis::Get(const std::string& name) { - CHECK_EQ(name.length(), 1) << "Invalid axis " << name; + ICHECK_EQ(name.length(), 1) << "Invalid axis " << name; return LayoutAxis::Get(name[0]); } @@ -77,13 +77,13 @@ Layout::Layout(const Array& axes) { std::ostringstream repr; for (const IterVar& axis : axes) { if (const auto* factor = axis->dom->extent.as()) { - CHECK_GT(factor->value, 0); + ICHECK_GT(factor->value, 0); repr << factor->value; } - CHECK_EQ(axis->var.get()->name_hint.size(), 1) + ICHECK_EQ(axis->var.get()->name_hint.size(), 1) << "Invalid layout axis " << axis->var.get()->name_hint; char c = axis->var.get()->name_hint.operator std::string()[0]; - CHECK((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) << "Invalid layout axis " << c; + ICHECK((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) << "Invalid layout axis " << c; repr << axis->var.get()->name_hint; } node->name = repr.str(); @@ -102,22 +102,22 @@ Layout::Layout(const std::string& name) { // NOLINT(*) int32_t factor = 0; for (char c : name) { if (c >= 'A' && c <= 'Z') { - CHECK_EQ(factor, 0) << "Invalid layout " << name << ": invalid factor size " << factor - << " before dimension " << c; + ICHECK_EQ(factor, 0) << "Invalid layout " << name << ": invalid factor size " << factor + << " before dimension " << c; std::string shape_name("_shape"); shape_name.insert(0, 1, c); IterVar axis = IterVar(Range(PrimExpr(0), Var(shape_name)), Var(std::string(1, c)), tir::kDataPar); node->axes.push_back(axis); } else if (c >= 'a' && c <= 'z') { - CHECK_GT(factor, 0) << "Invalid layout " << name << ": invalid factor size " << factor - << " for 
dimension " << c; + ICHECK_GT(factor, 0) << "Invalid layout " << name << ": invalid factor size " << factor + << " for dimension " << c; IterVar axis = IterVar(Range(PrimExpr(0), PrimExpr(factor)), Var(std::string(1, c)), tir::kDataPar); node->axes.push_back(axis); factor = 0; } else if (c >= '0' && c <= '9') { - CHECK(factor >= 0) << "Invalid layout " << name << ": _ is adjacent to a number."; + ICHECK(factor >= 0) << "Invalid layout " << name << ": _ is adjacent to a number."; factor = factor * 10 + c - '0'; } else { LOG(FATAL) << "Invalid layout " << name; @@ -128,16 +128,16 @@ Layout::Layout(const std::string& name) { // NOLINT(*) std::vector exist_axis(256, false); for (const IterVar& v : node->axes) { auto axis_str = v->var.get()->name_hint.operator std::string(); - CHECK_EQ(axis_str.size(), 1); + ICHECK_EQ(axis_str.size(), 1); char axis = axis_str[0]; - CHECK((axis >= 'a' && axis <= 'z') || (axis >= 'A' && axis <= 'Z')); - CHECK(!exist_axis[axis]) << "Invalid layout " << name << ": duplicate axis " << axis; + ICHECK((axis >= 'a' && axis <= 'z') || (axis >= 'A' && axis <= 'Z')); + ICHECK(!exist_axis[axis]) << "Invalid layout " << name << ": duplicate axis " << axis; exist_axis[axis] = true; } for (const IterVar& v : node->axes) { char axis = v->var.get()->name_hint.operator std::string()[0]; if (axis >= 'a' && axis <= 'z') { - CHECK(exist_axis[axis - 'a' + 'A']) + ICHECK(exist_axis[axis - 'a' + 'A']) << "Invalid layout " << name << ": missing axis " << std::toupper(axis); } } @@ -160,13 +160,13 @@ Layout Layout::Split(const LayoutAxis& axis, size_t target_pos, int32_t factor) if (!defined()) return Layout::Undef(); const std::string& name = operator->()->name; const auto axes = operator->()->axes; - CHECK(target_pos <= this->ndim()) + ICHECK(target_pos <= this->ndim()) << "Invalid split position " << target_pos << " for layout " << name; - CHECK(axis.IsPrimal()) << "Cannot split a subordinate axis " << axis; - CHECK(this->Contains(axis)) << "Axis " << axis << " does not exist in " << name; - CHECK(!this->Contains(axis.ToSubordinate())) + ICHECK(axis.IsPrimal()) << "Cannot split a subordinate axis " << axis; + ICHECK(this->Contains(axis)) << "Axis " << axis << " does not exist in " << name; + ICHECK(!this->Contains(axis.ToSubordinate())) << "Axis " << axis << " has already been split in " << name; - CHECK(factor > 0) << "Invalid split size " << factor; + ICHECK(factor > 0) << "Invalid split size " << factor; Array new_layout; for (size_t i = 0; i <= this->ndim(); ++i) { if (i == target_pos) { @@ -186,7 +186,7 @@ int32_t Layout::FactorOf(const LayoutAxis& axis) const { for (const IterVar& itvar : operator->()->axes) { if (sub == LayoutAxis::Get(itvar)) { const auto* factor = itvar->dom->extent.as(); - CHECK(factor); + ICHECK(factor); return factor->value; } } @@ -261,17 +261,17 @@ inline Array TransformIndex(const Array& src_index, } Array BijectiveLayout::ForwardIndex(const Array& src_index) const { - CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + ICHECK(defined()) << "Cannot operate on an undefined bijective layout."; const BijectiveLayoutNode* self = operator->(); - CHECK_EQ(src_index.size(), self->src_layout->axes.size()) + ICHECK_EQ(src_index.size(), self->src_layout->axes.size()) << "Input mismatch with layout " << self->src_layout; return TransformIndex(src_index, self->src_layout->axes, self->forward_rule); } Array BijectiveLayout::BackwardIndex(const Array& dst_index) const { - CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + 
ICHECK(defined()) << "Cannot operate on an undefined bijective layout."; const BijectiveLayoutNode* self = operator->(); - CHECK_EQ(dst_index.size(), self->dst_layout->axes.size()) + ICHECK_EQ(dst_index.size(), self->dst_layout->axes.size()) << "Output mismatch with layout " << self->dst_layout; return TransformIndex(dst_index, self->dst_layout->axes, self->backward_rule); } @@ -281,7 +281,7 @@ inline Array TransformShape(const Array& src_shape, const Array& target_axis, const Array& transform_rule) { arith::Analyzer ana; - CHECK_EQ(src_shape.size(), src_axis.size()); + ICHECK_EQ(src_shape.size(), src_axis.size()); // bind variables for original axes // for major-axis, bind the corresponding size // for minor-axis, simply bind it as 0, so that we can reuse forward/backward_rule, @@ -299,7 +299,7 @@ inline Array TransformShape(const Array& src_shape, const auto* orig_shape_const = orig_shape.as(); const auto* orig_axis_extent = orig_axis->dom->extent.as(); if (orig_shape_const) { - CHECK_EQ(orig_shape_const->value, orig_axis_extent->value) + ICHECK_EQ(orig_shape_const->value, orig_axis_extent->value) << "Input shape mismatch at index " << i << ". Expected " << orig_axis->dom->extent << ", get " << orig_shape; } @@ -313,7 +313,7 @@ inline Array TransformShape(const Array& src_shape, // for major-axis, use the forward/backward_rule directly, // for minor-axis, simply use the extent. Array result; - CHECK_EQ(transform_rule.size(), target_axis.size()); + ICHECK_EQ(transform_rule.size(), target_axis.size()); for (size_t i = 0; i < transform_rule.size(); ++i) { PrimExpr rule = transform_rule[i]; IterVar axis = target_axis[i]; @@ -331,13 +331,13 @@ inline Array TransformShape(const Array& src_shape, } Array BijectiveLayout::ForwardShape(const Array& shape) const { - CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + ICHECK(defined()) << "Cannot operate on an undefined bijective layout."; const BijectiveLayoutNode* self = operator->(); return TransformShape(shape, self->src_layout->axes, self->dst_layout->axes, self->forward_rule); } Array BijectiveLayout::BackwardShape(const Array& shape) const { - CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + ICHECK(defined()) << "Cannot operate on an undefined bijective layout."; const BijectiveLayoutNode* self = operator->(); return TransformShape(shape, self->dst_layout->axes, self->src_layout->axes, self->backward_rule); } @@ -351,7 +351,7 @@ BijectiveLayout::BijectiveLayout(Layout src_layout, Layout dst_layout) { // To be consistent with previous behavior, a nullptr layout is created // when argument is invalid. 
   if (GetStoreRule(&n->forward_rule, n->src_layout, n->dst_layout)) {
-    CHECK(GetStoreRule(&n->backward_rule, n->dst_layout, n->src_layout));
+    ICHECK(GetStoreRule(&n->backward_rule, n->dst_layout, n->src_layout));
     data_ = std::move(n);
   }
 }
diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc
index f648aca18e46..825bac86919c 100644
--- a/src/tir/ir/expr.cc
+++ b/src/tir/ir/expr.cc
@@ -33,30 +33,30 @@
 namespace tvm {
 namespace tir {
 
-#define TVM_DEFINE_BINOP_CONSTRUCTOR(Name)                            \
-  Name::Name(PrimExpr a, PrimExpr b) {                                \
-    using T = Name::ContainerType;                                    \
-    CHECK(a.defined()) << "ValueError: a is undefined\n";             \
-    CHECK(b.defined()) << "ValueError: b is undefined\n";             \
-    CHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types\n"; \
-    ObjectPtr<T> node = make_object<T>();                             \
-    node->dtype = a.dtype();                                          \
-    node->a = std::move(a);                                           \
-    node->b = std::move(b);                                           \
-    data_ = std::move(node);                                          \
+#define TVM_DEFINE_BINOP_CONSTRUCTOR(Name)                             \
+  Name::Name(PrimExpr a, PrimExpr b) {                                 \
+    using T = Name::ContainerType;                                     \
+    ICHECK(a.defined()) << "ValueError: a is undefined\n";             \
+    ICHECK(b.defined()) << "ValueError: b is undefined\n";             \
+    ICHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types\n"; \
+    ObjectPtr<T> node = make_object<T>();                              \
+    node->dtype = a.dtype();                                           \
+    node->a = std::move(a);                                            \
+    node->b = std::move(b);                                            \
+    data_ = std::move(node);                                           \
   }
 
-#define TVM_DEFINE_CMPOP_CONSTRUCTOR(Name)                            \
-  Name::Name(PrimExpr a, PrimExpr b) {                                \
-    using T = Name::ContainerType;                                    \
-    CHECK(a.defined()) << "ValueError: a is undefined\n";             \
-    CHECK(b.defined()) << "ValueError: b is undefined\n";             \
-    CHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types\n"; \
-    ObjectPtr<T> node = make_object<T>();                             \
-    node->dtype = DataType::Bool(a.dtype().lanes());                  \
-    node->a = std::move(a);                                           \
-    node->b = std::move(b);                                           \
-    data_ = std::move(node);                                          \
+#define TVM_DEFINE_CMPOP_CONSTRUCTOR(Name)                             \
+  Name::Name(PrimExpr a, PrimExpr b) {                                 \
+    using T = Name::ContainerType;                                     \
+    ICHECK(a.defined()) << "ValueError: a is undefined\n";             \
+    ICHECK(b.defined()) << "ValueError: b is undefined\n";             \
+    ICHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types\n"; \
+    ObjectPtr<T> node = make_object<T>();                              \
+    node->dtype = DataType::Bool(a.dtype().lanes());                   \
+    node->a = std::move(a);                                            \
+    node->b = std::move(b);                                            \
+    data_ = std::move(node);                                           \
   }
 
 // Var
@@ -178,8 +178,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Cast
 Cast::Cast(DataType t, PrimExpr value) {
-  CHECK(value.defined());
-  CHECK_EQ(t.lanes(), value.dtype().lanes());
+  ICHECK(value.defined());
+  ICHECK_EQ(t.lanes(), value.dtype().lanes());
   ObjectPtr<CastNode> node = make_object<CastNode>();
   node->dtype = t;
   node->value = std::move(value);
@@ -453,11 +453,11 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // And
 And::And(PrimExpr a, PrimExpr b) {
-  CHECK(a.defined()) << "ValueError: a is undefined";
-  CHECK(b.defined()) << "ValueError: b is undefined";
-  CHECK(a.dtype().is_bool());
-  CHECK(b.dtype().is_bool());
-  CHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types";
+  ICHECK(a.defined()) << "ValueError: a is undefined";
+  ICHECK(b.defined()) << "ValueError: b is undefined";
+  ICHECK(a.dtype().is_bool());
+  ICHECK(b.dtype().is_bool());
+  ICHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types";
 
   ObjectPtr<AndNode> node = make_object<AndNode>();
   node->dtype = DataType::Bool(a.dtype().lanes());
@@ -482,11 +482,11 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Or
 Or::Or(PrimExpr a, PrimExpr b) {
-  CHECK(a.defined()) << "ValueError: a is undefined";
-  CHECK(b.defined()) << "ValueError: b is undefined";
-  CHECK(a.dtype().is_bool());
-  CHECK(b.dtype().is_bool());
-  CHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types";
+  ICHECK(a.defined()) << "ValueError: a is undefined";
+  ICHECK(b.defined()) << "ValueError: b is undefined";
+  ICHECK(a.dtype().is_bool());
+  ICHECK(b.dtype().is_bool());
+  ICHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types";
 
   ObjectPtr<OrNode> node = make_object<OrNode>();
   node->dtype = DataType::Bool(a.dtype().lanes());
@@ -511,8 +511,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Not
 Not::Not(PrimExpr a) {
-  CHECK(a.defined()) << "ValueError: a is undefined";
-  CHECK(a.dtype().is_bool());
+  ICHECK(a.defined()) << "ValueError: a is undefined";
+  ICHECK(a.dtype().is_bool());
 
   ObjectPtr<NotNode> node = make_object<NotNode>();
   node->dtype = DataType::Bool(a.dtype().lanes());
@@ -533,12 +533,12 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Select
 Select::Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value) {
-  CHECK(condition.defined()) << "ValueError: condition is undefined";
-  CHECK(true_value.defined()) << "ValueError: true_value is undefined";
-  CHECK(false_value.defined()) << "ValueError: true_value is undefined";
-  CHECK(condition.dtype().is_bool());
-  CHECK(condition.dtype().lanes() == true_value.dtype().lanes() || condition.dtype().lanes() == 1);
-  CHECK(false_value.dtype() == true_value.dtype()) << "TypeError: mismatched types";
+  ICHECK(condition.defined()) << "ValueError: condition is undefined";
+  ICHECK(true_value.defined()) << "ValueError: true_value is undefined";
+  ICHECK(false_value.defined()) << "ValueError: true_value is undefined";
+  ICHECK(condition.dtype().is_bool());
+  ICHECK(condition.dtype().lanes() == true_value.dtype().lanes() || condition.dtype().lanes() == 1);
+  ICHECK(false_value.dtype() == true_value.dtype()) << "TypeError: mismatched types";
 
   ObjectPtr<SelectNode> node = make_object<SelectNode>();
   node->dtype = true_value.dtype();
@@ -569,11 +569,11 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Load
 Load::Load(DataType dtype, Var buffer_var, PrimExpr index, PrimExpr predicate) {
-  CHECK(buffer_var.defined());
-  CHECK(predicate.defined());
-  CHECK(index.defined());
-  CHECK_EQ(dtype.lanes(), index.dtype().lanes());
-  CHECK_EQ(dtype.lanes(), predicate.dtype().lanes());
+  ICHECK(buffer_var.defined());
+  ICHECK(predicate.defined());
+  ICHECK(index.defined());
+  ICHECK_EQ(dtype.lanes(), index.dtype().lanes());
+  ICHECK_EQ(dtype.lanes(), predicate.dtype().lanes());
 
   ObjectPtr<LoadNode> node = make_object<LoadNode>();
   node->dtype = dtype;
@@ -609,12 +609,12 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Ramp
 Ramp::Ramp(PrimExpr base, PrimExpr stride, int lanes) {
-  CHECK(base.defined());
-  CHECK(stride.defined());
-  CHECK(base.dtype().is_scalar());
-  CHECK(stride.dtype().is_scalar());
-  CHECK_GT(lanes, 1);
-  CHECK_EQ(stride.dtype(), base.dtype());
+  ICHECK(base.defined());
+  ICHECK(stride.defined());
+  ICHECK(base.dtype().is_scalar());
+  ICHECK(stride.dtype().is_scalar());
+  ICHECK_GT(lanes, 1);
+  ICHECK_EQ(stride.dtype(), base.dtype());
 
   ObjectPtr<RampNode> node = make_object<RampNode>();
   node->dtype = base.dtype().with_lanes(lanes);
@@ -642,9 +642,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Broadcast
 Broadcast::Broadcast(PrimExpr value, int lanes) {
-  CHECK(value.defined());
-  CHECK(value.dtype().is_scalar());
-  CHECK_GT(lanes, 1);
+  ICHECK(value.defined());
+  ICHECK(value.dtype().is_scalar());
+  ICHECK_GT(lanes, 1);
 
   ObjectPtr<BroadcastNode> node = make_object<BroadcastNode>();
   node->dtype = value.dtype().with_lanes(lanes);
@@ -669,9 +669,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Let
 Let::Let(Var var, PrimExpr value, PrimExpr body) {
-  CHECK(value.defined());
-  CHECK(body.defined());
-  CHECK_EQ(value.dtype(), var.dtype());
+  ICHECK(value.defined());
+  ICHECK(body.defined());
+  ICHECK_EQ(value.dtype(), var.dtype());
 
   ObjectPtr<LetNode> node = make_object<LetNode>();
   node->dtype = body.dtype();
@@ -700,7 +700,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 // Call
 Call::Call(DataType dtype, RelayExpr op, Array<PrimExpr> args) {
   for (size_t i = 0; i < args.size(); ++i) {
-    CHECK(args[i].defined());
+    ICHECK(args[i].defined());
   }
 
   ObjectPtr<CallNode> node = make_object<CallNode>();
@@ -714,7 +714,7 @@ TVM_REGISTER_GLOBAL("tir.Call")
     .set_body_typed([](DataType type, RelayExpr op, Array<ObjectRef> args) {
       Array<PrimExpr> prim_expr_args;
       for (const auto& it : args) {
-        CHECK(it->IsInstance<runtime::StringObj>() || it->IsInstance<PrimExprNode>());
+        ICHECK(it->IsInstance<runtime::StringObj>() || it->IsInstance<PrimExprNode>());
         if (const auto* str = it.as<runtime::StringObj>()) {
           prim_expr_args.push_back(StringImm(str->data));
         } else {
@@ -733,7 +733,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->stream << ptr_op->name << "(";
     } else {
       auto* ptr_gvar = op->op.as<GlobalVarNode>();
-      CHECK(ptr_gvar != nullptr);
+      ICHECK(ptr_gvar != nullptr);
       p->stream << "@" << ptr_gvar->name_hint << "(";
     }
     for (size_t i = 0; i < op->args.size(); ++i) {
@@ -747,17 +747,17 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Shuffle
 Shuffle::Shuffle(Array<PrimExpr> vectors, Array<PrimExpr> indices) {
-  CHECK_NE(vectors.size(), 0U);
-  CHECK_NE(indices.size(), 0U);
+  ICHECK_NE(vectors.size(), 0U);
+  ICHECK_NE(indices.size(), 0U);
 
   DataType base_type = vectors[0].dtype().element_of();
   int total_lanes = 0;
 
   for (PrimExpr val : vectors) {
-    CHECK(val.dtype().element_of() == base_type);
+    ICHECK(val.dtype().element_of() == base_type);
     total_lanes += val.dtype().lanes();
   }
-  CHECK_LE(indices.size(), static_cast<size_t>(total_lanes));
+  ICHECK_LE(indices.size(), static_cast<size_t>(total_lanes));
 
   ObjectPtr<ShuffleNode> node = make_object<ShuffleNode>();
   node->dtype = base_type.with_lanes(static_cast<int>(indices.size()));
@@ -767,7 +767,7 @@ Shuffle::Shuffle(Array<PrimExpr> vectors, Array<PrimExpr> indices) {
 }
 
 PrimExpr Shuffle::Concat(Array<PrimExpr> vectors) {
-  CHECK_NE(vectors.size(), 0);
+  ICHECK_NE(vectors.size(), 0);
   if (vectors.size() == 1) {
     return vectors[0];
   }
@@ -824,9 +824,9 @@ CommReducer::CommReducer(Array<Var> lhs, Array<Var> rhs, Array<PrimExpr> result,
 }
 
 Array<PrimExpr> CommReducerNode::operator()(Array<PrimExpr> a, Array<PrimExpr> b) const {
-  CHECK_EQ(a.size(), b.size());
-  CHECK_EQ(lhs.size(), a.size());
-  CHECK_EQ(rhs.size(), b.size());
+  ICHECK_EQ(a.size(), b.size());
+  ICHECK_EQ(lhs.size(), a.size());
+  ICHECK_EQ(rhs.size(), b.size());
   Map<Var, PrimExpr> value_map;
   for (size_t i = 0; i < a.size(); ++i) {
     value_map.Set(lhs[i], a[i]);
@@ -859,21 +859,21 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 Reduce::Reduce(CommReducer combiner, Array<PrimExpr> source, Array<IterVar> axis,
                PrimExpr condition, int value_index, Array<PrimExpr> init) {
   for (size_t i = 0; i < axis.size(); ++i) {
-    CHECK_EQ(axis[i]->iter_type, kCommReduce) << "Can only take axis created by reduce_axis";
+    ICHECK_EQ(axis[i]->iter_type, kCommReduce) << "Can only take axis created by reduce_axis";
   }
   if (!condition.defined()) {
     condition = const_true();
   }
   auto n = make_object<ReduceNode>();
-  CHECK(source.defined());
+  ICHECK(source.defined());
   for (size_t i = 0; i < axis.size(); ++i) {
-    CHECK(axis[i].defined());
+    ICHECK(axis[i].defined());
   }
   if (!init.empty()) {
-    CHECK_EQ(init.size(), source.size()) << "Number of inits should match number of exprs";
+    ICHECK_EQ(init.size(), source.size()) << "Number of inits should match number of exprs";
     for (size_t i = 0; i < init.size(); i++) {
-      CHECK(init[i]->IsInstance<IntImmNode>() || init[i]->IsInstance<FloatImmNode>() ||
-            init[i]->IsInstance<ProducerLoadNode>())
+      ICHECK(init[i]->IsInstance<IntImmNode>() || init[i]->IsInstance<FloatImmNode>() ||
+             init[i]->IsInstance<ProducerLoadNode>())
          << "init can only be a IntImm, FloatImm or ProducerLoad";
     }
   }
diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc
index f45117791457..dbbc99c3abed 100644
--- a/src/tir/ir/stmt.cc
+++ b/src/tir/ir/stmt.cc
@@ -30,9 +30,9 @@ namespace tir {
 
 // LetStmt
 LetStmt::LetStmt(Var var, PrimExpr value, Stmt body) {
-  CHECK(value.defined());
-  CHECK(body.defined());
-  CHECK_EQ(value.dtype(), var.dtype());
+  ICHECK(value.defined());
+  ICHECK(body.defined());
+  ICHECK_EQ(value.dtype(), var.dtype());
 
   ObjectPtr<LetStmtNode> node = make_object<LetStmtNode>();
   node->var = std::move(var);
@@ -88,8 +88,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // AssertStmt
 AssertStmt::AssertStmt(PrimExpr condition, PrimExpr message, Stmt body) {
-  CHECK(condition.defined());
-  CHECK(message.dtype() == DataType::Int(32) || message.as<StringImmNode>())
+  ICHECK(condition.defined());
+  ICHECK(message.dtype() == DataType::Int(32) || message.as<StringImmNode>())
       << "TypeError: AssertStmt message must be an int or string:" << message << "\n";
 
   ObjectPtr<AssertStmtNode> node = make_object<AssertStmtNode>();
@@ -126,12 +126,12 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 // For
 For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAPI device_api,
          Stmt body) {
-  CHECK(min.defined());
-  CHECK(extent.defined());
-  CHECK(min.dtype().is_scalar());
-  CHECK(extent.dtype().is_scalar());
-  CHECK(loop_var.dtype().is_scalar());
-  CHECK(body.defined());
+  ICHECK(min.defined());
+  ICHECK(extent.defined());
+  ICHECK(min.dtype().is_scalar());
+  ICHECK(extent.dtype().is_scalar());
+  ICHECK(loop_var.dtype().is_scalar());
+  ICHECK(body.defined());
 
   ObjectPtr<ForNode> node = make_object<ForNode>();
   node->loop_var = std::move(loop_var);
@@ -189,11 +189,11 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Store
 Store::Store(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate) {
-  CHECK(value.defined());
-  CHECK(index.defined());
-  CHECK(predicate.defined());
-  CHECK_EQ(value.dtype().lanes(), index.dtype().lanes());
-  CHECK_EQ(value.dtype().lanes(), predicate.dtype().lanes());
+  ICHECK(value.defined());
+  ICHECK(index.defined());
+  ICHECK(predicate.defined());
+  ICHECK_EQ(value.dtype().lanes(), index.dtype().lanes());
+  ICHECK_EQ(value.dtype().lanes(), predicate.dtype().lanes());
 
   ObjectPtr<StoreNode> node = make_object<StoreNode>();
   node->buffer_var = std::move(buffer_var);
@@ -267,12 +267,12 @@ Allocate::Allocate(Var buffer_var, DataType dtype, Array<PrimExpr> extents, Prim
   // IsPointerPType(buffer_var->type_annotation, dtype)
   // once we fix the allocate tvm script printing.
   for (size_t i = 0; i < extents.size(); ++i) {
-    CHECK(extents[i].defined());
-    CHECK(extents[i].dtype().is_scalar());
+    ICHECK(extents[i].defined());
+    ICHECK(extents[i].dtype().is_scalar());
   }
-  CHECK(body.defined());
-  CHECK(condition.defined());
-  CHECK(condition.dtype().is_bool());
+  ICHECK(body.defined());
+  ICHECK(condition.defined());
+  ICHECK(condition.dtype().is_bool());
 
   ObjectPtr<AllocateNode> node = make_object<AllocateNode>();
   node->buffer_var = std::move(buffer_var);
@@ -326,14 +326,14 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 ProducerRealize::ProducerRealize(DataProducer producer, Region bounds, PrimExpr condition,
                                  Stmt body) {
   for (size_t i = 0; i < bounds.size(); ++i) {
-    CHECK(bounds[i]->min.defined());
-    CHECK(bounds[i]->extent.defined());
-    CHECK(bounds[i]->min.dtype().is_scalar());
-    CHECK(bounds[i]->extent.dtype().is_scalar());
+    ICHECK(bounds[i]->min.defined());
+    ICHECK(bounds[i]->extent.defined());
+    ICHECK(bounds[i]->min.dtype().is_scalar());
+    ICHECK(bounds[i]->extent.dtype().is_scalar());
   }
-  CHECK(body.defined());
-  CHECK(condition.defined());
-  CHECK(condition.dtype().is_bool());
+  ICHECK(body.defined());
+  ICHECK(condition.defined());
+  ICHECK(condition.dtype().is_bool());
 
   ObjectPtr<ProducerRealizeNode> node = make_object<ProducerRealizeNode>();
   node->producer = std::move(producer);
@@ -428,8 +428,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // IfThenElse
 IfThenElse::IfThenElse(PrimExpr condition, Stmt then_case, Stmt else_case) {
-  CHECK(condition.defined());
-  CHECK(then_case.defined());
+  ICHECK(condition.defined());
+  ICHECK(then_case.defined());
   // else_case may be null.
   ObjectPtr<IfThenElseNode> node = make_object<IfThenElseNode>();
   node->condition = std::move(condition);
@@ -478,7 +478,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 // Evaluate
 Evaluate::Evaluate(PrimExpr value) {
-  CHECK(value.defined());
+  ICHECK(value.defined());
 
   ObjectPtr<EvaluateNode> node = make_object<EvaluateNode>();
   node->value = std::move(value);
diff --git a/src/tir/ir/transform.cc b/src/tir/ir/transform.cc
index 62c790fab3ab..95c40f9a3c8e 100644
--- a/src/tir/ir/transform.cc
+++ b/src/tir/ir/transform.cc
@@ -88,7 +88,7 @@ PrimFuncPass::PrimFuncPass(
 
 // Perform Module -> Module optimizations at the PrimFunc level.
 IRModule PrimFuncPassNode::operator()(IRModule mod, const PassContext& pass_ctx) const {
   const PassInfo& pass_info = Info();
-  CHECK(mod.defined());
+  ICHECK(mod.defined());
   pass_ctx.Trace(mod, pass_info, true);
   std::vector<ObjectRef> deleted_list;
   IRModuleNode* mod_ptr = mod.CopyOnWrite();
diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc
index 6d94a08cad5d..71321d2a3b02 100644
--- a/src/tir/op/op.cc
+++ b/src/tir/op/op.cc
@@ -107,7 +107,7 @@ void BinaryOpMatchTypes(PrimExpr& lhs, PrimExpr& rhs) {  // NOLINT(*)
   } else if (rtype.lanes() == 1 && ltype.lanes() != 1) {
     rhs = tir::Broadcast(rhs, ltype.lanes());
   } else {
-    CHECK(ltype.lanes() == rtype.lanes()) << "Cannot match type " << ltype << " vs " << rtype;
+    ICHECK(ltype.lanes() == rtype.lanes()) << "Cannot match type " << ltype << " vs " << rtype;
   }
   if (lhs.dtype() == rhs.dtype()) return;
   // Only do very simple type coversion
@@ -146,7 +146,7 @@ void BinaryOpMatchTypes(PrimExpr& lhs, PrimExpr& rhs) {  // NOLINT(*)
 // maximum and min limits
 PrimExpr max_value(const DataType& dtype) {
   using namespace tir;
-  CHECK_EQ(dtype.lanes(), 1);
+  ICHECK_EQ(dtype.lanes(), 1);
   if (dtype.is_int()) {
     if (dtype.bits() == 64) {
       return IntImm(dtype, std::numeric_limits<int64_t>::max());
@@ -178,10 +178,10 @@ PrimExpr max_value(const DataType& dtype) {
 
 PrimExpr min_value(const DataType& dtype) {
   using namespace tir;
-  CHECK_EQ(dtype.lanes(), 1);
+  ICHECK_EQ(dtype.lanes(), 1);
   if (datatype::Registry::Global()->GetTypeRegistered(dtype.code())) {
     auto f = datatype::GetMinFunc(dtype.code());
-    CHECK(f) << "No minimum function registered for custom dtype " << (unsigned int)dtype.code();
+    ICHECK(f) << "No minimum function registered for custom dtype " << (unsigned int)dtype.code();
     // TODO(@hypercubestart) Document this change (and others associated with the overflowing
     // floatimm min bug)
     return (*f)(dtype.bits());
@@ -211,7 +211,7 @@ PrimExpr min_value(const DataType& dtype) {
 // infinity
 PrimExpr infinity(const DataType& dtype) {
   using namespace tir;
-  CHECK_EQ(dtype.lanes(), 1);
+  ICHECK_EQ(dtype.lanes(), 1);
   if (dtype.is_float()) {
     if (dtype.bits() == 64) {
       return FloatImm(dtype, std::numeric_limits<double>::infinity());
@@ -273,7 +273,7 @@ PrimExpr cast(const DataType& t, PrimExpr value) {
     }
     return tir::Broadcast(value, t.lanes());
   } else {
-    CHECK(value.dtype().lanes() == t.lanes());
+    ICHECK(value.dtype().lanes() == t.lanes());
     return tir::Cast(t, value);
   }
 }
@@ -326,8 +326,8 @@ PrimExpr div(PrimExpr a, PrimExpr b) {
 }
 
 PrimExpr truncdiv(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  CHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
+  ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
   return div(a, b);
 }
@@ -348,8 +348,8 @@ PrimExpr indexdiv(PrimExpr a, PrimExpr b) { return floordiv(a, b); }
 PrimExpr indexmod(PrimExpr a, PrimExpr b) { return floormod(a, b); }
 
 PrimExpr floordiv(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  CHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
+  ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
   BinaryOpMatchTypes(a, b);
   PrimExpr ret = arith::TryConstFold<tir::FloorDiv>(a, b);
   if (ret.defined()) return ret;
@@ -357,8 +357,8 @@ PrimExpr floordiv(PrimExpr a, PrimExpr b) {
 }
 
 PrimExpr floormod(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  CHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
+  ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
   BinaryOpMatchTypes(a, b);
   PrimExpr ret = arith::TryConstFold<tir::FloorMod>(a, b);
   if (ret.defined()) return ret;
@@ -395,7 +395,7 @@ PrimExpr max(PrimExpr a, PrimExpr b) {
 
 // if_then_else
 PrimExpr if_then_else(PrimExpr cond, PrimExpr true_value, PrimExpr false_value) {
-  CHECK(cond.dtype() == DataType::Bool(1))
+  ICHECK(cond.dtype() == DataType::Bool(1))
       << "if_then_else only accept the condition to be boolean type.";
   BinaryOpMatchTypes(true_value, false_value);
   if (const IntImmNode* op = cond.as<IntImmNode>()) {
@@ -460,23 +460,23 @@ PrimExpr operator!=(PrimExpr a, PrimExpr b) {
 }
 
 PrimExpr operator&&(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_bool());
-  CHECK(b.dtype().is_bool());
+  ICHECK(a.dtype().is_bool());
+  ICHECK(b.dtype().is_bool());
   PrimExpr ret = arith::TryConstFold<tir::And>(a, b);
   if (ret.defined()) return ret;
   return tir::And(a, b);
 }
 
 PrimExpr operator||(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_bool());
-  CHECK(b.dtype().is_bool());
+  ICHECK(a.dtype().is_bool());
+  ICHECK(b.dtype().is_bool());
   PrimExpr ret = arith::TryConstFold<tir::Or>(a, b);
   if (ret.defined()) return ret;
   return tir::Or(a, b);
 }
 
 PrimExpr operator!(PrimExpr a) {
-  CHECK(a.dtype().is_bool());
+  ICHECK(a.dtype().is_bool());
   PrimExpr ret = arith::TryConstFold<tir::Not>(a);
   if (ret.defined()) return ret;
   return tir::Not(a);
@@ -484,13 +484,13 @@ PrimExpr operator!(PrimExpr a) {
 
 // shirt right
 PrimExpr operator>>(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_int() || a.dtype().is_uint());
-  CHECK(b.dtype().is_int() || b.dtype().is_uint());
+  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
+  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
   BinaryOpMatchTypes(a, b);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pb)
-      CHECK(pb->value >= 0 && pb->value < rtype.bits())
+      ICHECK(pb->value >= 0 && pb->value < rtype.bits())
          << "Shift amount must be non-negative and less than " << rtype.bits() << " for type "
          << rtype;
     if (pa && pb) return IntImm(rtype, (pa->value >> pb->value));
@@ -504,13 +504,13 @@ PrimExpr operator>>(PrimExpr a, PrimExpr b) {
 
 // shift left
 PrimExpr operator<<(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_int() || a.dtype().is_uint());
-  CHECK(b.dtype().is_int() || b.dtype().is_uint());
+  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
+  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
   BinaryOpMatchTypes(a, b);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
     if (pb)
-      CHECK(pb->value >= 0 && pb->value < rtype.bits())
+      ICHECK(pb->value >= 0 && pb->value < rtype.bits())
          << "Shift amount must be non-negative and less than " << rtype.bits() << " for type "
          << rtype;
     if (pa && pb) return IntImm(rtype, (pa->value << pb->value));
@@ -523,8 +523,8 @@ PrimExpr operator<<(PrimExpr a, PrimExpr b) {
 
 // bitwise and
 PrimExpr operator&(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_int() || a.dtype().is_uint());
-  CHECK(b.dtype().is_int() || b.dtype().is_uint());
+  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
+  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
   BinaryOpMatchTypes(a, b);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
@@ -535,8 +535,8 @@ PrimExpr operator&(PrimExpr a, PrimExpr b) {
 
 // bitwise_or
 PrimExpr operator|(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_int() || a.dtype().is_uint());
-  CHECK(b.dtype().is_int() || b.dtype().is_uint());
+  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
+  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
   BinaryOpMatchTypes(a, b);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
@@ -547,8 +547,8 @@ PrimExpr operator|(PrimExpr a, PrimExpr b) {
 
 // bitwise_xor
 PrimExpr operator^(PrimExpr a, PrimExpr b) {
-  CHECK(a.dtype().is_int() || a.dtype().is_uint());
-  CHECK(b.dtype().is_int() || b.dtype().is_uint());
+  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
+  ICHECK(b.dtype().is_int() || b.dtype().is_uint());
   BinaryOpMatchTypes(a, b);
   TVM_INDEX_CONST_PROPAGATION({
     const DataType& rtype = a.dtype();
@@ -559,7 +559,7 @@ PrimExpr operator^(PrimExpr a, PrimExpr b) {
 
 // bitwie_not
 PrimExpr operator~(PrimExpr a) {
-  CHECK(a.dtype().is_int() || a.dtype().is_uint());
+  ICHECK(a.dtype().is_int() || a.dtype().is_uint());
   return tir::Call(a.dtype(), tir::builtin::bitwise_not(), {a});
 }
 
@@ -568,7 +568,7 @@ TVM_REGISTER_GLOBAL("tir.bitwise_not").set_body_typed([](PrimExpr a) { return ~a
 // pow
 PrimExpr pow(PrimExpr x, PrimExpr y) {
   BinaryOpMatchTypes(x, y);
-  CHECK(x.dtype().is_float()) << "power only applies to float";
+  ICHECK(x.dtype().is_float()) << "power only applies to float";
   static auto op = Op::Get("tir.pow");
   return tir::Call(x.dtype(), op, {x, y});
 }
@@ -652,7 +652,7 @@ PrimExpr sum(PrimExpr source, Array<IterVar> rdom, Array<PrimExpr> init) {
 }
 
 PrimExpr all(PrimExpr source, Array<IterVar> rdom, Array<PrimExpr> init) {
-  CHECK(source.dtype().is_bool());
+  ICHECK(source.dtype().is_bool());
   Var x("x", source.dtype()), y("y", source.dtype());
   PrimExpr result = tir::And(x, y);
   PrimExpr identity_element = make_const(source.dtype(), true);
@@ -661,7 +661,7 @@ PrimExpr all(PrimExpr source, Array<IterVar> rdom, Array<PrimExpr> init) {
 }
 
 PrimExpr any(PrimExpr source, Array<IterVar> rdom, Array<PrimExpr> init) {
-  CHECK(source.dtype().is_bool());
+  ICHECK(source.dtype().is_bool());
   Var x("x", source.dtype()), y("y", source.dtype());
   PrimExpr result = tir::Or(x, y);
   PrimExpr identity_element = make_const(source.dtype(), false);
@@ -696,7 +696,7 @@ PrimExpr prod(PrimExpr source, Array<IterVar> rdom, Array<PrimExpr> init) {
 
 // fmod
 PrimExpr fmod(PrimExpr x, PrimExpr y) {
   BinaryOpMatchTypes(x, y);
-  CHECK(x.dtype().is_float()) << "fmod only applies to float";
+  ICHECK(x.dtype().is_float()) << "fmod only applies to float";
   static auto op = Op::Get("tir.fmod");
   return tir::Call(x.dtype(), op, {x, y});
 }
diff --git a/src/tir/transforms/arg_binder.cc b/src/tir/transforms/arg_binder.cc
index 1faa6267b4fe..1b58bfa38b40 100644
--- a/src/tir/transforms/arg_binder.cc
+++ b/src/tir/transforms/arg_binder.cc
@@ -49,7 +49,7 @@ void BinderAddAssert(arith::Analyzer* ana, PrimExpr cond, const std::string& arg
 
 bool ArgBinder::Bind_(const PrimExpr& arg, const PrimExpr& value, const std::string& arg_name,
                       bool with_lets) {
-  CHECK_EQ(arg.dtype(), value.dtype());
+  ICHECK_EQ(arg.dtype(), value.dtype());
   if (const VarNode* v = arg.as<VarNode>()) {
     auto it = def_map_->find(v);
     if (it == def_map_->end()) {
@@ -78,7 +78,7 @@ void ArgBinder::Bind(const PrimExpr& arg, const PrimExpr& value, const std::stri
 
 void ArgBinder::BindArray(const Array<PrimExpr>& arg, const Array<PrimExpr>& value,
                           const std::string& arg_name) {
-  CHECK_EQ(arg.size(), value.size()) << "Argument " << arg_name << " array size mismatch";
+  ICHECK_EQ(arg.size(), value.size()) << "Argument " << arg_name << " array size mismatch";
   for (size_t i = 0; i < arg.size(); ++i) {
     std::ostringstream os;
     os << arg_name << "[" << i << "]";
@@ -88,8 +88,8 @@ void ArgBinder::BindArray(const Array<PrimExpr>& arg, const Array<PrimExpr>& val
 
 void ArgBinder::BindBuffer(const Buffer& arg, const Buffer& value, const std::string& arg_name,
                            bool fuzzy_match) {
-  CHECK_EQ(arg->scope, value->scope) << "Argument " << arg_name << " Buffer bind scope mismatch";
-  CHECK_EQ(arg->dtype, value->dtype)
+  ICHECK_EQ(arg->scope, value->scope) << "Argument " << arg_name << " Buffer bind scope mismatch";
+  ICHECK_EQ(arg->dtype, value->dtype)
      << "Argument " << arg_name << " Buffer bind data type mismatch";
   if (value->data_alignment % arg->data_alignment != 0) {
     LOG(WARNING) << "Trying to bind buffer to another one with lower alignment requirement "
@@ -98,7 +98,7 @@ void ArgBinder::BindBuffer(const Buffer& arg, const Buffer& value, const std::st
   }
   // bind pointer and offset.
   if (is_zero(arg->elem_offset)) {
-    CHECK(is_zero(value->elem_offset))
+    ICHECK(is_zero(value->elem_offset))
        << "Trying to bind a Buffer with offset into one without offset "
        << " required elem_offset=" << arg->elem_offset
        << ", provided elem_offset=" << value->elem_offset;
@@ -116,10 +116,10 @@ void ArgBinder::BindBuffer(const Buffer& arg, const Buffer& value, const std::st
   }
 
   if (arg->shape.size() < value->shape.size()) {
-    CHECK(fuzzy_match) << "Argument " << arg_name << " size mismatch";
+    ICHECK(fuzzy_match) << "Argument " << arg_name << " size mismatch";
     size_t diff = value->shape.size() - arg->shape.size();
     for (size_t i = 0; i < diff; ++i) {
-      CHECK(is_one(analyzer_.Simplify(value->shape[i])))
+      ICHECK(is_one(analyzer_.Simplify(value->shape[i])))
          << "Argument " << arg_name << " shape mismatch" << arg->shape << " vs " << value->shape;
     }
     for (size_t i = 0; i < arg->shape.size(); ++i) {
@@ -128,8 +128,8 @@ void ArgBinder::BindBuffer(const Buffer& arg, const Buffer& value, const std::st
       this->Bind(arg->shape[i], value->shape[i + diff], os.str());
     }
     if (value->strides.size() != 0) {
-      CHECK_EQ(arg->strides.size(), arg->shape.size());
-      CHECK_EQ(value->strides.size(), value->shape.size());
+      ICHECK_EQ(arg->strides.size(), arg->shape.size());
+      ICHECK_EQ(value->strides.size(), value->shape.size());
       for (size_t i = 0; i < arg->strides.size(); ++i) {
         std::ostringstream os;
         os << arg_name << ".strides[" << i << "]";
diff --git a/src/tir/transforms/bf16_legalize.cc b/src/tir/transforms/bf16_legalize.cc
index 97c96edc6ca7..7a8789457923 100644
--- a/src/tir/transforms/bf16_legalize.cc
+++ b/src/tir/transforms/bf16_legalize.cc
@@ -50,10 +50,10 @@ class BF16PromoteRewriter : public StmtExprMutator {
     auto b = this->VisitExpr(orig_b);
     *is_bfloat16 = false;
     if (a->dtype.is_bfloat16()) {
-      CHECK(b->dtype.is_bfloat16());
+      ICHECK(b->dtype.is_bfloat16());
       *is_bfloat16 = true;
     } else if (b->dtype.is_bfloat16()) {
-      CHECK(a->dtype.is_bfloat16());
+      ICHECK(a->dtype.is_bfloat16());
       *is_bfloat16 = true;
     }
 
@@ -182,14 +182,14 @@ class BF16LowerRewriter : public StmtExprMutator {
     auto op_val = StmtExprMutator::VisitExpr(op->value);
     if (op->value->dtype.is_bfloat16()) {
       // if is cast_from_bf16, check if is to fp32
-      CHECK(op->dtype.is_float() && op->dtype.bits() == 32);
+      ICHECK(op->dtype.is_float() && op->dtype.bits() == 32);
       auto uint32_dtype = DataType(kDLUInt, 32, op_val->dtype.lanes());
       auto uint32_v = Cast(uint32_dtype, op_val);
       // to be endian invariant.
       return Call(op->dtype, builtin::reinterpret(), {uint32_v << 16});
     } else if (op->dtype.is_bfloat16()) {
       // if is cast_to_bf16, check if op->value is fp32
-      CHECK(op->value->dtype.is_float() && op->value->dtype.bits() == 32);
+      ICHECK(op->value->dtype.is_float() && op->value->dtype.bits() == 32);
       auto uint32_dtype = DataType(kDLUInt, 32, op_val->dtype.lanes());
       auto uint32_v = Call(uint32_dtype, builtin::reinterpret(), {op_val});
       auto uint16_dtype = DataType(kDLUInt, 16, op_val->dtype.lanes());
@@ -299,7 +299,7 @@ class BF16LowerRewriter : public StmtExprMutator {
 
     if (op->dtype.is_bfloat16()) {
       auto it = var_remap_.find(op->buffer_var);
-      CHECK(it != var_remap_.end()) << "bfloat* var needs to be remapped";
+      ICHECK(it != var_remap_.end()) << "bfloat* var needs to be remapped";
       return Load(DataType::UInt(16, op->dtype.lanes()), it->second, op->index, op->predicate);
     } else {
       return ret;
diff --git a/src/tir/transforms/combine_context_call.cc b/src/tir/transforms/combine_context_call.cc
index 0485bb1f7613..03a0d5e751cf 100644
--- a/src/tir/transforms/combine_context_call.cc
+++ b/src/tir/transforms/combine_context_call.cc
@@ -42,13 +42,13 @@ class ContextCallCombiner final : public StmtExprMutator {
 public:
  PrimExpr VisitExpr_(const CallNode* op) final {
    if (op->op.same_as(builtin::tvm_thread_context())) {
-      CHECK_EQ(op->args.size(), 1U);
+      ICHECK_EQ(op->args.size(), 1U);
      PrimExpr ctx = op->args[0];
      auto it = ctx_map_.find(ctx);
      if (it != ctx_map_.end()) {
        return it->second;
      } else {
-        CHECK(ctx.dtype().is_handle());
+        ICHECK(ctx.dtype().is_handle());
        Var ctx_var("ctx_cache_", ctx.dtype());
        ctx_map_[ctx] = ctx_var;
        return std::move(ctx_var);
diff --git a/src/tir/transforms/coproc_sync.cc b/src/tir/transforms/coproc_sync.cc
index 9de9eaa8a639..f9245442d268 100644
--- a/src/tir/transforms/coproc_sync.cc
+++ b/src/tir/transforms/coproc_sync.cc
@@ -149,7 +149,7 @@ class CoProcSyncPlanner : public StorageAccessVisitor {
        }
      }
      if (sync_write) {
-        CHECK_NE(i, 0U);
+        ICHECK_NE(i, 0U);
        sync_[seq[i - 1].stmt] = GetSync(co_access);
        co_access.clear();
        contain_sync = true;
@@ -175,7 +175,7 @@ class CoProcSyncPlanner : public StorageAccessVisitor {
        }
      }
      if (sync_at_end && co_access.size() != 0) {
-        CHECK_NE(seq.size(), 0);
+        ICHECK_NE(seq.size(), 0);
        contain_sync = true;
        sync_[seq.back().stmt] = GetSync(co_access);
        co_access.clear();
@@ -190,8 +190,8 @@ class CoProcSyncPlanner : public StorageAccessVisitor {
  // Add write Synchronization
  std::vector<Stmt> GetSync(const std::vector<AccessEntry>& co_access) {
    // Does not consider memory coherence, need runtime.
-    CHECK_NE(co_access.size(), 0U);
-    CHECK_EQ(co_access[0].threads.size(), 1U);
+    ICHECK_NE(co_access.size(), 0U);
+    ICHECK_EQ(co_access[0].threads.size(), 1U);
     return GetSync(coproc_name_ + ".coproc_sync");
   }
 
@@ -250,7 +250,7 @@ class CoProcBarrierDetector : public StorageAccessVisitor {
     auto fupdate = [&](size_t i, const AccessEntry& acc) {
       auto it = write_set.find(acc.buffer.get());
       if (it != write_set.end()) {
-        CHECK_NE(i, 0U);
+        ICHECK_NE(i, 0U);
         barrier_after_[seq[i - 1].stmt].push_back(MakeBarrier(write_barrier_name_, it->second));
         write_set.erase(it);
       }
@@ -288,7 +288,7 @@ class CoProcBarrierDetector : public StorageAccessVisitor {
     auto fupdate = [&](size_t i, const AccessEntry& acc) {
       auto it = read_set.find(acc.buffer.get());
       if (it != read_set.end()) {
-        CHECK_NE(i, seq.size());
+        ICHECK_NE(i, seq.size());
         barrier_before_[seq[i].stmt].push_back(MakeBarrier(read_barrier_name_, it->second));
         read_set.erase(it);
       }
@@ -324,12 +324,12 @@ class CoProcBarrierDetector : public StorageAccessVisitor {
     // insert write point
     Array<arith::IntSet> wset;
     for (const AccessEntry& acc : wvec) {
-      CHECK(acc.dtype == wvec[0].dtype);
+      ICHECK(acc.dtype == wvec[0].dtype);
       wset.push_back(acc.touched);
     }
     Range none;
     Range r = arith::Union(wset).CoverRange(none);
-    CHECK(r.defined()) << "Cannot deduce write range of " << wvec[0].buffer;
+    ICHECK(r.defined()) << "Cannot deduce write range of " << wvec[0].buffer;
     PrimExpr min = r->min;
     PrimExpr extent = r->extent;
     return Evaluate(Call(DataType::Int(32), Op::Get(func),
@@ -361,7 +361,7 @@ class CoProcInstDepDetector : public StmtVisitor {
  void VisitStmt_(const AttrStmtNode* op) final {
    if (op->attr_key == attr::coproc_scope && op->node.same_as(coproc_axis_)) {
      const IntImmNode* ctx_id = op->value.as<IntImmNode>();
-      CHECK(ctx_id != nullptr);
+      ICHECK(ctx_id != nullptr);
      curr_state_.clear();
      curr_state_.node = op->body.get();
      curr_state_.enter_ctx.insert(ctx_id->value);
@@ -380,7 +380,7 @@ class CoProcInstDepDetector : public StmtVisitor {
    curr_state_.clear();
    if (last_state_.node != nullptr) {
      curr_state_.node = op;
-      CHECK(first_state_.node != nullptr);
+      ICHECK(first_state_.node != nullptr);
      // loop carry dependency
      InjectSync(last_state_, first_state_, &(curr_state_.exit_push), &(curr_state_.enter_pop));
      curr_state_.enter_ctx = first_state_.enter_ctx;
@@ -548,7 +548,7 @@ class CoProcInstDepDetector : public StmtVisitor {
      InjectSync(last_state_, curr_state_, &t1, &t2);
      std::swap(last_state_, curr_state_);
    } else {
-      CHECK(first_state_.node == nullptr);
+      ICHECK(first_state_.node == nullptr);
      first_state_ = curr_state_;
      last_state_ = curr_state_;
    }
@@ -582,7 +582,7 @@ class CoProcSyncInserter : public StmtMutator {
        touched.insert(kv.first);
      }
    }
-    CHECK_EQ(visitor.coproc_.size(), 1U);
+    ICHECK_EQ(visitor.coproc_.size(), 1U);
    std::string coproc_name = (*visitor.coproc_.begin())->var->name_hint;
    // plan sync.
    CoProcSyncPlanner sync_planner(touched, coproc_name);
diff --git a/src/tir/transforms/hoist_if_then_else.cc b/src/tir/transforms/hoist_if_then_else.cc
index 9db800c2a6d2..7bae0ce8ca75 100644
--- a/src/tir/transforms/hoist_if_then_else.cc
+++ b/src/tir/transforms/hoist_if_then_else.cc
@@ -248,7 +248,7 @@ class HoistCandidateSelector final : public StmtExprVisitor {
 private:
  void ResetRecorderInternal() {
    if (is_recorder_on_) {
-      CHECK_GT(ordered_list_.size(), 0);
+      ICHECK_GT(ordered_list_.size(), 0);
      is_recorder_on_ = false;
    }
    ordered_list_.clear();
diff --git a/src/tir/transforms/inject_copy_intrin.cc b/src/tir/transforms/inject_copy_intrin.cc
index b27459f4bd45..f7443c74c0f7 100644
--- a/src/tir/transforms/inject_copy_intrin.cc
+++ b/src/tir/transforms/inject_copy_intrin.cc
@@ -47,7 +47,7 @@ class CopyIntrinInjector : public StmtMutator {
      storage_scope_[buf] = op->value.as<StringImmNode>()->value;
    } else if (op->attr_key == pragma_key_) {
      Stmt ret;
-      CHECK(MatchCopyPattern(op->body, &ret)) << "Cannot match copy pattern of " << op->body;
+      ICHECK(MatchCopyPattern(op->body, &ret)) << "Cannot match copy pattern of " << op->body;
      return ret;
    }
    return StmtMutator::VisitStmt_(op);
@@ -76,7 +76,7 @@ class CopyIntrinInjector : public StmtMutator {
    const CastNode* cast = store->value.as<CastNode>();
    const LoadNode* load = store->value.as<LoadNode>();
    if (0 == loops.size()) {
-      CHECK(!has_cond);
+      ICHECK(!has_cond);
    }
    // for now only support true condition matching
    if (has_cond) {
@@ -112,8 +112,8 @@ class CopyIntrinInjector : public StmtMutator {
      Array<PrimExpr> clip_bound = arith::DetectClipBound(sel_cond.Eval(), loop_vars);
      pad_value = sel_false_value.Eval();
      if (clip_bound.size() == 0) return false;
-      CHECK_EQ(src_shape.size(), loop_vars.size());
-      CHECK_EQ(clip_bound.size(), loop_vars.size() * 2);
+      ICHECK_EQ(src_shape.size(), loop_vars.size());
+      ICHECK_EQ(clip_bound.size(), loop_vars.size() * 2);
      for (size_t i = 0; i < src_shape.size(); ++i) {
        PrimExpr min_value = clip_bound[2 * i];
        PrimExpr max_value = clip_bound[2 * i + 1];
@@ -139,8 +139,8 @@ class CopyIntrinInjector : public StmtMutator {
      }
      src_elem_offset = analyzer_.Simplify(src_elem_offset);
    }
-    CHECK_EQ(load_strides.size(), store_strides.size());
-    CHECK_EQ(load_strides.size(), loop_var_size + 1);
+    ICHECK_EQ(load_strides.size(), store_strides.size());
+    ICHECK_EQ(load_strides.size(), loop_var_size + 1);
    Array<PrimExpr> src_strides(load_strides.begin(), load_strides.begin() + loop_var_size);
    Array<PrimExpr> dst_strides(store_strides.begin(), store_strides.begin() + loop_var_size);
    if (loop_var_size == 0) {
@@ -154,7 +154,7 @@ class CopyIntrinInjector : public StmtMutator {
                      load->buffer_var->name_hint, GetStorageScope(load->buffer_var.get()), 0, 0,
                      kDefault);
    *out = flower_copy_fromto_(src, dst, pad_before, pad_after, pad_value);
-    CHECK(out->defined()) << "flower function did not return correct stmt";
+    ICHECK(out->defined()) << "flower function did not return correct stmt";
    return true;
  }
  // Get storage scope
diff --git a/src/tir/transforms/inject_double_buffer.cc b/src/tir/transforms/inject_double_buffer.cc
index 1eea43d27d46..8de446727a71 100644
--- a/src/tir/transforms/inject_double_buffer.cc
+++ b/src/tir/transforms/inject_double_buffer.cc
@@ -123,7 +123,7 @@ class DoubleBufferInjector : public StmtExprMutator {
      for (PrimExpr e : op->extents) {
        new_extents.push_back(e);
      }
-      CHECK(it->second.loop != nullptr);
+      ICHECK(it->second.loop != nullptr);
      auto& alloc_nest = loop_allocs_[it->second.loop];
      alloc_nest.emplace_back(
          AttrStmt(op->buffer_var, attr::storage_scope, StringImm(it->second.scope), Evaluate(0)));
@@ -143,9 +143,9 @@ class DoubleBufferInjector : public StmtExprMutator {
    const ForNode* old_loop = stmt.as<ForNode>();
    if (split_loop_ != 0) {
      // Explicitly unroll the loop
-      CHECK(split_loop_ % 2 == 0 || split_loop_ == 1)
+      ICHECK(split_loop_ % 2 == 0 || split_loop_ == 1)
          << "It is better to split with multiple of 2";
-      CHECK(is_zero(old_loop->min));
+      ICHECK(is_zero(old_loop->min));
      PrimExpr zero = old_loop->min;
      PrimExpr new_ext = old_loop->extent - make_const(old_loop->loop_var.dtype(), 1);
      PrimExpr factor = make_const(new_ext.dtype(), split_loop_);
@@ -186,8 +186,8 @@ class DoubleBufferInjector : public StmtExprMutator {
    auto it = dbuffer_info_.find(op->buffer_var.get());
    if (it != dbuffer_info_.end()) {
      const StorageEntry& e = it->second;
-      CHECK(in_double_buffer_scope_);
-      CHECK(e.stride.defined());
+      ICHECK(in_double_buffer_scope_);
+      ICHECK(e.stride.defined());
      return Store(op->buffer_var, op->value, e.switch_write_var * e.stride + op->index,
                   op->predicate);
    } else {
@@ -201,8 +201,8 @@ class DoubleBufferInjector : public StmtExprMutator {
    auto it = dbuffer_info_.find(op->buffer_var.get());
    if (it != dbuffer_info_.end()) {
      const StorageEntry& e = it->second;
-      CHECK(e.stride.defined());
-      CHECK(e.switch_read_var.defined());
+      ICHECK(e.stride.defined());
+      ICHECK(e.switch_read_var.defined());
      return Load(op->dtype, op->buffer_var, e.switch_read_var * e.stride + op->index,
                  op->predicate);
    } else {
@@ -211,14 +211,14 @@ class DoubleBufferInjector : public StmtExprMutator {
  }
 
  PrimExpr VisitExpr_(const VarNode* op) final {
-    CHECK(!dbuffer_info_.count(op));
+    ICHECK(!dbuffer_info_.count(op));
    return GetRef<PrimExpr>(op);
  }
 
 private:
  Stmt MakeProducer(const AttrStmtNode* op) {
    const Var buffer = Downcast<Var>(op->node);
-    CHECK_NE(loop_nest_.size(), 0U) << "Double buffer scope must be inside a loop";
+    ICHECK_NE(loop_nest_.size(), 0U) << "Double buffer scope must be inside a loop";
    auto it = dbuffer_info_.find(buffer.get());
    if (it == dbuffer_info_.end()) {
      LOG(WARNING) << "Skip double buffer scope " << op->node;
diff --git a/src/tir/transforms/inject_prefetch.cc b/src/tir/transforms/inject_prefetch.cc
index 4e4f33baed2b..b5c4cf5ec582 100644
--- a/src/tir/transforms/inject_prefetch.cc
+++ b/src/tir/transforms/inject_prefetch.cc
@@ -44,7 +44,7 @@ class PrefetchInjector : public StmtMutator {
    op = ret.as<AttrStmtNode>();
    if (op && op->attr_key == attr::prefetch_scope) {
      Buffer buffer = Downcast<Buffer>(op->node);
-      CHECK_NE(loop_nest_.size(), 0U);
+      ICHECK_NE(loop_nest_.size(), 0U);
      Region domain = DomainTouched(op->body, buffer, true, false);
      Region region;
diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc
index c0a0b08f22a0..9a77449ecfa2 100644
--- a/src/tir/transforms/inject_virtual_thread.cc
+++ b/src/tir/transforms/inject_virtual_thread.cc
@@ -58,8 +58,8 @@ class ExprTouched final : public StmtExprVisitor {
    if (op->op.same_as(builtin::tvm_access_ptr())) {
      const auto* rw_mask = op->args[4].as<IntImmNode>();
      const VarNode* buffer_var = op->args[1].as<VarNode>();
-      CHECK(buffer_var);
-      CHECK(rw_mask);
+      ICHECK(buffer_var);
+      ICHECK(rw_mask);
      // read
      if (rw_mask->value & 1) {
        HandleUseVar(buffer_var);
@@ -182,7 +182,7 @@ class VTInjector : public StmtExprMutator {
        allow_share_(allow_share) {}
  // Inject VTLoop when needed.
  Stmt VisitStmt(const Stmt& s) final {
-    CHECK(!visit_touched_var_);
+    ICHECK(!visit_touched_var_);
    auto stmt = StmtExprMutator::VisitStmt(s);
    if (visit_touched_var_ || trigger_base_inject_) {
      if (!vt_loop_injected_) {
@@ -195,7 +195,7 @@ class VTInjector : public StmtExprMutator {
  }
  // Variable
  PrimExpr VisitExpr_(const VarNode* op) final {
-    CHECK(!alloc_remap_.count(op)) << "Buffer address may get rewritten in virtual thread";
+    ICHECK(!alloc_remap_.count(op)) << "Buffer address may get rewritten in virtual thread";
    if (touched_var_.count(op)) {
      visit_touched_var_ = true;
    }
@@ -221,7 +221,7 @@ class VTInjector : public StmtExprMutator {
  // Expression.
  PrimExpr VisitExpr_(const CallNode* op) final {
    if (op->op.same_as(builtin::tvm_access_ptr())) {
-      CHECK_EQ(op->args.size(), 5U);
+      ICHECK_EQ(op->args.size(), 5U);
      DataType dtype = op->args[0].dtype();
      const VarNode* buffer = op->args[1].as<VarNode>();
      auto it = alloc_remap_.find(buffer);
@@ -290,7 +290,7 @@ class VTInjector : public StmtExprMutator {
  }
  // For
  Stmt VisitStmt_(const ForNode* op) final {
-    CHECK(is_zero(op->min));
+    ICHECK(is_zero(op->min));
    PrimExpr extent = this->VisitExpr(op->extent);
    if (visit_touched_var_ && !vt_loop_injected_) {
      Stmt stmt = InjectVTLoop(GetRef<Stmt>(op), true);
@@ -313,7 +313,7 @@ class VTInjector : public StmtExprMutator {
      return InjectVTLoop(GetRef<Stmt>(op), true);
    }
    visit_touched_var_ = false;
-    CHECK_EQ(max_loop_depth_, 0);
+    ICHECK_EQ(max_loop_depth_, 0);
    Stmt then_case = this->VisitStmt(op->then_case);
    Stmt else_case;
    if (op->else_case.defined()) {
@@ -332,7 +332,7 @@ class VTInjector : public StmtExprMutator {
 
  // Seq
  Stmt VisitStmt_(const SeqStmtNode* op) final {
-    CHECK_EQ(max_loop_depth_, 0);
+    ICHECK_EQ(max_loop_depth_, 0);
    auto fmutate = [this](const Stmt& s) {
      int temp = max_loop_depth_;
      max_loop_depth_ = 0;
@@ -392,7 +392,7 @@ class VTInjector : public StmtExprMutator {
 
  // inject vthread loop
  Stmt InjectVTLoop(Stmt stmt, bool before_mutation) {
-    CHECK(!vt_loop_injected_);
+    ICHECK(!vt_loop_injected_);
    // reset the flags
    visit_touched_var_ = false;
    trigger_base_inject_ = false;
diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc
index d6c7300f2edb..838194203b5b 100644
--- a/src/tir/transforms/ir_utils.cc
+++ b/src/tir/transforms/ir_utils.cc
@@ -38,38 +38,38 @@ Stmt MergeNest(const std::vector<Stmt>& nest, Stmt body) {
    Stmt s = *ri;
    if (const auto* for_ = s.as<ForNode>()) {
      auto n = make_object<ForNode>(*for_);
-      CHECK(is_no_op(n->body));
+      ICHECK(is_no_op(n->body));
      n->body = body;
      body = Stmt(n);
    } else if (const auto* let = s.as<LetStmtNode>()) {
      auto n = make_object<LetStmtNode>(*let);
-      CHECK(is_no_op(n->body));
+      ICHECK(is_no_op(n->body));
      n->body = body;
      body = Stmt(n);
    } else if (const auto* attr = s.as<AttrStmtNode>()) {
      auto n = make_object<AttrStmtNode>(*attr);
-      CHECK(is_no_op(n->body));
+      ICHECK(is_no_op(n->body));
      n->body = body;
      body = Stmt(n);
    } else if (const auto* ite = s.as<IfThenElseNode>()) {
      auto n = make_object<IfThenElseNode>(*ite);
-      CHECK(is_no_op(n->then_case));
-      CHECK(!n->else_case.defined());
+      ICHECK(is_no_op(n->then_case));
+      ICHECK(!n->else_case.defined());
      n->then_case = body;
      body = Stmt(n);
    } else if (const auto* seq = s.as<SeqStmtNode>()) {
      auto n = make_object<SeqStmtNode>(*seq);
-      CHECK(n->size() != 0 && is_no_op(n->seq[n->size() - 1]));
+      ICHECK(n->size() != 0 && is_no_op(n->seq[n->size() - 1]));
      n->seq.Set(n->size() - 1, body);
      body = Stmt(n);
    } else if (const auto* assert_ = s.as<AssertStmtNode>()) {
      auto n = make_object<AssertStmtNode>(*assert_);
-      CHECK(is_no_op(n->body));
+      ICHECK(is_no_op(n->body));
      n->body = body;
      body = Stmt(n);
    } else if (const auto* alloc = s.as<AllocateNode>()) {
      auto n = make_object<AllocateNode>(*alloc);
-      CHECK(is_no_op(n->body));
+      ICHECK(is_no_op(n->body));
      n->body = body;
      body = Stmt(n);
    } else {
@@ -177,7 +177,7 @@ class IRConvertSSA final : public StmtExprMutator {
      Stmt new_alloc = this->VisitStmt(op->body);
      if (new_alloc.same_as(op->body)) return GetRef<Stmt>(op);
      alloc = new_alloc.as<AllocateNode>();
-      CHECK(alloc);
+      ICHECK(alloc);
      return AttrStmt(alloc->buffer_var, op->attr_key, op->value, new_alloc);
    }
  }
diff --git a/src/tir/transforms/ir_utils.h b/src/tir/transforms/ir_utils.h
index eb7a246957d2..3b4e693b820a 100644
--- a/src/tir/transforms/ir_utils.h
+++ b/src/tir/transforms/ir_utils.h
@@ -138,9 +138,9 @@ inline Stmt TVMStructSet(Var handle, int index, builtin::TVMStructFieldKind kind
 */
 inline DataType APIType(DataType t) {
   if (t.is_handle()) return t;
-  CHECK_EQ(t.lanes(), 1) << "Cannot pass vector type through packed API.";
+  ICHECK_EQ(t.lanes(), 1) << "Cannot pass vector type through packed API.";
   if (t.is_uint() || t.is_int()) return DataType::Int(64);
-  CHECK(t.is_float());
+  ICHECK(t.is_float());
   return DataType::Float(64);
 }
diff --git a/src/tir/transforms/lift_attr_scope.cc b/src/tir/transforms/lift_attr_scope.cc
index 44b121a7b559..27dd583b8b42 100644
--- a/src/tir/transforms/lift_attr_scope.cc
+++ b/src/tir/transforms/lift_attr_scope.cc
@@ -88,7 +88,7 @@ class AttrScopeLifter : public StmtMutator {
    if (attr_node.size() == 0) return ret;
 
    op = ret.as<SeqStmtNode>();
-    CHECK(op != nullptr);
+    ICHECK(op != nullptr);
    Array<Stmt> reorg;
    // check if all decorations are common.
    for (size_t begin = 0; begin < attr_node.size();) {
diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc
index 68c43fac1170..ab567dc0e417 100644
--- a/src/tir/transforms/loop_partition.cc
+++ b/src/tir/transforms/loop_partition.cc
@@ -121,7 +121,7 @@ class CandidateSelector final : public StmtExprVisitor {
  void VisitStmt_(const AttrStmtNode* op) final {
    if (op->attr_key == attr::thread_extent) {
      const IterVarNode* iv = op->node.as<IterVarNode>();
-      CHECK(iv);
+      ICHECK(iv);
      Var var = iv->var;
      runtime::ThreadScope scope = runtime::ThreadScope::Create(iv->thread_tag);
      if ((scope.rank == 0) && (!is_const_int(op->value) || partition_const_loop_)) {
@@ -210,7 +210,7 @@ class PartitionFinder : public StmtExprVisitor {
    // handle thread_axis
    if (op->attr_key == attr::thread_extent) {
      const IterVarNode* thread_axis = op->node.as<IterVarNode>();
-      CHECK(thread_axis);
+      ICHECK(thread_axis);
      const VarNode* var = thread_axis->var.get();
      IntSet dom = IntSet::FromRange(Range(make_zero(op->value.dtype()), op->value));
      hint_map_.insert({var, dom});
@@ -363,7 +363,7 @@ class LoopPartitioner : public StmtMutator {
    }
 
    const IterVarNode* iv = op->node.as<IterVarNode>();
-    CHECK(iv);
+    ICHECK(iv);
    Var var = iv->var;
    auto as = GetRef<Stmt>(op);
    if (selector.candidates.count(as)) {
@@ -595,7 +595,7 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim
 
 inline Stmt LoopPartitioner::MakeFor(const Object* node, PrimExpr extent, Stmt body) {
   const ForNode* for_node = static_cast<const ForNode*>(node);
-  CHECK(for_node);
+  ICHECK(for_node);
   if (analyzer_.CanProve(extent == make_const(DataType::Int(32), 1))) {
     // If the loop extent is 1, do not create the loop anymore
     return Substitute(body, {{Var{for_node->loop_var}, make_const(DataType::Int(32), 0)}});
@@ -609,7 +609,7 @@ class RemoveLikelyTags : public StmtExprMutator {
 public:
  PrimExpr VisitExpr_(const CallNode* op) final {
    if (op->op.same_as(builtin::likely())) {
-      CHECK_EQ(op->args.size(), 1);
+      ICHECK_EQ(op->args.size(), 1);
      return StmtExprMutator::VisitExpr(op->args[0]);
    } else {
      return StmtExprMutator::VisitExpr_(op);
diff --git a/src/tir/transforms/lower_custom_datatypes.cc b/src/tir/transforms/lower_custom_datatypes.cc
index a0faa17fbcc3..a3e5a920a0b2 100644
--- a/src/tir/transforms/lower_custom_datatypes.cc
+++ b/src/tir/transforms/lower_custom_datatypes.cc
@@ -53,9 +53,9 @@ class CustomDatatypesLowerer : public StmtExprMutator {
    PrimExpr expr = StmtExprMutator::VisitExpr_(op);
    if (toBeLowered) {
      auto lower = datatype::GetCastLowerFunc(target_, type_code, src_type_code);
-      CHECK(lower) << "Cast lowering function for target " << target_ << " destination type "
-                   << static_cast<unsigned>(type_code) << " source type "
-                   << static_cast<unsigned>(src_type_code) << " not found";
+      ICHECK(lower) << "Cast lowering function for target " << target_ << " destination type "
+                    << static_cast<unsigned>(type_code) << " source type "
+                    << static_cast<unsigned>(src_type_code) << " not found";
      return (*lower)(expr);
    }
    return expr;
@@ -66,8 +66,8 @@ class CustomDatatypesLowerer : public StmtExprMutator {
    auto e = GetRef<FloatImm>(imm);
    if (datatype::Registry::Global()->GetTypeRegistered(type_code)) {
      auto lower = datatype::GetFloatImmLowerFunc(target_, type_code);
-      CHECK(lower) << "FloatImm lowering function for target " << target_ << " type "
-                   << static_cast<unsigned>(type_code) << " not found";
+      ICHECK(lower) << "FloatImm lowering function for target " << target_ << " type "
+                    << static_cast<unsigned>(type_code) << " not found";
      return (*lower)(e);
    }
    return e;
@@ -103,11 +103,11 @@ class CustomDatatypesLowerer : public StmtExprMutator {
    call = expr.as<CallNode>();
    if (toBeLowered) {
      auto op = call->op.as<OpNode>();
-      CHECK(op != nullptr) << "Lowering non-intrinsic Calls not implemented";
+      ICHECK(op != nullptr) << "Lowering non-intrinsic Calls not implemented";
      auto lower = datatype::GetIntrinLowerFunc(target_, op->name, call->dtype.code());
-      CHECK(lower) << "Intrinsic lowering function for target " << target_ << ", intrinsic name "
-                   << op->name << ", type " << static_cast<unsigned>(call->dtype.code())
-                   << " not found";
+      ICHECK(lower) << "Intrinsic lowering function for target " << target_ << ", intrinsic name "
+                    << op->name << ", type " << static_cast<unsigned>(call->dtype.code())
+                    << " not found";
      return (*lower)(expr);
    }
    return expr;
@@ -121,8 +121,8 @@ class CustomDatatypesLowerer : public StmtExprMutator {
    op = expr.as<OP>();                                                     \
    if (toBeLowered) {                                                      \
      auto lower = datatype::Get##OP##LowerFunc(target_, type_code);        \
-      CHECK(lower) << #OP " lowering function for target " << target_ << " type " \
-                   << static_cast<unsigned>(type_code) << " not found";    \
+      ICHECK(lower) << #OP " lowering function for target " << target_ << " type " \
+                    << static_cast<unsigned>(type_code) << " not found";   \
      return (*lower)(expr);                                                \
    }                                                                       \
    return expr;                                                            \
@@ -153,7 +153,7 @@ Pass LowerCustomDatatypes() {
  auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) {
    auto* n = f.CopyOnWrite();
    auto target = f->GetAttr<Target>(tvm::attr::kTarget);
-    CHECK(target.defined()) << "LowerCustomDatatypes: Require the target attribute";
+    ICHECK(target.defined()) << "LowerCustomDatatypes: Require the target attribute";
 
    n->body = CustomDatatypesLowerer(target.value()->kind->name)(std::move(n->body));
    return f;
diff --git a/src/tir/transforms/lower_device_storage_access_info.cc b/src/tir/transforms/lower_device_storage_access_info.cc
index 3b317e3f9968..829b7d822d11 100644
--- a/src/tir/transforms/lower_device_storage_access_info.cc
+++ b/src/tir/transforms/lower_device_storage_access_info.cc
@@ -49,7 +49,7 @@ class StorageAccessInfoLower : public StmtExprMutator {
    if (it != storage_info_.end() && it->second.info.defined()) {
      const MemoryInfo& info = it->second.info;
      ++it->second.alloc_count;
-      CHECK_LE(it->second.alloc_count, 1)
+      ICHECK_LE(it->second.alloc_count, 1)
          << "Double allocation of " << it->second.scope.to_string();
 
      if (info->head_address.defined()) {
@@ -69,7 +69,7 @@ class StorageAccessInfoLower : public StmtExprMutator {
      e.scope = scope;
      if (scope.tag.length() != 0) {
        e.info = GetMemoryInfo(op->value.as<StringImmNode>()->value);
-        CHECK(e.info.defined()) << "Cannot find memory info of " << scope.to_string();
+        ICHECK(e.info.defined()) << "Cannot find memory info of " << scope.to_string();
      }
      storage_info_[buf] = e;
      return StmtExprMutator::VisitStmt_(op);
@@ -93,7 +93,7 @@ class StorageAccessInfoLower : public StmtExprMutator {
    // Specially handle the buffer packed intrinsic
    PrimExpr expr = StmtExprMutator::VisitExpr_(op);
    op = expr.as<CallNode>();
-    CHECK_EQ(op->args.size(), 5U);
+    ICHECK_EQ(op->args.size(), 5U);
    DataType dtype = op->args[0].dtype();
    const VarNode* buffer = op->args[1].as<VarNode>();
    Var buffer_var = Downcast<Var>(op->args[1]);
@@ -102,7 +102,7 @@ class StorageAccessInfoLower : public StmtExprMutator {
    if (it != storage_info_.end() && it->second.info.defined()) {
      return MakeTaggedAccessPtr(op->dtype, buffer_var, dtype, offset, it->second.info);
    }
-    CHECK(op->dtype.is_handle());
+    ICHECK(op->dtype.is_handle());
    // Change to address_of
    return AddressOffset(buffer_var, dtype, offset);
  }
@@ -110,11 +110,11 @@ class StorageAccessInfoLower : public StmtExprMutator {
  PrimExpr MakeTaggedAccessPtr(DataType ptr_type, Var buffer_var, DataType dtype, PrimExpr offset,
                               const MemoryInfo& info) {
    if (ptr_type.is_handle()) {
-      CHECK(info->head_address.defined()) << buffer_var << " is not adddressable.";
+      ICHECK(info->head_address.defined()) << buffer_var << " is not adddressable.";
      return AddressOffset(buffer_var, dtype, offset);
    }
    int dtype_bits = dtype.bits() * dtype.lanes();
-    CHECK_EQ(info->unit_bits % dtype_bits, 0);
+    ICHECK_EQ(info->unit_bits % dtype_bits, 0);
    return cast(ptr_type,
                analyzer_.Simplify(offset / make_const(offset.dtype(), info->unit_bits / dtype_bits)));
  }
diff --git a/src/tir/transforms/lower_intrin.cc b/src/tir/transforms/lower_intrin.cc
index 8774fc37125f..cd7c10ffa688 100644
--- a/src/tir/transforms/lower_intrin.cc
+++ b/src/tir/transforms/lower_intrin.cc
@@ -86,7 +86,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
    if (op == nullptr) return ret;
    int shift;
    const DataType& dtype = op->dtype;
-    CHECK(dtype.is_int() || dtype.is_uint());
+    ICHECK(dtype.is_int() || dtype.is_uint());
 
    if (support_bitwise_op_ && is_const_power_of_two_integer(op->b, &shift)) {
      // lower to right shift if possible.
@@ -138,7 +138,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
      // Lower floordiv to native truncdiv.
      int shift;
      const DataType& dtype = op->dtype;
-      CHECK(dtype.is_int() || dtype.is_uint());
+      ICHECK(dtype.is_int() || dtype.is_uint());
 
      if (support_bitwise_op_ && is_const_power_of_two_integer(op->b, &shift)) {
        // lower to masking if possible.
@@ -281,7 +281,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
    // if pattern exists.
if (f != nullptr) { PrimExpr r = (*f)(e); - CHECK(r.defined()) << "intrinsic rule must always return valid Expr"; + ICHECK(r.defined()) << "intrinsic rule must always return valid Expr"; if (!r.same_as(e)) { return this->VisitExpr(r); } @@ -307,7 +307,7 @@ Pass LowerIntrin() { auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { auto* n = f.CopyOnWrite(); auto target = f->GetAttr(tvm::attr::kTarget); - CHECK(target.defined()) << "LowerIntrin: Require the target attribute"; + ICHECK(target.defined()) << "LowerIntrin: Require the target attribute"; arith::Analyzer analyzer; auto mtriple = target.value()->GetAttr("mtriple", ""); n->body = diff --git a/src/tir/transforms/lower_thread_allreduce.cc b/src/tir/transforms/lower_thread_allreduce.cc index 720c9d0a67e0..c24e26b58db0 100644 --- a/src/tir/transforms/lower_thread_allreduce.cc +++ b/src/tir/transforms/lower_thread_allreduce.cc @@ -59,7 +59,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { } } else if (op->attr_key == attr::reduce_scope) { const CommReducerNode* combiner = op->node.as(); - CHECK(combiner); + ICHECK(combiner); reduce_combiner_.push_back(combiner); Stmt ret = StmtExprMutator::VisitStmt_(op); reduce_combiner_.pop_back(); @@ -101,7 +101,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { PrimExpr VisitExpr_(const LoadNode* op) final { auto it = load_remap_.find(op->buffer_var.get()); if (it != load_remap_.end()) { - CHECK(is_zero(op->index)); + ICHECK(is_zero(op->index)); return it->second; } else { return StmtExprMutator::VisitExpr_(op); @@ -122,13 +122,13 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { // make allreduce. Stmt MakeAllreduce(const CallNode* call) { - CHECK(!reduce_combiner_.empty()); + ICHECK(!reduce_combiner_.empty()); const CommReducerNode* combiner = reduce_combiner_.back(); size_t size = combiner->result.size(); const IntImmNode* size_of_args = call->args[0].as(); - CHECK(size_of_args) << call->args[0]->GetTypeKey(); - CHECK_EQ(size, size_of_args->value); + ICHECK(size_of_args) << call->args[0]->GetTypeKey(); + ICHECK_EQ(size, size_of_args->value); Array inits = combiner->identity_element; std::vector values(size); std::vector types(size); @@ -143,7 +143,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { std::vector buffers(size); for (size_t idx = 0; idx < size; ++idx) { const VarNode* buffer = call->args[2 + size + idx].as(); - CHECK(buffer); + ICHECK(buffer); buffers[idx] = buffer; } @@ -156,7 +156,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { if (v) { reduce_set.insert(v); } else { - CHECK(call->args[i].as() && call->args[i].as()->value == 0) + ICHECK(call->args[i].as() && call->args[i].as()->value == 0) << "arg" << i << "should be a VarNode or IntImmNode"; } } @@ -168,11 +168,11 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { IterVar iv = Downcast(attr->node); e.scope = runtime::ThreadScope::Create(iv->thread_tag); e.iv = iv; - CHECK_LE(e.scope.rank, 1); - CHECK_GE(e.scope.dim_index, 0) << "vthread do not work with cross thread reduction"; + ICHECK_LE(e.scope.rank, 1); + ICHECK_GE(e.scope.dim_index, 0) << "vthread do not work with cross thread reduction"; if (e.scope.rank == 1) { const auto* ptr = attr->value.as(); - CHECK(ptr) << "Need constant extent for reduce set " << iv; + ICHECK(ptr) << "Need constant extent for reduce set " << iv; e.extent = static_cast(ptr->value); // ignore variables equal to 0 if (e.extent == 1) { @@ -187,7 +187,7 @@ class ThreadAllreduceBuilder final : public 
StmtExprMutator { } } } - CHECK_EQ(nmatch, reduce_set.size()) << "Not all reduce indices are present in the context"; + ICHECK_EQ(nmatch, reduce_set.size()) << "Not all reduce indices are present in the context"; std::sort(vred.begin(), vred.end()); std::sort(vpar.begin(), vpar.end()); // the size of each index. @@ -216,7 +216,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { // if (is_warp_reduction(types)) { // TODO(tvm-team) sub-warp reduction support. - CHECK_EQ(reduce_extent, warp_size_) << "not a warp reduction"; + ICHECK_EQ(reduce_extent, warp_size_) << "not a warp reduction"; // // This is the index to the reduction variable, one reduction // variable per warp. Local scope seems easier to reason without @@ -309,7 +309,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { // Update existing allocations. for (size_t i = 0; i < size; ++i) { - CHECK(!load_remap_.count(buffers[i])); + ICHECK(!load_remap_.count(buffers[i])); PrimExpr pred = const_true(types[i].lanes()); Var var = shared_bufs[i]; load_remap_[buffers[i]] = Load(types[i], var, index, pred); @@ -347,7 +347,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { seq.emplace_back(MakeBufAllreduce(combiner, types, shared_bufs, reduce_index, group_index, reduce_extent, threadx_extent)); for (size_t idx = 0; idx < size; ++idx) { - CHECK(!load_remap_.count(buffers[idx])); + ICHECK(!load_remap_.count(buffers[idx])); PrimExpr pred = const_true(types[idx].lanes()); load_remap_[buffers[idx]] = Load(types[idx], shared_bufs[idx], @@ -380,7 +380,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { while (reduce_extent > reduce_align) { reduce_align = reduce_align << 1; } - CHECK_GT(reduce_align, 1); + ICHECK_GT(reduce_align, 1); std::vector seq; size_t size = shared_bufs.size(); @@ -409,7 +409,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { seq.emplace_back(IfThenElse(cond, freduce(reduce_align))); seq.emplace_back(SyncThread("shared")); } - CHECK(threadx_extent >= 1 && warp_size_ >= 1); + ICHECK(threadx_extent >= 1 && warp_size_ >= 1); // normal synchronization while (reduce_align > threadx_extent || reduce_align > warp_size_) { reduce_align = reduce_align >> 1; @@ -446,7 +446,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { if (ret.defined()) { ret = ret + e.iv->var * total_extent; } else { - CHECK_EQ(total_extent, 1); + ICHECK_EQ(total_extent, 1); ret = e.iv->var; } total_extent *= e.extent; @@ -547,7 +547,7 @@ Pass LowerThreadAllreduce() { auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { auto* n = f.CopyOnWrite(); auto target = f->GetAttr(tvm::attr::kTarget); - CHECK(target.defined()) << "LowerThreadAllreduce: Require the target attribute"; + ICHECK(target.defined()) << "LowerThreadAllreduce: Require the target attribute"; const TargetNode* target_node = target.as(); n->body = ThreadAllreduceBuilder(target_node)(n->body); return f; diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 39e6640eece6..1d12d57d10b4 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -35,7 +35,7 @@ namespace tvm { namespace tir { inline PrimExpr ConstInt32(size_t index) { - CHECK_LE(index, std::numeric_limits::max()); + ICHECK_LE(index, std::numeric_limits::max()); return make_const(DataType::Int(32), static_cast(index)); } @@ -70,8 +70,8 @@ class BuiltinLower : public StmtExprMutator { Stmt VisitStmt(const Stmt& s) final { auto stmt =
StmtExprMutator::VisitStmt(s); - CHECK_EQ(run_shape_stack_, -1); - CHECK_EQ(run_array_stack_, 0); + ICHECK_EQ(run_shape_stack_, -1); + ICHECK_EQ(run_array_stack_, 0); if (prep_seq_.size() != 0) { Stmt ret = SeqStmt::Flatten(prep_seq_, stmt); @@ -102,8 +102,8 @@ class BuiltinLower : public StmtExprMutator { for (size_t i = 0; i < op->extents.size(); ++i) { total_bytes = total_bytes * op->extents[i]; } - CHECK(device_type_.defined()) << "Unknown device type in current IR"; - CHECK(device_id_.defined()) << "Unknown device id in current IR"; + ICHECK(device_type_.defined()) << "Unknown device type in current IR"; + ICHECK(device_id_.defined()) << "Unknown device id in current IR"; Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {})); Stmt body = SeqStmt({IfThenElse(Call(DataType::Bool(1), builtin::isnullptr(), {op->buffer_var}), @@ -129,11 +129,11 @@ class BuiltinLower : public StmtExprMutator { Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::device_context_id) { - CHECK(!device_id_.defined()); + ICHECK(!device_id_.defined()); device_id_ = op->value; return this->VisitStmt(op->body); } else if (op->attr_key == attr::device_context_type) { - CHECK(!device_type_.defined()); + ICHECK(!device_type_.defined()); device_type_ = op->value; return this->VisitStmt(op->body); } else { @@ -202,8 +202,8 @@ class BuiltinLower : public StmtExprMutator { } prep_seq_.emplace_back(TVMStructSet(stack_array_, idx, builtin::kArrByteOffset, cast(DataType::UInt(64), byte_offset))); - CHECK(device_type_.defined()) << "Unknown device type in current IR"; - CHECK(device_id_.defined()) << "Unknown device id in current IR"; + ICHECK(device_type_.defined()) << "Unknown device type in current IR"; + ICHECK(device_id_.defined()) << "Unknown device id in current IR"; prep_seq_.emplace_back(TVMStructSet(stack_array_, idx, builtin::kArrDeviceId, cast(DataType::Int(32), device_id_))); prep_seq_.emplace_back(TVMStructSet(stack_array_, idx, builtin::kArrDeviceType, @@ -256,7 +256,7 @@ class BuiltinLower : public StmtExprMutator { size_t arg_stack_begin = run_arg_stack_; run_arg_stack_ += op->args.size(); size_t args_size = op->args.size(); - CHECK_GT(args_size, 0); + ICHECK_GT(args_size, 0); PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); for (size_t i = 1; i < op->args.size(); ++i) { @@ -270,7 +270,7 @@ class BuiltinLower : public StmtExprMutator { prep_seq_.emplace_back(TVMStructSet(stack_value_, static_cast(arg_stack_begin + i - 1), builtin::kTVMValueContent, arg)); int arg_tcode = api_type.code(); - CHECK(!IsArrayHandle(arg)) << "Trace does not support Buffers"; + ICHECK(!IsArrayHandle(arg)) << "Trace does not support Buffers"; prep_seq_.emplace_back( Store(stack_tcode_, ConstInt32(arg_tcode), stack_index, const_true(1))); } diff --git a/src/tir/transforms/lower_warp_memory.cc b/src/tir/transforms/lower_warp_memory.cc index cb6c609ef657..b95681a936ca 100644 --- a/src/tir/transforms/lower_warp_memory.cc +++ b/src/tir/transforms/lower_warp_memory.cc @@ -117,7 +117,7 @@ class WarpStoreCoeffFinder : private StmtVisitor { UpdatePattern(op->index); } else { arith::PVar base; - CHECK(arith::ramp(base, 1, op->value.dtype().lanes()).Match(op->index)) + ICHECK(arith::ramp(base, 1, op->value.dtype().lanes()).Match(op->index)) << "LowerWarpMemory failed due to store index=" << op->index << ", can only handle continuous store"; UpdatePattern(base.Eval()); @@ -129,20 +129,20 @@ class WarpStoreCoeffFinder : private StmtVisitor { void UpdatePattern(const 
PrimExpr& index) { Array m = arith::DetectLinearEquation(index, {warp_index_}); - CHECK_EQ(m.size(), 2U) + ICHECK_EQ(m.size(), 2U) << "LowerWarpMemory failed. Could not simplify the store index `" << index << "` into the form ax + by + cz + ... Warp memory is approximated by storing values in " "thread local registers and shuffling values between these registers. Currently only " "linear equation indices are supported."; PrimExpr mcoeff = analyzer_->canonical_simplify(m[0]); const auto* mcoeff_as_int = mcoeff.as(); - CHECK(mcoeff_as_int && mcoeff_as_int->value > 0) + ICHECK(mcoeff_as_int && mcoeff_as_int->value > 0) << "LowerWarpMemory failed due to store index=" << index << ", require positive constant coefficient on warp index " << warp_index_ << " but get " << mcoeff; if (warp_coeff_ != 0) { - CHECK_EQ(warp_coeff_, mcoeff_as_int->value) + ICHECK_EQ(warp_coeff_, mcoeff_as_int->value) << "LowerWarpMemory failed due to two different store coefficients to warp index"; } else { warp_coeff_ = mcoeff_as_int->value; @@ -166,7 +166,7 @@ class WarpIndexFinder : private StmtVisitor { // find the warp co-efficient and the shuffle width in the statement std::pair Find(const Stmt& stmt) { this->VisitStmt(stmt); - CHECK(warp_index_.defined()) + ICHECK(warp_index_.defined()) << "Cannot find warp index(threadIdx.x) within the scope of warp memory"; return std::make_pair(warp_index_->var, width_); } @@ -178,14 +178,14 @@ class WarpIndexFinder : private StmtVisitor { IterVar iv = Downcast(op->node); if (iv->thread_tag == "threadIdx.x") { auto* value_as_int = op->value.as(); - CHECK(value_as_int && value_as_int->value <= warp_size_ && - warp_size_ % value_as_int->value == 0) + ICHECK(value_as_int && value_as_int->value <= warp_size_ && + warp_size_ % value_as_int->value == 0) << "Expect threadIdx.x's size to be no larger than, and a factor of" << " warp size(" << warp_size_ << ")" << " to enable warp memory" << " but get " << op->value << " instead"; if (warp_index_.defined()) { - CHECK(warp_index_.same_as(iv)) + ICHECK(warp_index_.same_as(iv)) << "Find two instances of " << warp_index_->thread_tag << " in the same kernel. 
" << "Please create it using thread_axis once and reuse the axis " << "across multiple binds in the same kernel"; @@ -214,7 +214,7 @@ class WarpAccessRewriter : protected StmtExprMutator { Stmt Rewrite(const AllocateNode* op) { buffer_ = op->buffer_var.get(); int alloc_size = op->constant_allocation_size(); - CHECK_GT(alloc_size, 0) << "warp memory only support constant alloc size"; + ICHECK_GT(alloc_size, 0) << "warp memory only support constant alloc size"; alloc_size *= op->dtype.lanes(); std::tie(warp_index_, width_) = WarpIndexFinder(warp_size_).Find(op->body); warp_coeff_ = WarpStoreCoeffFinder(buffer_, warp_index_, analyzer_).Find(op->body); @@ -231,7 +231,7 @@ class WarpAccessRewriter : protected StmtExprMutator { protected: PrimExpr VisitExpr_(const VarNode* op) override { - CHECK(op != buffer_) << "Cannot access address of warp memory directly"; + ICHECK(op != buffer_) << "Cannot access address of warp memory directly"; return StmtExprMutator::VisitExpr_(op); } @@ -250,7 +250,7 @@ class WarpAccessRewriter : protected StmtExprMutator { PrimExpr local_index, group; std::tie(local_index, group) = SplitIndexByGroup(op->index); // invariance: local index must do not contain warp id - CHECK(!ExprUseVar(local_index, warp_index_)) + ICHECK(!ExprUseVar(local_index, warp_index_)) << "LowerWarpMemory failed to rewrite load to shuffle for index " << op->index << " local_index=" << local_index; PrimExpr load_value = Load(op->dtype, op->buffer_var, local_index, op->predicate); @@ -271,7 +271,7 @@ class WarpAccessRewriter : protected StmtExprMutator { PrimExpr local_index, group; arith::PVar base; - CHECK(arith::ramp(base, 1, index.dtype().lanes()).Match(index)); + ICHECK(arith::ramp(base, 1, index.dtype().lanes()).Match(index)); std::tie(local_index, group) = SplitIndexByGroup(base.Eval()); local_index = Ramp(local_index, make_const(local_index.dtype(), 1), index.dtype().lanes()); @@ -326,7 +326,7 @@ class BindVarBoundInfo : public StmtVisitor { void VisitStmt_(const AttrStmtNode* op) { if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) { IterVar iv = Downcast(op->node); - CHECK_NE(iv->thread_tag.length(), 0U); + ICHECK_NE(iv->thread_tag.length(), 0U); if (!var_dom_.count(iv->var.get())) { Range dom = Range::FromMinExtent(0, op->value); var_dom_[iv->var.get()] = dom; @@ -395,7 +395,7 @@ Pass LowerWarpMemory() { auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { auto* n = f.CopyOnWrite(); auto target = f->GetAttr(tvm::attr::kTarget); - CHECK(target.defined()) << "LowerWarpMemory: Require the target attribute"; + ICHECK(target.defined()) << "LowerWarpMemory: Require the target attribute"; int warp_size = target.value()->GetAttr("thread_warp_size", 1).value(); n->body = WarpMemoryRewriter(warp_size).Rewrite(std::move(n->body)); return f; diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 3cacf52d90d2..7c4a8ef92724 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -47,10 +47,10 @@ inline Stmt MakeAssertEQ(PrimExpr lhs, PrimExpr rhs, std::string msg) { PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { auto global_symbol = func->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol) << "MakePackedAPI: Expect PrimFunc to have the global_symbol attribute"; + ICHECK(global_symbol) << "MakePackedAPI: Expect PrimFunc to have the global_symbol attribute"; auto target = func->GetAttr(tvm::attr::kTarget); - CHECK(target.defined()) << "MakePackedAPI: 
Require the target attribute"; + ICHECK(target.defined()) << "MakePackedAPI: Require the target attribute"; int target_device_type = target.value()->kind->device_type; std::string name_hint = global_symbol.value(); @@ -58,7 +58,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { auto* func_ptr = func.CopyOnWrite(); const Stmt nop = Evaluate(0); int num_args = static_cast(func_ptr->params.size()); - CHECK_LE(num_unpacked_args, num_args); + ICHECK_LE(num_unpacked_args, num_args); int num_packed_args = num_args - num_unpacked_args; // Data field definitions @@ -143,7 +143,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { msg << name_hint << ": Expect arg[" << i << "] to be int"; seq_check.emplace_back(AssertStmt(tcode == kDLInt, tvm::tir::StringImm(msg.str()), nop)); } else { - CHECK(t.is_float()); + ICHECK(t.is_float()); std::ostringstream msg; msg << name_hint << ": Expect arg[" << i << "] to be float"; seq_check.emplace_back(AssertStmt(tcode == kDLFloat, tvm::tir::StringImm(msg.str()), nop)); @@ -161,7 +161,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { } size_t expected_nargs = num_unpacked_args + (num_packed_args != 0 ? 6 : 0); - CHECK_EQ(args.size(), expected_nargs); + ICHECK_EQ(args.size(), expected_nargs); // Arg definitions are defined before buffer binding to avoid the use before // def errors. diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc index 4d6aa88ede01..0b248959ec6e 100644 --- a/src/tir/transforms/narrow_datatype.cc +++ b/src/tir/transforms/narrow_datatype.cc @@ -105,7 +105,7 @@ class DataTypeVisitor final : public StmtExprVisitor { void VisitStmt_(const AttrStmtNode* op) { if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) { IterVar iv = Downcast(op->node); - CHECK_NE(iv->thread_tag.length(), 0U); + ICHECK_NE(iv->thread_tag.length(), 0U); analyzer_.Bind(iv->var, Range::FromMinExtent(0, op->value)); vextent_[iv->var.as()] = op->value.dtype(); StmtExprVisitor::VisitStmt_(op); @@ -216,8 +216,8 @@ class DataTypeRewriter : public StmtExprMutator { Stmt VisitStmt_(const ForNode* op) final { Stmt s = StmtExprMutator::VisitStmt_(op); op = s.as(); - CHECK(op != nullptr) << "Expected type to be ForNode" - << ", but get " << s->GetTypeKey(); + ICHECK(op != nullptr) << "Expected type to be ForNode" + << ", but get " << s->GetTypeKey(); PrimExpr e = VisitExpr(op->loop_var); Var var = Downcast(e); return For(var, cast(var.dtype(), op->min), cast(var.dtype(), op->extent), op->for_type, @@ -228,11 +228,11 @@ class DataTypeRewriter : public StmtExprMutator { if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) { Stmt s = StmtExprMutator::VisitStmt_(op); op = s.as(); - CHECK(op != nullptr) << "Expected type to be AttrStmtNode" - << ", but get " << s->GetTypeKey(); + ICHECK(op != nullptr) << "Expected type to be AttrStmtNode" + << ", but get " << s->GetTypeKey(); const IterVarNode* iv = op->node.as(); - CHECK(iv != nullptr) << "Expected type to be IterVarNode" - << ", but get " << op->node->GetTypeKey(); + ICHECK(iv != nullptr) << "Expected type to be IterVarNode" + << ", but get " << op->node->GetTypeKey(); PrimExpr e = VisitExpr(iv->var); Var var = Downcast(e); if (ivmap_.find(iv) == ivmap_.end()) { @@ -284,8 +284,8 @@ class DataTypeRewriter : public StmtExprMutator { if (is_index_ && visitor_.vmap.find(op) != visitor_.vmap.end()) { PrimExpr e = StmtExprMutator::VisitExpr_(op); const CastNode* new_op = e.as(); - CHECK(new_op != 
nullptr) << "Expected type to be CastNode" - << ", but get " << e->GetTypeKey(); + ICHECK(new_op != nullptr) << "Expected type to be CastNode" + << ", but get " << e->GetTypeKey(); return Cast(visitor_.vmap[op], new_op->value); } return StmtExprMutator::VisitExpr_(op); @@ -353,8 +353,8 @@ DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(GENode, operator>=); PrimExpr DataTypeRewriter::VisitExpr_(const CallNode* op) { PrimExpr e = StmtExprMutator::VisitExpr_(op); op = e.as(); - CHECK(op != nullptr) << "Expected type to be CallNode" - << ", but get " << e->GetTypeKey(); + ICHECK(op != nullptr) << "Expected type to be CallNode" + << ", but get " << e->GetTypeKey(); if (op->op.same_as(builtin::if_then_else())) { return if_then_else(op->args[0], op->args[1], op->args[2]); diff --git a/src/tir/transforms/remap_thread_axis.cc b/src/tir/transforms/remap_thread_axis.cc index 017d1b4e6c67..e101e6b904ce 100644 --- a/src/tir/transforms/remap_thread_axis.cc +++ b/src/tir/transforms/remap_thread_axis.cc @@ -41,7 +41,7 @@ class ThreadAxisRewriter : private StmtExprMutator { Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::thread_extent) { IterVar iv = Downcast(op->node); - CHECK_NE(iv->thread_tag.length(), 0U); + ICHECK_NE(iv->thread_tag.length(), 0U); auto it = tmap_.find(iv->thread_tag); if (it != tmap_.end()) { const IterVar& new_iv = it->second; @@ -49,7 +49,7 @@ class ThreadAxisRewriter : private StmtExprMutator { if (!vmap_.count(v)) { vmap_[v] = new_iv->var; } else { - CHECK(vmap_[v].same_as(new_iv->var)); + ICHECK(vmap_[v].same_as(new_iv->var)); } Stmt body = this->VisitStmt(op->body); return AttrStmt(new_iv, op->attr_key, op->value, body); @@ -76,7 +76,7 @@ PrimFunc RemapThreadAxis(PrimFunc&& f, Map thread_map) } auto opt_thread_axis = f->GetAttr>(tir::attr::kDeviceThreadAxis); - CHECK(opt_thread_axis != nullptr) << "Require attribute " << tir::attr::kDeviceThreadAxis; + ICHECK(opt_thread_axis != nullptr) << "Require attribute " << tir::attr::kDeviceThreadAxis; auto thread_axis = opt_thread_axis.value(); auto* n = f.CopyOnWrite(); diff --git a/src/tir/transforms/remove_no_op.cc b/src/tir/transforms/remove_no_op.cc index baa1c3c368fd..aae1749b27db 100644 --- a/src/tir/transforms/remove_no_op.cc +++ b/src/tir/transforms/remove_no_op.cc @@ -97,7 +97,7 @@ class NoOpRemover : public StmtMutator { Stmt VisitStmt_(const SeqStmtNode* op) final { Stmt ret = StmtMutator::VisitSeqStmt_(op, true); op = ret.as(); - CHECK(op != nullptr); + ICHECK(op != nullptr); bool need_compact = false; for (size_t i = 0; i < op->size(); ++i) { if (is_no_op(op->seq[i])) need_compact = true; diff --git a/src/tir/transforms/split_host_device.cc b/src/tir/transforms/split_host_device.cc index c121285e2314..921c7ad79509 100644 --- a/src/tir/transforms/split_host_device.cc +++ b/src/tir/transforms/split_host_device.cc @@ -43,7 +43,7 @@ class VarUseDefAnalysis : public StmtExprMutator { Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::thread_extent) { IterVar iv = Downcast(op->node); - CHECK_NE(iv->thread_tag.length(), 0U); + ICHECK_NE(iv->thread_tag.length(), 0U); // thread_extent can appear multiple times // use the first appearance as def. 
if (!use_count_.count(iv->var.get())) { @@ -108,7 +108,7 @@ class VarUseDefAnalysis : public StmtExprMutator { auto it = let_binding_.find(op->var); PrimExpr value = this->VisitExpr(op->value); if (it != let_binding_.end()) { - CHECK(deep_equal_(it->second->value, value)) + ICHECK(deep_equal_(it->second->value, value)) << "Let cannot bind the same var to two different values"; return GetRef(it->second); } else { @@ -147,16 +147,16 @@ class VarUseDefAnalysis : public StmtExprMutator { } void HandleDef(const VarNode* v) { - CHECK(!def_count_.count(v)) << "variable " << v->name_hint - << " has already been defined, the Stmt is not SSA"; - CHECK(!use_count_.count(v)) << "variable " << v->name_hint - << " has been used before definition!"; + ICHECK(!def_count_.count(v)) << "variable " << v->name_hint + << " has already been defined, the Stmt is not SSA"; + ICHECK(!use_count_.count(v)) << "variable " << v->name_hint + << " has been used before definition!"; use_count_[v] = 0; def_count_[v] = 1; } void HandleUse(const PrimExpr& v) { - CHECK(v.as()); + ICHECK(v.as()); Var var = Downcast(v); auto it = use_count_.find(var.get()); if (it != use_count_.end()) { @@ -290,9 +290,9 @@ class HostDeviceSplitter : public StmtMutator { PrimFunc SplitHostDevice(PrimFunc&& func, IRModule* device_mod) { auto target = func->GetAttr(tvm::attr::kTarget); - CHECK(target.defined()) << "SplitHostDevice: Require the target attribute"; + ICHECK(target.defined()) << "SplitHostDevice: Require the target attribute"; auto global_symbol = func->GetAttr(tvm::attr::kGlobalSymbol); - CHECK(global_symbol.defined()) + ICHECK(global_symbol.defined()) << "SplitHostDevice: Expect PrimFunc to have the global_symbol attribute"; HostDeviceSplitter splitter(device_mod, target.value(), @@ -316,7 +316,7 @@ Pass SplitHostDevice() { for (auto& kv : *func_dict) { if (kv.second->IsInstance()) { PrimFunc func = Downcast(std::move(kv.second)); - CHECK(device_mod.defined()) << "The device module must be defined."; + ICHECK(device_mod.defined()) << "The device module must be defined."; kv.second = SplitHostDevice(std::move(func), &device_mod); } } diff --git a/src/tir/transforms/storage_access.cc b/src/tir/transforms/storage_access.cc index 6514a834b397..be20724ae207 100644 --- a/src/tir/transforms/storage_access.cc +++ b/src/tir/transforms/storage_access.cc @@ -37,7 +37,7 @@ void StorageAccessVisitor::VisitExpr_(const LoadNode* op) { const VarNode* buf = op->buffer_var.as(); StorageScope scope = GetScope(buf); if (Enabled(buf, scope)) { - CHECK(allow_append_) << op << " " << scope.to_string(); + ICHECK(allow_append_) << op << " " << scope.to_string(); AccessEntry e; e.threads = env_threads(); e.buffer = op->buffer_var; @@ -53,7 +53,7 @@ void StorageAccessVisitor::VisitExpr_(const LoadNode* op) { void StorageAccessVisitor::VisitStmt_(const StoreNode* op) { allow_append_ = true; - CHECK_EQ(curr_stmt_.access.size(), 0U); + ICHECK_EQ(curr_stmt_.access.size(), 0U); curr_stmt_.stmt = op; const VarNode* buf = op->buffer_var.as(); StorageScope scope = GetScope(buf); @@ -78,7 +78,7 @@ void StorageAccessVisitor::VisitStmt_(const StoreNode* op) { void StorageAccessVisitor::VisitStmt_(const EvaluateNode* op) { allow_append_ = true; - CHECK_EQ(curr_stmt_.access.size(), 0U); + ICHECK_EQ(curr_stmt_.access.size(), 0U); curr_stmt_.stmt = op; StmtExprVisitor::VisitStmt_(op); // push to the scope @@ -95,7 +95,7 @@ void StorageAccessVisitor::VisitStmt_(const AttrStmtNode* op) { storage_scope_[buf] = StorageScope::Create(op->value.as()->value); 
StmtExprVisitor::VisitStmt_(op); } else if (op->attr_key == attr::double_buffer_write) { - CHECK(double_buffer_write_ == nullptr); + ICHECK(double_buffer_write_ == nullptr); double_buffer_write_ = op->node.as(); scope_.push_back(std::vector()); StmtExprVisitor::VisitStmt_(op); @@ -151,7 +151,7 @@ void StorageAccessVisitor::VisitStmt_(const ForNode* op) { arith::IntSet::FromRange(Range::FromMinExtent(op->min, op->extent)); for (AccessEntry& e : s.access) { if (e.buffer.defined()) { - CHECK(e.touched.defined()); + ICHECK(e.touched.defined()); e.touched = arith::EvalSet(e.touched, relax_map); } } @@ -185,7 +185,7 @@ void StorageAccessVisitor::VisitExpr_(const CallNode* op) { const LoadNode* l = op->args[0].as(); StmtExprVisitor::VisitExpr_(l); } else if (op->op.same_as(builtin::tvm_access_ptr())) { - CHECK_EQ(op->args.size(), 5U); + ICHECK_EQ(op->args.size(), 5U); DataType dtype = op->args[0].dtype(); const VarNode* buffer = op->args[1].as(); PrimExpr offset = op->args[2]; @@ -194,7 +194,7 @@ void StorageAccessVisitor::VisitExpr_(const CallNode* op) { StorageScope scope = GetScope(buffer); // The buffer scope. if (Enabled(buffer, scope)) { - CHECK(allow_append_); + ICHECK(allow_append_); AccessEntry e; e.threads = env_threads(); e.dtype = dtype; @@ -212,7 +212,7 @@ void StorageAccessVisitor::VisitExpr_(const CallNode* op) { } StmtExprVisitor::VisitExpr_(op); } else if (op->op.same_as(builtin::tvm_storage_sync())) { - CHECK(allow_append_); + ICHECK(allow_append_); const std::string& s = op->args[0].as()->value; if (s != "warp") { StorageScope scope = StorageScope::Create(s); diff --git a/src/tir/transforms/storage_flatten.cc b/src/tir/transforms/storage_flatten.cc index c062cf73aeef..d392866b3694 100644 --- a/src/tir/transforms/storage_flatten.cc +++ b/src/tir/transforms/storage_flatten.cc @@ -69,7 +69,7 @@ class StorageFlattener : public StmtExprMutator { op = stmt.as(); auto it = var_remap_.find(op->buffer_var.get()); if (it != var_remap_.end() && !it->second.same_as(op->buffer_var)) { - CHECK(it->second.as()); + ICHECK(it->second.as()); Var buf_var = Downcast(it->second); return Store(buf_var, op->value, op->index, op->predicate); } else { @@ -86,7 +86,7 @@ class StorageFlattener : public StmtExprMutator { auto buffer = Downcast(op->node); Stmt body = this->VisitStmt(op->body); auto it = buf_map_.find(buffer); - CHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << buffer; + ICHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << buffer; body = AttrStmt(it->second.buffer->data, op->attr_key, op->value, std::move(body)); return body; } else if (op->attr_key == attr::thread_extent) { @@ -101,7 +101,7 @@ class StorageFlattener : public StmtExprMutator { } else if (op->attr_key == attr::buffer_dim_align) { auto buffer = Downcast(op->node); const CallNode* tuple = op->value.as(); - CHECK(tuple && tuple->op.same_as(builtin::tvm_tuple())); + ICHECK(tuple && tuple->op.same_as(builtin::tvm_tuple())); auto& vinfo = dim_align_[buffer]; int dim = tuple->args[0].as()->value; if (static_cast(dim) >= vinfo.size()) { @@ -122,10 +122,10 @@ class StorageFlattener : public StmtExprMutator { const auto& key = op->buffer; auto it = buf_map_.find(key); - CHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << key; + ICHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << key; const BufferEntry& e = it->second; - CHECK(!e.released) << "Read a buffer that is already out of scope"; + ICHECK(!e.released) << "Read a buffer that is already out of scope"; Stmt 
body = e.buffer.vstore(e.RelIndex(op->indices), op->value); if (create_bound_attributes_ && ShapeIsValid(e.buffer->shape)) { @@ -145,7 +145,7 @@ class StorageFlattener : public StmtExprMutator { const auto& key = op->buffer; if (buf_map_.count(key)) { - CHECK(buf_map_.at(key).external); + ICHECK(buf_map_.at(key).external); return this->VisitStmt(op->body); } else { // create a buffer entry @@ -157,7 +157,7 @@ class StorageFlattener : public StmtExprMutator { } // deduce current storage scope. auto it = storage_scope_.find(op->buffer.get()); - CHECK(it != storage_scope_.end()) << "Cannot find storage scope of " << op->buffer; + ICHECK(it != storage_scope_.end()) << "Cannot find storage scope of " << op->buffer; StorageScope skey; const std::string& strkey = it->second; if (strkey.length() == 0) { @@ -176,7 +176,7 @@ class StorageFlattener : public StmtExprMutator { MemoryInfo info = GetMemoryInfo(skey.to_string()); if (info.defined()) { align = (info->max_simd_bits + dtype.bits() - 1) / dtype.bits(); - CHECK_LE(const_size * dtype.bits(), info->max_num_bits) + ICHECK_LE(const_size * dtype.bits(), info->max_num_bits) << "Allocation exceed bound of memory tag " << skey.to_string(); } } @@ -243,7 +243,7 @@ class StorageFlattener : public StmtExprMutator { op = expr.as(); auto it = var_remap_.find(op->buffer_var.get()); if (it != var_remap_.end() && !it->second.same_as(op->buffer_var)) { - CHECK(it->second.as()); + ICHECK(it->second.as()); Var buf_var = Downcast(it->second); return Load(op->dtype, buf_var, op->index, op->predicate); } else { @@ -267,9 +267,9 @@ class StorageFlattener : public StmtExprMutator { const auto& key = op->buffer; auto it = buf_map_.find(key); - CHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << key; + ICHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << key; const BufferEntry& e = it->second; - CHECK(!e.released) << "Read a buffer that is already out of scope"; + ICHECK(!e.released) << "Read a buffer that is already out of scope"; if (create_bound_attributes_ && ShapeIsValid(e.buffer->shape)) { shape_collector_.push_back(std::make_pair(e.buffer->data, e.buffer->shape)); @@ -280,15 +280,15 @@ class StorageFlattener : public StmtExprMutator { Stmt VisitStmt_(const PrefetchNode* op) final { Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - CHECK(op != nullptr); + ICHECK(op != nullptr); const auto& key = op->buffer; auto it = buf_map_.find(key); - CHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << key; + ICHECK(it != buf_map_.end()) << "Cannot find allocated buffer for " << key; const BufferEntry& e = it->second; - CHECK(!e.released) << "Read a buffer that is already out of scope"; - CHECK_EQ(e.buffer->shape.size(), op->bounds.size()) + ICHECK(!e.released) << "Read a buffer that is already out of scope"; + ICHECK_EQ(e.buffer->shape.size(), op->bounds.size()) << "Prefetch dim should be the same as buffer dim"; int block_size = 1, elem_cnt = cache_line_size_ / e.buffer->dtype.bytes(); @@ -385,22 +385,22 @@ class StorageFlattener : public StmtExprMutator { // region with shape [1, 1, n, m] to buffer with shape [n, m] Stmt HandleBufferBindScope(const AttrStmtNode* op) { Array arr = Downcast>(op->node); - CHECK_EQ(arr.size(), 2U); + ICHECK_EQ(arr.size(), 2U); const BufferNode* buffer = arr[0].as(); const BufferNode* target = arr[1].as(); const CallNode* tuple = op->value.as(); - CHECK(buffer && target); - CHECK(tuple && tuple->op.same_as(builtin::tvm_tuple())); + ICHECK(buffer && target); + ICHECK(tuple && 
tuple->op.same_as(builtin::tvm_tuple())); auto key = GetRef(target); auto it = buf_map_.find(key); - CHECK(it != buf_map_.end()) << "Cannot find buffer of " << key; + ICHECK(it != buf_map_.end()) << "Cannot find buffer of " << key; const BufferEntry& be = it->second; - CHECK(!be.released); - CHECK_EQ(tuple->args.size(), be.buffer->shape.size() * 2); + ICHECK(!be.released); + ICHECK_EQ(tuple->args.size(), be.buffer->shape.size() * 2); Array begins, extents; if (be.bounds.size() != 0) { - CHECK_EQ(tuple->args.size(), be.bounds.size() * 2); + ICHECK_EQ(tuple->args.size(), be.bounds.size() * 2); for (size_t i = 0; i < be.buffer->shape.size(); ++i) { begins.push_back(tuple->args[2 * i] - be.bounds[i]->min); extents.push_back(tuple->args[2 * i + 1]); @@ -414,7 +414,7 @@ class StorageFlattener : public StmtExprMutator { } Buffer slice = be.buffer.MakeSlice(begins, extents); if (buffer->strides.size() == 0) { - CHECK_EQ(slice->strides.size(), 0U) + ICHECK_EQ(slice->strides.size(), 0U) << "Trying to bind compact buffer to strided one strides=" << slice->strides; } else { slice = slice.MakeStrideView(); @@ -452,7 +452,7 @@ class StorageFlattener : public StmtExprMutator { inline Array RelIndex(Array args) const { if (bounds.size() != 0) { Array index; - CHECK_EQ(bounds.size(), args.size()); + ICHECK_EQ(bounds.size(), args.size()); for (size_t i = 0; i < bounds.size(); ++i) { index.push_back(args[i] - bounds[i]->min); } diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index 3abff415d1f1..2817b1334019 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -86,8 +86,8 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { size_t level = scope_.size(); const VarNode* buf = op->buffer_var.get(); auto it = alloc_info_.find(buf); - CHECK(it != alloc_info_.end()); - CHECK(it->second.alloc == nullptr); + ICHECK(it != alloc_info_.end()); + ICHECK(it->second.alloc == nullptr); it->second.alloc = op; it->second.level = level; StmtExprVisitor::VisitStmt_(op); @@ -100,7 +100,7 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { const VarNode* buf = op->buffer_var.get(); auto it = alloc_info_.find(buf); if (it != alloc_info_.end() && it->second.alloc) { - CHECK_LT(it->second.level, scope_.size()); + ICHECK_LT(it->second.level, scope_.size()); scope_[it->second.level].touched.push_back(buf); } StmtEntry e = scope_.back(); @@ -127,7 +127,7 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { const VarNode* buf = op->buffer_var.get(); auto it = alloc_info_.find(buf); if (it != alloc_info_.end() && it->second.alloc) { - CHECK_LT(it->second.level, scope_.size()) << "Load memory in places other than store."; + ICHECK_LT(it->second.level, scope_.size()) << "Load memory in places other than store."; scope_[it->second.level].touched.push_back(buf); } } @@ -143,7 +143,7 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { // Directly reference to the variable count as a read. 
auto it = alloc_info_.find(buf); if (it != alloc_info_.end() && it->second.alloc) { - CHECK_LT(it->second.level, scope_.size()) << " buf=" << buf->name_hint; + ICHECK_LT(it->second.level, scope_.size()) << " buf=" << buf->name_hint; scope_[it->second.level].touched.push_back(buf); } } @@ -160,11 +160,11 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { e.touched = std::move(scope_.back().touched); scope_.pop_back(); int64_t end_index = static_cast(linear_seq_.size()); - CHECK_GT(end_index, begin_index); + ICHECK_GT(end_index, begin_index); e.scope_pair_offset = begin_index - end_index; linear_seq_.push_back(e); // record the pointer to end index. - CHECK_NE(end_index, 0U); + ICHECK_NE(end_index, 0U); linear_seq_[begin_index].scope_pair_offset = end_index - begin_index; } void VisitStmt_(const AttrStmtNode* op) final { @@ -349,7 +349,7 @@ class StoragePlanRewriter : public StmtExprMutator { if (attach_map_.count(nullptr)) { std::vector nest; for (StorageEntry* e : attach_map_.at(nullptr)) { - // CHECK_EQ(e->scope.rank, 0); + // ICHECK_EQ(e->scope.rank, 0); if (e->new_alloc.defined()) { nest.emplace_back(AttrStmt(e->alloc_var, attr::storage_scope, StringImm(e->scope.to_string()), Evaluate(0))); @@ -389,7 +389,7 @@ class StoragePlanRewriter : public StmtExprMutator { } PrimExpr VisitExpr_(const CallNode* op) final { if (op->op.same_as(builtin::tvm_access_ptr())) { - CHECK_EQ(op->args.size(), 5U); + ICHECK_EQ(op->args.size(), 5U); DataType dtype = op->args[0].dtype(); const VarNode* buffer = op->args[1].as(); auto it = alloc_map_.find(buffer); @@ -400,7 +400,7 @@ class StoragePlanRewriter : public StmtExprMutator { PrimExpr offset = this->VisitExpr(op->args[2]); PrimExpr extent = this->VisitExpr(op->args[3]); uint64_t elem_bits = dtype.bits() * dtype.lanes(); - CHECK_EQ(se->bits_offset % elem_bits, 0U); + ICHECK_EQ(se->bits_offset % elem_bits, 0U); if (se->bits_offset != 0) { offset = make_const(offset.dtype(), se->bits_offset / elem_bits) + offset; } @@ -435,7 +435,7 @@ class StoragePlanRewriter : public StmtExprMutator { } } Stmt VisitStmt_(const ForNode* op) final { - CHECK(op->for_type != ForType::Vectorized) << "VectorizeLoop before LiftStorageAlloc"; + ICHECK(op->for_type != ForType::Vectorized) << "VectorizeLoop before LiftStorageAlloc"; // remake all the allocation at the attach scope. 
if (attach_map_.count(op)) { auto& svec = attach_map_[op]; @@ -508,7 +508,7 @@ class StoragePlanRewriter : public StmtExprMutator { PrimExpr RemapIndex(DataType dtype, PrimExpr index, StorageEntry* e) { if (e->bits_offset == 0) return index; uint64_t elem_bits = dtype.bits() * dtype.lanes(); - CHECK_EQ(e->bits_offset % elem_bits, 0U); + ICHECK_EQ(e->bits_offset % elem_bits, 0U); return make_const(index.dtype(), e->bits_offset / elem_bits) + index; } // Prepare the new allocations @@ -525,7 +525,7 @@ class StoragePlanRewriter : public StmtExprMutator { for (size_t i = 0; i < vec.size(); ++i) { StorageEntry* e = vec[i]; if (e->scope.tag.length() != 0) { - CHECK_NE(e->const_nbits, 0U) << "Special tagged memory must be const size"; + ICHECK_NE(e->const_nbits, 0U) << "Special tagged memory must be const size"; for (size_t j = 0; j < i; ++j) { if (e->scope == vec[j]->scope) { vec[j]->merged_children.push_back(e); @@ -562,7 +562,7 @@ class StoragePlanRewriter : public StmtExprMutator { if (e->scope.tag.length() != 0) { MemoryInfo info = GetMemoryInfo(e->scope.to_string()); uint64_t total_elem = e->const_nbits / e->elem_type.bits(); - CHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) + ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) << "Allocation exceed bound of memory tag " << e->scope.to_string(); } } else { @@ -602,7 +602,7 @@ class StoragePlanRewriter : public StmtExprMutator { if (e->scope.tag.length() != 0) { MemoryInfo info = GetMemoryInfo(e->scope.to_string()); uint64_t total_elem = e->const_nbits / e->elem_type.bits(); - CHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) + ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) << "Allocation exceed bound of memory tag " << e->scope.to_string(); } } @@ -611,9 +611,9 @@ } // New allocation for merged data void NewAllocTagMerged(StorageEntry* e) { - CHECK_NE(e->scope.tag.length(), 0U); + ICHECK_NE(e->scope.tag.length(), 0U); // allocate with element type. - CHECK_NE(e->const_nbits, 0U); + ICHECK_NE(e->const_nbits, 0U); MemoryInfo info = GetMemoryInfo(e->scope.to_string()); uint64_t total_bits = e->const_nbits; // By default, align to 32 bits. @@ -628,8 +628,8 @@ } e->alloc_var = e->allocs[0]->buffer_var; for (StorageEntry* child : e->merged_children) { - CHECK_NE(child->const_nbits, 0U); - CHECK_NE(total_bits, 0U); + ICHECK_NE(child->const_nbits, 0U); + ICHECK_NE(total_bits, 0U); child->bits_offset = total_bits; child->alloc_var = e->alloc_var; total_bits += child->const_nbits; @@ -642,7 +642,7 @@ make_const(e->allocs[0]->extents[0].dtype(), (total_bits + type_bits - 1) / type_bits); e->new_alloc = Allocate(e->alloc_var, e->elem_type, {alloc_size}, const_true(), Evaluate(0)); if (info.defined()) { - CHECK_LE(total_bits, info->max_num_bits) + ICHECK_LE(total_bits, info->max_num_bits) << "Allocation exceed bound of memory tag " << e->scope.to_string(); } } @@ -675,7 +675,7 @@ } void PlanNewScope(const Object* op) { if (thread_scope_ != nullptr) { - CHECK(thread_scope_ == op); + ICHECK(thread_scope_ == op); // erase all memory attached to this scope.
for (auto it = const_free_map_.begin(); it != const_free_map_.end();) { if (it->second->attach_scope_ == op) { @@ -716,7 +716,7 @@ class StoragePlanRewriter : public StmtExprMutator { bool detect_inplace = detect_inplace_ && (it->second.gen.size() <= 2); for (const VarNode* var : it->second.gen) { - CHECK(alloc_info.count(var)); + ICHECK(alloc_info.count(var)); const AllocEntry& ae = alloc_info.at(var); StorageEntry* dst_entry = nullptr; // inplace detection @@ -758,7 +758,7 @@ class StoragePlanRewriter : public StmtExprMutator { attr::IsPragmaKey(op->attr_key)) { PlanNewScope(op); } else { - CHECK(op->attr_key == attr::extern_scope); + ICHECK(op->attr_key == attr::extern_scope); } } else if (s.stmt->IsInstance()) { const auto* op = static_cast(s.stmt); @@ -785,7 +785,7 @@ class StoragePlanRewriter : public StmtExprMutator { // Allocate new storage entry. StorageEntry* NewAlloc(const AllocateNode* op, const Object* attach_scope, const StorageScope& scope, size_t const_nbits) { - CHECK(op != nullptr); + ICHECK(op != nullptr); // Re-use not successful, allocate a new buffer. std::unique_ptr entry(new StorageEntry()); entry->attach_scope_ = attach_scope; @@ -799,7 +799,7 @@ class StoragePlanRewriter : public StmtExprMutator { StorageEntry* FindAlloc(const AllocateNode* op, const Object* attach_scope, const StorageScope& scope) { - CHECK(op != nullptr); + ICHECK(op != nullptr); // skip plan for local variable, // compiler can do a better job with register allocation. const uint64_t match_range = 16; @@ -858,9 +858,9 @@ class StoragePlanRewriter : public StmtExprMutator { // simulated free. void Free(const VarNode* var) { auto it = alloc_map_.find(var); - CHECK(it != alloc_map_.end()); + ICHECK(it != alloc_map_.end()); StorageEntry* e = it->second; - CHECK_NE(e->allocs.size(), 0U); + ICHECK_NE(e->allocs.size(), 0U); // disable reuse of small arrays, they will be lowered to registers in LLVM // This rules only apply if we are using non special memory @@ -989,7 +989,7 @@ PrimFunc PointerValueTypeRewrite(PrimFunc f) { } } - CHECK_EQ(args.size(), n->params.size()); + ICHECK_EQ(args.size(), n->params.size()); n->params = args; n->body = Substitute(n->body, remap_vars); return f; diff --git a/src/tir/transforms/tensorcore_infer_fragment.cc b/src/tir/transforms/tensorcore_infer_fragment.cc index 81c8645f3dda..d0f58074ada0 100644 --- a/src/tir/transforms/tensorcore_infer_fragment.cc +++ b/src/tir/transforms/tensorcore_infer_fragment.cc @@ -56,28 +56,28 @@ class FragmentGetter : public StmtExprVisitor { if (op->op.same_as(builtin::tvm_load_matrix_sync()) || op->op.same_as(builtin::tvm_store_matrix_sync())) { // Get shape and layout information from load and store intrinsic - CHECK_EQ(op->args.size(), 8U); + ICHECK_EQ(op->args.size(), 8U); const VarNode* buffer_var = op->args[0].as(); - CHECK(buffer_var); + ICHECK(buffer_var); // Get shape const IntImmNode* m = op->args[1].as(); const IntImmNode* n = op->args[2].as(); const IntImmNode* k = op->args[3].as(); const StringImmNode* layout = op->args[7].as(); - CHECK(m); - CHECK(n); - CHECK(k); - CHECK(layout); + ICHECK(m); + ICHECK(n); + ICHECK(k); + ICHECK(layout); std::string scope = scopes[buffer_var]; if (fragments.count(buffer_var)) { // check if the fragment has met before FragmentInfo info = fragments[buffer_var]; - CHECK_EQ(m->value, info.m); - CHECK_EQ(n->value, info.n); - CHECK_EQ(k->value, info.k); + ICHECK_EQ(m->value, info.m); + ICHECK_EQ(n->value, info.n); + ICHECK_EQ(k->value, info.k); if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") 
{ - CHECK_EQ(layout->value, info.layout); + ICHECK_EQ(layout->value, info.layout); } } else { // store metadata @@ -91,25 +91,25 @@ class FragmentGetter : public StmtExprVisitor { } } else if (op->op.same_as(builtin::tvm_fill_fragment())) { // Get shape information from fill intrinsic - CHECK_EQ(op->args.size(), 6U); + ICHECK_EQ(op->args.size(), 6U); const VarNode* buffer_var = op->args[0].as(); - CHECK(buffer_var); + ICHECK(buffer_var); // Get shape const IntImmNode* m = op->args[1].as(); const IntImmNode* n = op->args[2].as(); const IntImmNode* k = op->args[3].as(); - CHECK(m); - CHECK(n); - CHECK(k); + ICHECK(m); + ICHECK(n); + ICHECK(k); std::string scope = scopes[buffer_var]; // Only wmma.accumulator can use tvm_fill_fragment - CHECK_EQ(scope, "wmma.accumulator"); + ICHECK_EQ(scope, "wmma.accumulator"); if (fragments.count(buffer_var)) { FragmentInfo info = fragments[buffer_var]; - CHECK_EQ(m->value, info.m); - CHECK_EQ(n->value, info.n); - CHECK_EQ(k->value, info.k); + ICHECK_EQ(m->value, info.m); + ICHECK_EQ(n->value, info.n); + ICHECK_EQ(k->value, info.k); } else { FragmentInfo info(m->value, n->value, k->value, ""); fragments[buffer_var] = info; @@ -121,7 +121,7 @@ class FragmentGetter : public StmtExprVisitor { void VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::storage_scope) { const VarNode* buffer = op->node.as(); - CHECK(buffer); + ICHECK(buffer); scopes[buffer] = op->value.as()->value; } StmtExprVisitor::VisitStmt_(op); @@ -142,28 +142,28 @@ class FragmentChecker : public StmtExprVisitor { StmtExprVisitor::VisitExpr_(op); // Check shape when calling tvm_mma_sync if (op->op.same_as(builtin::tvm_mma_sync()) || op->op.same_as(builtin::tvm_bmma_sync())) { - CHECK_EQ(op->args.size(), 8U); + ICHECK_EQ(op->args.size(), 8U); const VarNode* buffer_var_d = op->args[0].as(); const VarNode* buffer_var_a = op->args[2].as(); const VarNode* buffer_var_b = op->args[4].as(); const VarNode* buffer_var_c = op->args[6].as(); - CHECK(buffer_var_d); - CHECK(buffer_var_a); - CHECK(buffer_var_b); - CHECK(buffer_var_c); + ICHECK(buffer_var_d); + ICHECK(buffer_var_a); + ICHECK(buffer_var_b); + ICHECK(buffer_var_c); // Check all fragment A, B, C and D have the same shape - CHECK(CheckShape(buffer_var_d, buffer_var_a)); - CHECK(CheckShape(buffer_var_d, buffer_var_b)); - CHECK(CheckShape(buffer_var_d, buffer_var_c)); + ICHECK(CheckShape(buffer_var_d, buffer_var_a)); + ICHECK(CheckShape(buffer_var_d, buffer_var_b)); + ICHECK(CheckShape(buffer_var_d, buffer_var_c)); } } private: // A tool for checking shapes of two fragments bool CheckShape(const VarNode* buffer1, const VarNode* buffer2) { - CHECK(fragment_getter.fragments.count(buffer1)); - CHECK(fragment_getter.fragments.count(buffer2)); + ICHECK(fragment_getter.fragments.count(buffer1)); + ICHECK(fragment_getter.fragments.count(buffer2)); FragmentGetter::FragmentInfo info1 = fragment_getter.fragments.at(buffer1); FragmentGetter::FragmentInfo info2 = fragment_getter.fragments.at(buffer2); return info1.m == info2.m && info1.n == info2.n && info1.k == info2.k; diff --git a/src/tir/transforms/thread_storage_sync.cc b/src/tir/transforms/thread_storage_sync.cc index 05ee8146cbd8..8f757171afbd 100644 --- a/src/tir/transforms/thread_storage_sync.cc +++ b/src/tir/transforms/thread_storage_sync.cc @@ -97,7 +97,7 @@ class ThreadSyncPlanner : public StorageAccessVisitor { } } if (sync_before_stmt) { - CHECK_EQ(condition_counter(), 0) << "Cannot insert syncs inside condition"; + ICHECK_EQ(condition_counter(), 0) << "Cannot insert syncs inside 
condition"; syncs_inserted_.insert(s.stmt); } } @@ -124,7 +124,7 @@ class ThreadSyncPlanner : public StorageAccessVisitor { } } if (sync_before_stmt) { - CHECK_EQ(condition_counter(), 0) << "Cannot insert syncs inside condition"; + ICHECK_EQ(condition_counter(), 0) << "Cannot insert syncs inside condition"; syncs_inserted_.insert(s.stmt); break; } @@ -263,7 +263,7 @@ class ThreadSyncInserter : public StmtExprMutator { if (op->op.same_as(builtin::tvm_access_ptr())) { PrimExpr expr = StmtExprMutator::VisitExpr_(op); op = expr.as(); - CHECK_EQ(op->args.size(), 5U); + ICHECK_EQ(op->args.size(), 5U); const VarNode* buffer_var = op->args[1].as(); Var var(GetRef(buffer_var)); const IntImmNode* flag = op->args[4].as(); @@ -297,7 +297,7 @@ class ThreadSyncInserter : public StmtExprMutator { } // private functions. Stmt InitGlobalBarrier(const AttrStmtNode* op) { - CHECK(op != nullptr); + ICHECK(op != nullptr); Array pargs = {StringImm(runtime::symbol::tvm_prepare_global_barrier)}; Stmt prep = Evaluate(Call(DataType::Int(32), builtin::tvm_call_packed(), pargs)); Stmt body = op->body; @@ -314,9 +314,9 @@ class ThreadSyncInserter : public StmtExprMutator { return SeqStmt({prep, body}); } Stmt MakeGlobalBarrier() { - CHECK(sync_scope_.rank == StorageRank::kGlobal); + ICHECK(sync_scope_.rank == StorageRank::kGlobal); if (!num_blocks_.defined()) { - CHECK(!is_lead_.defined()); + ICHECK(!is_lead_.defined()); num_work_dim_ = thread_extents_.size(); for (const AttrStmtNode* attr : thread_extents_) { IterVar iv = Downcast(attr->node); @@ -329,7 +329,7 @@ class ThreadSyncInserter : public StmtExprMutator { } } } else { - CHECK_EQ(num_work_dim_, thread_extents_.size()); + ICHECK_EQ(num_work_dim_, thread_extents_.size()); } return Evaluate(Call(DataType::Int(32), builtin::tvm_storage_sync(), {StringImm(sync_scope_.to_string()), is_lead_, num_blocks_})); diff --git a/src/tir/transforms/unroll_loop.cc b/src/tir/transforms/unroll_loop.cc index 122654149f24..71ad899273a6 100644 --- a/src/tir/transforms/unroll_loop.cc +++ b/src/tir/transforms/unroll_loop.cc @@ -107,7 +107,7 @@ class LoopUnroller : public StmtExprMutator { auto_unroll && (value * step_count_ <= auto_max_step_ || value <= auto_max_extent_); if (op->for_type == ForType::Unrolled) { - CHECK_GE(value, 0) << "Cannot unroll non-constant loop"; + ICHECK_GE(value, 0) << "Cannot unroll non-constant loop"; auto_unroll = true; } @@ -163,7 +163,7 @@ class LoopUnroller : public StmtExprMutator { Stmt Unroll(const ForNode* op) { int value = GetExtent(op); // For loop must have a constant integer extent - CHECK_NE(value, -1) << "loop doesn't have a constant integer extent"; + ICHECK_NE(value, -1) << "loop doesn't have a constant integer extent"; if (value == 0) return Evaluate(0); Stmt body = op->body; Map vmap; diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index bf54ada6e837..239f42266b83 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -45,8 +45,8 @@ inline PrimExpr BroadcastTo(PrimExpr e, int lanes) { return Broadcast(op->value, lanes); } } - CHECK_EQ(e.dtype().lanes(), 1) << "Cannot broadcast lane=" << e.dtype().lanes() << " to " - << lanes; + ICHECK_EQ(e.dtype().lanes(), 1) << "Cannot broadcast lane=" << e.dtype().lanes() << " to " + << lanes; return Broadcast(e, lanes); } @@ -105,7 +105,7 @@ class Vectorizer : public StmtMutator, public ExprFunctorvar); if (it != let_binding_.end()) { - CHECK(deep_equal_(it->second, value)) + ICHECK(deep_equal_(it->second, value)) << 
"Let cannot bind the same var to two different values"; } if (value.dtype().lanes() != op->value.dtype().lanes()) { @@ -355,8 +355,8 @@ class Vectorizer : public StmtMutator, public ExprFunctorfor_type == ForType::Vectorized) { LOG(WARNING) << "Detect vectorize inside vectorized loop, ignoring..."; } - CHECK(is_zero(op->min)); - CHECK(!op->extent.dtype().is_vector()); + ICHECK(is_zero(op->min)); + ICHECK(!op->extent.dtype().is_vector()); PrimExpr extent = this->VisitExpr(op->extent); if (extent.dtype().is_vector()) { return Scalarize(GetRef(op)); @@ -370,7 +370,7 @@ class Vectorizer : public StmtMutator, public ExprFunctorcondition.dtype().is_vector()); + ICHECK(!op->condition.dtype().is_vector()); PrimExpr condition = this->VisitExpr(op->condition); if (condition.dtype().is_vector()) { return Scalarize(GetRef(op)); @@ -390,7 +390,7 @@ class Vectorizer : public StmtMutator, public ExprFunctorVisitExpr(op->value); - CHECK(!let_binding_.count(op->var)) << "SSA violation, a single var is binded twice"; + ICHECK(!let_binding_.count(op->var)) << "SSA violation, a single var is binded twice"; let_binding_[op->var] = value; if (value.dtype().lanes() != op->value.dtype().lanes()) { @@ -526,7 +526,7 @@ class LoopVectorizer : public StmtMutator { public: Stmt VisitStmt_(const ForNode* op) final { if (op->for_type == ForType::Vectorized) { - CHECK(is_zero(op->min)); + ICHECK(is_zero(op->min)); auto* extent_as_int = op->extent.as(); if (!extent_as_int || extent_as_int->value < 1) { LOG(FATAL) << "Failed to vectorize loop with extent " << op->extent; diff --git a/src/topi/transform.cc b/src/topi/transform.cc index 19243803cdc9..2d7657eedcdd 100644 --- a/src/topi/transform.cc +++ b/src/topi/transform.cc @@ -150,7 +150,7 @@ TVM_REGISTER_GLOBAL("topi.matmul").set_body([](TVMArgs args, TVMRetValue* rv) { *rv = matmul(args[0], args[1], args[2], args[3]); break; default: - CHECK(0) << "topi.matmul expects 2, 3 or 4 arguments"; + ICHECK(0) << "topi.matmul expects 2, 3 or 4 arguments"; } }); From 6d27055d94e14cc51be7c4ed70435d226ee41a0e Mon Sep 17 00:00:00 2001 From: masahi Date: Sun, 25 Oct 2020 17:12:27 +0900 Subject: [PATCH 050/258] [MKL] Fix offloading of batch_matmul to MKL (#6752) * fix mkl offloading of batch matmul * name fix and add doc * add doc for lib arg Co-authored-by: masa --- python/tvm/relay/op/strategy/x86.py | 7 +++++++ python/tvm/topi/x86/batch_matmul.py | 30 ++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index e2a82d396b22..3c5735b17aa5 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -377,6 +377,13 @@ def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): name="batch_matmul_cblas.x86", plevel=15, ) + if "mkl" in target.libs: + strategy.add_implementation( + wrap_compute_batch_matmul(topi.x86.batch_matmul_mkl), + wrap_topi_schedule(topi.x86.schedule_batch_matmul_mkl), + name="batch_matmul_mkl.x86", + plevel=15, + ) return strategy diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index 4e5f6efc815a..100bdf205165 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ b/python/tvm/topi/x86/batch_matmul.py @@ -19,7 +19,7 @@ from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity -from tvm.contrib import cblas +from tvm.contrib import cblas, mkl from .. 
import generic from ..util import traverse_inline, get_const_tuple, get_max_power2_factor @@ -137,10 +137,9 @@ def _default_batch_matmul_config(cfg, M, N, K): cfg["tile_y"] = SplitEntity([M // y_bn, y_bn]) -@autotvm.register_topi_compute("batch_matmul_cblas.x86") -def batch_matmul_cblas(cfg, x, y, out_shape=None): +def batch_matmul_blas_common(cfg, x, y, out_shape, lib): """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are - data in batch. + data in batch, using one of the BLAS libraries. Parameters ---------- @@ -152,6 +151,8 @@ def batch_matmul_cblas(cfg, x, y, out_shape=None): 3-D with shape [batch, N, K] out_shape : tuple or None Shape of the output + lib : A contrib module which implements the batch_matmul function; + cblas and mkl are supported Returns ------- @@ -168,9 +169,28 @@ def batch_matmul_cblas(cfg, x, y, out_shape=None): assert out_shape[1] == M, "got invalid output shape" assert out_shape[2] == N, "got invalid output shape" cfg.add_flop(XB * M * N * XK * 2) - return cblas.batch_matmul(x, y, False, True) + return lib.batch_matmul(x, y, False, True) + + +@autotvm.register_topi_compute("batch_matmul_cblas.x86") +def batch_matmul_cblas(cfg, x, y, out_shape=None): + """Compute batch_matmul using cblas""" + return batch_matmul_blas_common(cfg, x, y, out_shape, cblas) @autotvm.register_topi_schedule("batch_matmul_cblas.x86") def schedule_batch_matmul_cblas(_, outs): + """Create schedule for batch_matmul_cblas""" + return generic.schedule_extern(outs) + + +@autotvm.register_topi_compute("batch_matmul_mkl.x86") +def batch_matmul_mkl(cfg, x, y, out_shape=None): + """Compute batch_matmul using mkl""" + return batch_matmul_blas_common(cfg, x, y, out_shape, mkl) + + +@autotvm.register_topi_schedule("batch_matmul_mkl.x86") +def schedule_batch_matmul_mkl(_, outs): + """Create schedule for batch_matmul_mkl""" return generic.schedule_extern(outs) From 7966d68137bc1769d06d243aaa7f6ecae461c337 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 25 Oct 2020 08:44:44 -0400 Subject: [PATCH 051/258] [WASM] Update support for latest emcc, add ffi test. 
(#6751) --- python/tvm/contrib/emcc.py | 1 + web/Makefile | 4 ++-- web/emcc/wasm_runtime.cc | 8 ++++++++ web/tests/node/test_packed_func.js | 13 +++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/python/tvm/contrib/emcc.py b/python/tvm/contrib/emcc.py index 0cecc66bd1a2..89431dc2a4f6 100644 --- a/python/tvm/contrib/emcc.py +++ b/python/tvm/contrib/emcc.py @@ -42,6 +42,7 @@ def create_tvmjs_wasm(output, objects, options=None, cc="emcc"): cmd += ["-O3"] cmd += ["-std=c++14"] + cmd += ["--no-entry"] cmd += ["-s", "ERROR_ON_UNDEFINED_SYMBOLS=0"] cmd += ["-s", "STANDALONE_WASM=1"] cmd += ["-s", "ALLOW_MEMORY_GROWTH=1"] diff --git a/web/Makefile b/web/Makefile index eaf5a954accb..8c4dbc20dadc 100644 --- a/web/Makefile +++ b/web/Makefile @@ -26,8 +26,8 @@ all: dist/wasm/tvmjs_runtime.wasm dist/wasm/tvmjs_runtime.wasi.js EMCC = emcc -EMCC_CFLAGS = $(INCLUDE_FLAGS) -O3 -std=c++14 -Wno-ignored-attributes \ - -s ALLOW_MEMORY_GROWTH=1 -s STANDALONE_WASM=1 -s ERROR_ON_UNDEFINED_SYMBOLS=0 +EMCC_CFLAGS = $(INCLUDE_FLAGS) -O3 -std=c++14 -Wno-ignored-attributes --no-entry \ + -s ALLOW_MEMORY_GROWTH=1 -s STANDALONE_WASM=1 -s ERROR_ON_UNDEFINED_SYMBOLS=0 EMCC_LDFLAGS = --pre-js emcc/preload.js diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index a5f8c8252571..214c1883f874 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -75,5 +75,13 @@ TVM_REGISTER_GLOBAL("testing.wrap_callback").set_body([](TVMArgs args, TVMRetVal PackedFunc pf = args[0]; *ret = runtime::TypedPackedFunc([pf]() { pf(); }); }); + +// internal function used for debug and testing purposes +TVM_REGISTER_GLOBAL("testing.object_use_count").set_body([](TVMArgs args, TVMRetValue* ret) { + runtime::ObjectRef obj = args[0]; + // subtract the current one because we always copy + // and get another value. 
+ *ret = (obj.use_count() - 1); +}); } // namespace runtime } // namespace tvm diff --git a/web/tests/node/test_packed_func.js b/web/tests/node/test_packed_func.js index e18c0aecfdc0..87b48df3d67a 100644 --- a/web/tests/node/test_packed_func.js +++ b/web/tests/node/test_packed_func.js @@ -109,3 +109,16 @@ test("RegisterGlobal", () => { let syslib = tvm.systemLib(); syslib.dispose(); }); + +test("NDArrayCbArg", () => { + let use_count = tvm.getGlobalFunc("testing.object_use_count"); + + let fcheck = tvm.toPackedFunc(function (x) { + assert(use_count(x) == 2); + x.dispose(); + }); + let x = tvm.empty([2], "float32").copyFrom([1, 2]); + assert(use_count(x) == 1); + fcheck(x); + assert(use_count(x) == 1); +}); From 22b4be3d010ac21b6c612d38272a31f97bebc846 Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Sun, 25 Oct 2020 14:52:05 -0600 Subject: [PATCH 052/258] add onnx resize v10 and unit test (#6726) --- python/tvm/relay/frontend/onnx.py | 23 ++++++++++++++--- tests/python/frontend/onnx/test_forward.py | 30 ++++++++++++++++++++++ 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index e2c6b9abc449..2d21156f8f7b 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -1870,6 +1870,25 @@ def _impl_v7(cls, inputs, attr, params): class Resize(OnnxOpConverter): """Operator converter for Resize""" + @classmethod + def _impl_v10(cls, inputs, attr, params): + mode = attr.get("mode") + if mode == b"nearest": + method = "nearest_neighbor" + elif mode == b"linear": + method = "bilinear" + else: + raise tvm.error.OpAttributeInvalid( + 'Value {} in attribute "mode" of operator Resize is not valid.'.format(mode) + ) + + scale = inputs[1] + size = _op.cast(_op.shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale + + layout = "NCHW" # ONNX assumes NCHW layout + out_size = _op.strided_slice(size, [2], [4]) + return _op.image.resize(inputs[0], out_size, layout, method, "asymmetric") + @classmethod def _impl_v11(cls, inputs, attr, params): mode = attr.get("mode") @@ -1891,9 +1910,7 @@ def _impl_v11(cls, inputs, attr, params): size = inputs[3] else: assert len(scale_shape) != 0, "One of scale or size should be passed." 
-            size = (
-                _op.cast(_op.shape_of(inputs[0]), infer_type(scale).type_annotation.dtype) * scale
-            )
+            size = _op.cast(_op.shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale
 
         coord_trans = attr.get("coordinate_transformation_mode")
         if coord_trans in [b"pytorch_half_pixel", b"half_pixel"]:
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 81b5186d0e26..bf27ba5ddcd9 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -3525,6 +3525,36 @@ def verify(ishape, oshape, scales, mode, coord_trans):
     verify([1, 16, 32, 32], [], [1, 1, 2, 2], "nearest", "asymmetric")
     verify([1, 16, 32, 32], [], [1, 1, 0.5, 0.5], "linear", "half_pixel")
 
+    def verify_opset_10(ishape, scales, mode):
+        nodes = [
+            make_constant_node("scales", onnx.TensorProto.FLOAT, (len(scales),), scales),
+        ]
+        input_names = ["X", "scales"]
+        nodes.append(
+            helper.make_node(
+                "Resize",
+                inputs=input_names,
+                outputs=["Y"],
+                mode=mode,
+            )
+        )
+
+        oshape = [round(dim * scale) for (dim, scale) in zip(ishape, scales)]
+        graph = helper.make_graph(
+            nodes,
+            "resize_test",
+            inputs=[helper.make_tensor_value_info("X", TensorProto.FLOAT, ishape)],
+            outputs=[helper.make_tensor_value_info("Y", TensorProto.FLOAT, oshape)],
+        )
+
+        model = helper.make_model(graph, producer_name="resize_test")
+        model.opset_import[0].version = 10
+
+        verify_with_ort(model, [ishape], oshape, use_vm=True, freeze_params=True)
+
+    verify_opset_10([1, 16, 32, 32], [1, 1, 2, 2], "nearest")
+    verify_opset_10([1, 16, 32, 32], [1, 1, 0.5, 0.5], "linear")
+
 
 @tvm.testing.uses_gpu
 def test_nonzero():

From 126293083787705a75d534ff3a8b0e9bd99fc8a9 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Sun, 25 Oct 2020 17:14:45 -0400
Subject: [PATCH 053/258] [CI] Update wasm emcc to latest (#6755)

---
 docker/install/ubuntu_install_emscripten.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/install/ubuntu_install_emscripten.sh b/docker/install/ubuntu_install_emscripten.sh
index 2e48cccbe2a6..fa44e1c70f1d 100755
--- a/docker/install/ubuntu_install_emscripten.sh
+++ b/docker/install/ubuntu_install_emscripten.sh
@@ -23,5 +23,5 @@ set -o pipefail
 cd /
 git clone https://github.com/emscripten-core/emsdk.git
 cd emsdk
-./emsdk install latest
-./emsdk activate latest
+./emsdk install 2.0.7
+./emsdk activate 2.0.7

From 4d668b6bea03257b5061884c96dc5b5615598afc Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Sun, 25 Oct 2020 19:27:29 -0400
Subject: [PATCH 054/258] [CI] Introduce all platform test for
 windows/mac/linux. (#6756)

This PR introduces a minimal set of test cases that are supposed to run
on all platforms during CI. The set of test cases is meant to help catch
platform-dependent regressions.

See tests/python/all-platform-minimal-test/README.md for guidelines.

- Enable Windows and macOS LLVM builds via conda with cython support.
- Run the all-platform test cases on Windows and macOS.
- Update implementation to improve MSVC support.
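[Editor's note: the FFI-selection pattern that this patch standardizes across
registry.py, ndarray.py, object.py, and packed_func.py can be summarized in a
short sketch. This is illustrative only and not part of the patch; the
`TVM_FFI` variable and module paths come from the diffs below, the rest is a
minimal, hedged reconstruction.]

```python
# Minimal sketch of the TVM_FFI selection pattern (mirrors packed_func.py below).
import os

_FFI_MODE = os.environ.get("TVM_FFI", "auto")

try:
    if _FFI_MODE == "ctypes":
        raise ImportError()  # user explicitly opted out of cython
    from tvm._ffi._cy3.core import PackedFuncBase  # fast cython path
except (RuntimeError, ImportError) as error:
    if _FFI_MODE == "cython":
        # with TVM_FFI=cython the fallback would hide a real build problem,
        # so re-raise instead of silently using ctypes
        raise error
    from tvm._ffi._ctypes.packed_func import PackedFuncBase  # pure-python fallback
```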
--- .github/workflows/main.yml | 84 ++++----- conda/build-environment.yaml | 35 ++++ conda/recipe/meta.yaml | 6 + include/tvm/runtime/packed_func.h | 42 +++-- python/setup.py | 36 ++-- python/tvm/_ffi/registry.py | 4 +- python/tvm/runtime/ndarray.py | 4 +- python/tvm/runtime/object.py | 4 +- python/tvm/runtime/packed_func.py | 4 +- .../all-platform-minimal-test/README.md | 29 +++ .../test_minimal_target_codegen_llvm.py | 107 +++++++++++ .../test_runtime_ndarray.py | 2 + .../test_runtime_packed_func.py | 166 ++++++++++++++++++ ...e_packed_func.py => test_runtime_trace.py} | 143 --------------- .../unittest/test_target_codegen_llvm.py | 78 -------- tests/scripts/task_python_unittest.sh | 4 +- 16 files changed, 453 insertions(+), 295 deletions(-) create mode 100644 conda/build-environment.yaml create mode 100644 tests/python/all-platform-minimal-test/README.md create mode 100644 tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py rename tests/python/{unittest => all-platform-minimal-test}/test_runtime_ndarray.py (98%) create mode 100644 tests/python/all-platform-minimal-test/test_runtime_packed_func.py rename tests/python/unittest/{test_runtime_packed_func.py => test_runtime_trace.py} (72%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a5250c1ffc74..be50a81e527d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,53 +34,55 @@ jobs: Build: strategy: matrix: - os: [windows-latest, macOS-latest] + os: [windows-2016, macOS-latest] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - name: Lint Python - if: matrix.os == 'macOS-latest' - run: | - pip install flake8 - flake8 . --count --select=E9,F63,F7 --show-source --statistics - name: Initialize submodules run: git submodule update --recursive --init - - - name: Make Build Directory - run: cmake -E make_directory build.common - - # configuration for Windows - - name: CMake@Win - if: matrix.os == 'windows-latest' - working-directory: build.common + - name: Lint Python + if: startsWith(matrix.os, 'macOS') + run: | + python3 -m pip install flake8 + python3 -m flake8 . --count --select=E9,F63,F7 --show-source --statistics + - uses: actions/cache@v1 + env: + CACHE_NUMBER: 0 + with: + path: ~/conda_pkgs_dir + key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('conda/build-environment.yaml') }} + - uses: conda-incubator/setup-miniconda@v1 + with: + activate-environment: tvm-build + channel-priority: strict + environment-file: conda/build-environment.yaml + auto-activate-base: false + use-only-tar-bz2: true + - name: Conda info + run: | + conda info + conda list + - name: Conda-Build@Win + if: startsWith(matrix.os, 'windows') + shell: cmd /C call {0} run: >- - cmake - -DUSE_SORT=ON - -DUSE_RPC=ON - -DUSE_GRAPH_RUNTIME=ON - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_CONFIGURATION_TYPES="Release" - .. - - # configuration for Mac - - name: CMake@MacOS - if: matrix.os == 'macOS-latest' - working-directory: build.common + conda build --output-folder=conda/pkg conda/recipe && + conda install tvm -c ./conda/pkg + - name: Conda-Build@MacOS + if: startsWith(matrix.os, 'macOS') + shell: bash -l {0} run: >- - cmake - "-DUSE_SORT=ON" - "-DUSE_RPC=ON" - "-DUSE_GRAPH_RUNTIME=ON" - "-DUSE_METAL=ON" - .. 
- - - name: Build@Win - if: matrix.os == 'windows-latest' - run: cmake --build build.common --config Release -- /m - - - name: Build@MacOS - if: matrix.os == 'macOS-latest' - run: cmake --build build.common --config Release -j3 + conda build --output-folder=conda/pkg conda/recipe && + conda install tvm -c ./conda/pkg + - name: Test@Win + if: startsWith(matrix.os, 'windows') + shell: cmd /C call {0} + run: >- + python -m pytest -v tests/python/all-platform-minimal-test + - name: Test@MacOS + if: startsWith(matrix.os, 'macOS') + shell: bash -l {0} + run: >- + python -m pytest -v tests/python/all-platform-minimal-test diff --git a/conda/build-environment.yaml b/conda/build-environment.yaml new file mode 100644 index 000000000000..600933fc18b3 --- /dev/null +++ b/conda/build-environment.yaml @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Build environment. +name: tvm-build + +channels: + - anaconda + - conda-forge + +dependencies: + - conda-build + - git + - llvmdev ==10.0.0 + - numpy + - pytest + - cython + - cmake + - bzip2 + - make + - scipy diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml index 67ba7fec1869..0113850a6602 100644 --- a/conda/recipe/meta.yaml +++ b/conda/recipe/meta.yaml @@ -71,14 +71,20 @@ outputs: script: install_tvm_python.sh # [not win] script: install_tvm_python.bat # [win] string: {{ build_tag }}_py{{ PY_VER | replace('.', '')}} + # skip bytecompile pyc to speedup CI speed + skip_compile_pyc: + - "*/**/*.py" requirements: host: - python - setuptools + - cython + - {{ pin_subpackage(pkg_name + '-libs', exact=True) }} run: - python - decorator - psutil + - scipy - {{ pin_compatible('numpy') }} - {{ pin_subpackage(pkg_name + '-libs', exact=True) }} diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 43038998639e..a5db34c75400 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -522,22 +522,30 @@ class TVMArgValue : public TVMPODValue_ { * * \note For internal development purpose only. 
*/ -class TVMMovableArgValue_ : public TVMArgValue { +class TVMMovableArgValue_ : public TVMPODValue_ { public: - TVMMovableArgValue_(TVMValue value, int type_code) : TVMArgValue(value, type_code) {} + TVMMovableArgValue_(TVMValue value, int type_code) : TVMPODValue_(value, type_code) {} // reuse converter from parent - using TVMArgValue::operator double; - using TVMArgValue::operator int64_t; - using TVMArgValue::operator uint64_t; - using TVMArgValue::operator int; - using TVMArgValue::operator bool; - using TVMArgValue::operator void*; - using TVMArgValue::operator DLTensor*; - using TVMArgValue::operator TVMContext; - using TVMArgValue::operator std::string; - using TVMArgValue::operator DLDataType; - using TVMArgValue::operator DataType; - using TVMArgValue::operator PackedFunc; + using TVMPODValue_::operator double; + using TVMPODValue_::operator int64_t; + using TVMPODValue_::operator uint64_t; + using TVMPODValue_::operator int; + using TVMPODValue_::operator bool; + using TVMPODValue_::operator void*; + using TVMPODValue_::operator DLTensor*; + using TVMPODValue_::operator NDArray; + using TVMPODValue_::operator TVMContext; + using TVMPODValue_::operator Module; + // reuse conversion rule from ArgValue. + operator std::string() const { return AsArgValue().operator std::string(); } + operator PackedFunc() const { return AsArgValue().operator PackedFunc(); } + template + operator TypedPackedFunc() const { + return TypedPackedFunc(operator PackedFunc()); + } + operator DLDataType() const { return AsArgValue().operator DLDataType(); } + operator DataType() const { return AsArgValue().operator DataType(); } + operator TVMArgValue() const { return AsArgValue(); } /*! * \brief Helper converter function. * Try to move out an argument if possible, @@ -546,6 +554,10 @@ class TVMMovableArgValue_ : public TVMArgValue { template ::value>::type> inline operator T() const; + + private: + /*! \return The arg value repr of the value. */ + TVMArgValue AsArgValue() const { return TVMArgValue(value_, type_code_); } }; /*! 
@@ -1450,7 +1462,7 @@ inline TVMMovableArgValue_::operator T() const { } } // fallback - return PackedFuncValueConverter::From(*this); + return PackedFuncValueConverter::From(AsArgValue()); } template diff --git a/python/setup.py b/python/setup.py index fff7a0ed3bb1..092800c2314d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -35,6 +35,8 @@ from setuptools.extension import Extension CURRENT_DIR = os.path.dirname(__file__) +FFI_MODE = os.environ.get("TVM_FFI", "auto") +CONDA_BUILD = os.getenv("CONDA_BUILD") is not None def get_lib_path(): @@ -45,7 +47,7 @@ def get_lib_path(): libinfo = {"__file__": libinfo_py} exec(compile(open(libinfo_py, "rb").read(), libinfo_py, "exec"), libinfo, libinfo) version = libinfo["__version__"] - if not os.getenv("CONDA_BUILD"): + if not CONDA_BUILD: lib_path = libinfo["find_lib_path"]() libs = [lib_path[0]] if libs[0].find("runtime") == -1: @@ -63,14 +65,14 @@ def get_lib_path(): def config_cython(): """Try to configure cython and return cython configuration""" - if os.name == "nt": - print("WARNING: Cython is not supported on Windows, will compile without cython module") - return [] - sys_cflags = sysconfig.get_config_var("CFLAGS") - - if "i386" in sys_cflags and "x86_64" in sys_cflags: - print("WARNING: Cython library may not be compiled correctly with both i386 and x64") - return [] + if FFI_MODE not in ("cython"): + if os.name == "nt" and not CONDA_BUILD: + print("WARNING: Cython is not supported on Windows, will compile without cython module") + return [] + sys_cflags = sysconfig.get_config_var("CFLAGS") + if sys_cflags and "i386" in sys_cflags and "x86_64" in sys_cflags: + print("WARNING: Cython library may not be compiled correctly with both i386 and x64") + return [] try: from Cython.Build import cythonize @@ -81,12 +83,18 @@ def config_cython(): subdir = "_cy2" ret = [] path = "tvm/_ffi/_cython" + extra_compile_args = ["-std=c++14"] if os.name == "nt": library_dirs = ["tvm", "../build/Release", "../build"] - libraries = ["libtvm"] + libraries = ["tvm"] + extra_compile_args = None + # library is available via conda env. 
+ if CONDA_BUILD: + library_dirs = [os.environ["LIBRARY_LIB"]] else: library_dirs = None libraries = None + for fn in os.listdir(path): if not fn.endswith(".pyx"): continue @@ -99,14 +107,16 @@ def config_cython(): "../3rdparty/dmlc-core/include", "../3rdparty/dlpack/include", ], - extra_compile_args=["-std=c++14"], + extra_compile_args=extra_compile_args, library_dirs=library_dirs, libraries=libraries, language="c++", ) ) return cythonize(ret, compiler_directives={"language_level": 3}) - except ImportError: + except ImportError as error: + if FFI_MODE == "cython": + raise error print("WARNING: Cython is not installed, will compile without cython module") return [] @@ -121,7 +131,7 @@ def is_pure(self): include_libs = False wheel_include_libs = False -if not os.getenv("CONDA_BUILD"): +if not CONDA_BUILD: if "bdist_wheel" in sys.argv: wheel_include_libs = True else: diff --git a/python/tvm/_ffi/registry.py b/python/tvm/_ffi/registry.py index b42dada9c792..6637cd174391 100644 --- a/python/tvm/_ffi/registry.py +++ b/python/tvm/_ffi/registry.py @@ -29,8 +29,10 @@ from ._cy3.core import _register_object from ._cy3.core import _reg_extension from ._cy3.core import convert_to_tvm_func, _get_global_func, PackedFuncBase -except (RuntimeError, ImportError): +except (RuntimeError, ImportError) as error: # pylint: disable=wrong-import-position,unused-import + if _FFI_MODE == "cython": + raise error from ._ctypes.object import _register_object from ._ctypes.ndarray import _reg_extension from ._ctypes.packed_func import convert_to_tvm_func, _get_global_func, PackedFuncBase diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index b0a3c74eeefa..2f616ce879c9 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -30,8 +30,10 @@ raise ImportError() from tvm._ffi._cy3.core import _set_class_ndarray, _make_array, _from_dlpack from tvm._ffi._cy3.core import NDArrayBase -except (RuntimeError, ImportError): +except (RuntimeError, ImportError) as error: # pylint: disable=wrong-import-position + if _FFI_MODE == "cython": + raise error from tvm._ffi._ctypes.ndarray import _set_class_ndarray, _make_array, _from_dlpack from tvm._ffi._ctypes.ndarray import NDArrayBase diff --git a/python/tvm/runtime/object.py b/python/tvm/runtime/object.py index 35f1f4e57da0..bfee7f544f9c 100644 --- a/python/tvm/runtime/object.py +++ b/python/tvm/runtime/object.py @@ -28,8 +28,10 @@ raise ImportError() from tvm._ffi._cy3.core import _set_class_object, _set_class_object_generic from tvm._ffi._cy3.core import ObjectBase, PyNativeObject -except (RuntimeError, ImportError): +except (RuntimeError, ImportError) as error: # pylint: disable=wrong-import-position,unused-import + if _FFI_MODE == "cython": + raise error from tvm._ffi._ctypes.packed_func import _set_class_object, _set_class_object_generic from tvm._ffi._ctypes.object import ObjectBase, PyNativeObject diff --git a/python/tvm/runtime/packed_func.py b/python/tvm/runtime/packed_func.py index 35a4783b6dff..bcd3cd733dc6 100644 --- a/python/tvm/runtime/packed_func.py +++ b/python/tvm/runtime/packed_func.py @@ -27,8 +27,10 @@ from tvm._ffi._cy3.core import _set_class_packed_func, _set_class_module from tvm._ffi._cy3.core import PackedFuncBase from tvm._ffi._cy3.core import convert_to_tvm_func -except (RuntimeError, ImportError): +except (RuntimeError, ImportError) as error: # pylint: disable=wrong-import-position + if _FFI_MODE == "cython": + raise error from tvm._ffi._ctypes.packed_func import _set_class_packed_func, _set_class_module 
 from tvm._ffi._ctypes.packed_func import PackedFuncBase
 from tvm._ffi._ctypes.packed_func import convert_to_tvm_func
diff --git a/tests/python/all-platform-minimal-test/README.md b/tests/python/all-platform-minimal-test/README.md
new file mode 100644
index 000000000000..d1f53b9163a3
--- /dev/null
+++ b/tests/python/all-platform-minimal-test/README.md
@@ -0,0 +1,29 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Core Cross Platform Regression Tests
+
+CI unit test cases that will run on all platforms.
+To reduce the CI burden, we only put in test cases that are platform-sensitive.
+Please use the following guidelines:
+
+- Always consider adding tests to the unittest folder first.
+- If a problem passes the Linux pipeline but fails on Windows or macOS,
+  we should isolate the problem, write a minimal regression test case
+  and add it to this folder.
+- A test case in this folder should be minimal and finish in a reasonable amount of time.
+- Document why it should be in the all-platform-minimal-test folder.
diff --git a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py
new file mode 100644
index 000000000000..6c2897b1b561
--- /dev/null
+++ b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py
@@ -0,0 +1,107 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""LLVM enablement tests."""
+
+import tvm
+import tvm.testing
+from tvm import te
+from tvm import topi
+from tvm.contrib import util, clang
+import numpy as np
+import ctypes
+import math
+import re
+
+
+@tvm.testing.requires_llvm
+def test_llvm_add_pipeline():
+    """all-platform-minimal-test: Check LLVM enablement."""
+    nn = 1024
+    n = tvm.runtime.convert(nn)
+    A = te.placeholder((n,), name="A")
+    B = te.placeholder((n,), name="B")
+    AA = te.compute((n,), lambda *i: A(*i), name="A")
+    BB = te.compute((n,), lambda *i: B(*i), name="B")
+    T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T")
+    C = te.compute(A.shape, lambda *i: T(*i), name="C")
+    s = te.create_schedule(C.op)
+    xo, xi = s[C].split(C.op.axis[0], factor=4)
+    xo1, xo2 = s[C].split(xo, factor=13)
+    s[C].parallel(xo2)
+    s[C].pragma(xo1, "parallel_launch_point")
+    s[C].pragma(xo2, "parallel_stride_pattern")
+    s[C].pragma(xo2, "parallel_barrier_when_finish")
+    s[C].vectorize(xi)
+
+    def check_llvm():
+        # Specifically allow offset to test codepath when offset is available
+        Ab = tvm.tir.decl_buffer(
+            A.shape, A.dtype, elem_offset=te.size_var("Aoffset"), offset_factor=8, name="A"
+        )
+        binds = {A: Ab}
+        # BUILD and invoke the kernel.
+        f = tvm.build(s, [A, B, C], "llvm", binds=binds)
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+ n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + f(a, b, c) + tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) + + check_llvm() + + +@tvm.testing.requires_llvm +def test_llvm_import(): + """all-platform-minimal-test: check shell dependent clang behavior.""" + # extern "C" is necessary to get the correct signature + cc_code = """ + extern "C" float my_add(float x, float y) { + return x + y; + } + """ + n = 10 + A = te.placeholder((n,), name="A") + B = te.compute( + (n,), lambda *i: tvm.tir.call_pure_extern("float32", "my_add", A(*i), 1.0), name="B" + ) + + def check_llvm(use_file): + if not clang.find_clang(required=False): + print("skip because clang is not available") + return + temp = util.tempdir() + ll_path = temp.relpath("temp.ll") + ll_code = clang.create_llvm(cc_code, output=ll_path) + s = te.create_schedule(B.op) + if use_file: + s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path) + else: + s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code) + # BUILD and invoke the kernel. + f = tvm.build(s, [A, B], "llvm") + ctx = tvm.cpu(0) + # launch the kernel. + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) + f(a, b) + tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0) + + check_llvm(use_file=True) + check_llvm(use_file=False) diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py similarity index 98% rename from tests/python/unittest/test_runtime_ndarray.py rename to tests/python/all-platform-minimal-test/test_runtime_ndarray.py index 0183ecd81864..bd9fb738ba7b 100644 --- a/tests/python/unittest/test_runtime_ndarray.py +++ b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Basic runtime enablement test.""" + import tvm from tvm import te import numpy as np diff --git a/tests/python/all-platform-minimal-test/test_runtime_packed_func.py b/tests/python/all-platform-minimal-test/test_runtime_packed_func.py new file mode 100644 index 000000000000..c6efbb472c4a --- /dev/null +++ b/tests/python/all-platform-minimal-test/test_runtime_packed_func.py @@ -0,0 +1,166 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Test packed function FFI.""" +import tvm +from tvm import te +import tvm.testing +import numpy as np + + +def test_get_global(): + targs = (10, 10.0, "hello") + # register into global function table + @tvm.register_func + def my_packed_func(*args): + assert tuple(args) == targs + return 10 + + # get it out from global function table + f = tvm.get_global_func("my_packed_func") + assert isinstance(f, tvm.runtime.PackedFunc) + y = f(*targs) + assert y == 10 + + +def test_get_callback_with_node(): + x = tvm.runtime.convert(10) + + def test(y): + assert y.handle != x.handle + return y + + f2 = tvm.runtime.convert(test) + # register into global function table + @tvm.register_func + def my_callback_with_node(y, f): + assert y == x + return f(y) + + # get it out from global function table + f = tvm.get_global_func("my_callback_with_node") + assert isinstance(f, tvm.runtime.PackedFunc) + y = f(x, f2) + assert y.value == 10 + + +def test_return_func(): + def addy(y): + def add(x): + return tvm.runtime.convert(x + y) + + return add + + myf = tvm.runtime.convert(addy) + f = myf(10) + assert f(11).value == 21 + + +def test_convert(): + # convert a function to tvm function + targs = (10, 10.0, "hello", 10) + + def myfunc(*args): + assert tuple(args) == targs + + f = tvm.runtime.convert(myfunc) + assert isinstance(f, tvm.runtime.PackedFunc) + + +def test_byte_array(): + s = "hello" + a = bytearray(s, encoding="ascii") + + def myfunc(ss): + assert ss == a + + f = tvm.runtime.convert(myfunc) + f(a) + + +def test_empty_array(): + def myfunc(ss): + assert tuple(ss) == () + + x = tvm.runtime.convert(()) + tvm.runtime.convert(myfunc)(x) + + +def test_ctx(): + def test_ctx_func(ctx): + assert tvm.gpu(7) == ctx + return tvm.cpu(0) + + x = test_ctx_func(tvm.gpu(7)) + assert x == tvm.cpu(0) + x = tvm.opencl(10) + x = tvm.testing.context_test(x, x.device_type, x.device_id) + assert x == tvm.opencl(10) + + +def test_rvalue_ref(): + def callback(x, expected_count): + assert expected_count == tvm.testing.object_use_count(x) + return x + + f = tvm.runtime.convert(callback) + + def check0(): + x = tvm.tir.Var("x", "int32") + assert tvm.testing.object_use_count(x) == 1 + f(x, 2) + y = f(x._move(), 1) + assert x.handle.value == None + + def check1(): + x = tvm.tir.Var("x", "int32") + assert tvm.testing.object_use_count(x) == 1 + y = f(x, 2) + z = f(x._move(), 2) + assert x.handle.value == None + assert y.handle.value is not None + + check0() + check1() + + +def test_numpy_scalar(): + maxint = (1 << 63) - 1 + assert tvm.testing.echo(np.int64(maxint)) == maxint + + +def test_ndarray_args(): + def check(arr): + assert not arr.is_view + assert tvm.testing.object_use_count(arr) == 2 + + fcheck = tvm.runtime.convert(check) + x = tvm.nd.array([1, 2, 3]) + fcheck(x) + assert tvm.testing.object_use_count(x) == 1 + + +if __name__ == "__main__": + test_ndarray_args() + test_numpy_scalar() + test_rvalue_ref() + test_empty_array() + test_get_global() + test_get_callback_with_node() + test_convert() + test_return_func() + test_byte_array() + test_ctx() diff --git a/tests/python/unittest/test_runtime_packed_func.py b/tests/python/unittest/test_runtime_trace.py similarity index 72% rename from tests/python/unittest/test_runtime_packed_func.py rename to tests/python/unittest/test_runtime_trace.py index b681e4fc25d7..951e88d7efdd 100644 --- a/tests/python/unittest/test_runtime_packed_func.py +++ b/tests/python/unittest/test_runtime_trace.py @@ -16,126 +16,9 @@ # under the License. 
import tvm from tvm import te -import tvm.testing import numpy as np -def test_get_global(): - targs = (10, 10.0, "hello") - # register into global function table - @tvm.register_func - def my_packed_func(*args): - assert tuple(args) == targs - return 10 - - # get it out from global function table - f = tvm.get_global_func("my_packed_func") - assert isinstance(f, tvm.runtime.PackedFunc) - y = f(*targs) - assert y == 10 - - -def test_get_callback_with_node(): - x = tvm.runtime.convert(10) - - def test(y): - assert y.handle != x.handle - return y - - f2 = tvm.runtime.convert(test) - # register into global function table - @tvm.register_func - def my_callback_with_node(y, f): - assert y == x - return f(y) - - # get it out from global function table - f = tvm.get_global_func("my_callback_with_node") - assert isinstance(f, tvm.runtime.PackedFunc) - y = f(x, f2) - assert y.value == 10 - - -def test_return_func(): - def addy(y): - def add(x): - return tvm.runtime.convert(x + y) - - return add - - myf = tvm.runtime.convert(addy) - f = myf(10) - assert f(11).value == 21 - - -def test_convert(): - # convert a function to tvm function - targs = (10, 10.0, "hello", 10) - - def myfunc(*args): - assert tuple(args) == targs - - f = tvm.runtime.convert(myfunc) - assert isinstance(f, tvm.runtime.PackedFunc) - - -def test_byte_array(): - s = "hello" - a = bytearray(s, encoding="ascii") - - def myfunc(ss): - assert ss == a - - f = tvm.runtime.convert(myfunc) - f(a) - - -def test_empty_array(): - def myfunc(ss): - assert tuple(ss) == () - - x = tvm.runtime.convert(()) - tvm.runtime.convert(myfunc)(x) - - -def test_ctx(): - def test_ctx_func(ctx): - assert tvm.gpu(7) == ctx - return tvm.cpu(0) - - x = test_ctx_func(tvm.gpu(7)) - assert x == tvm.cpu(0) - x = tvm.opencl(10) - x = tvm.testing.context_test(x, x.device_type, x.device_id) - assert x == tvm.opencl(10) - - -def test_rvalue_ref(): - def callback(x, expected_count): - assert expected_count == tvm.testing.object_use_count(x) - return x - - f = tvm.runtime.convert(callback) - - def check0(): - x = tvm.tir.Var("x", "int32") - assert tvm.testing.object_use_count(x) == 1 - f(x, 2) - y = f(x._move(), 1) - assert x.handle.value == None - - def check1(): - x = tvm.tir.Var("x", "int32") - assert tvm.testing.object_use_count(x) == 1 - y = f(x, 2) - z = f(x._move(), 2) - assert x.handle.value == None - assert y.handle.value is not None - - check0() - check1() - - def test_trace_default_action(): n = 2 x = te.placeholder((n, n, n), name="X", dtype="float32") @@ -328,33 +211,7 @@ def check_assign(dtype): check_assign(t) -def test_numpy_scalar(): - maxint = (1 << 63) - 1 - assert tvm.testing.echo(np.int64(maxint)) == maxint - - -def test_ndarray_args(): - def check(arr): - assert not arr.is_view - assert tvm.testing.object_use_count(arr) == 2 - - fcheck = tvm.runtime.convert(check) - x = tvm.nd.array([1, 2, 3]) - fcheck(x) - assert tvm.testing.object_use_count(x) == 1 - - if __name__ == "__main__": - test_ndarray_args() - test_numpy_scalar() - test_rvalue_ref() - test_empty_array() - test_get_global() - test_get_callback_with_node() - test_convert() - test_return_func() - test_byte_array() - test_ctx() test_trace_expr_assign() test_trace_expr_sum_generated() test_trace_expr_sum_custom() diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index e9f7a01f7a18..19773e59a777 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -76,45 +76,6 @@ 
def use_llvm_intrinsic(A, C): f = tvm.build(s, [A, C], target="llvm") -@tvm.testing.requires_llvm -def test_llvm_import(): - # extern "C" is necessary to get the correct signature - cc_code = """ - extern "C" float my_add(float x, float y) { - return x + y; - } - """ - n = 10 - A = te.placeholder((n,), name="A") - B = te.compute( - (n,), lambda *i: tvm.tir.call_pure_extern("float32", "my_add", A(*i), 1.0), name="B" - ) - - def check_llvm(use_file): - if not clang.find_clang(required=False): - print("skip because clang is not available") - return - temp = util.tempdir() - ll_path = temp.relpath("temp.ll") - ll_code = clang.create_llvm(cc_code, output=ll_path) - s = te.create_schedule(B.op) - if use_file: - s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path) - else: - s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code) - # BUILD and invoke the kernel. - f = tvm.build(s, [A, B], "llvm") - ctx = tvm.cpu(0) - # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - f(a, b) - tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0) - - check_llvm(use_file=True) - check_llvm(use_file=False) - - @tvm.testing.requires_llvm def test_llvm_lookup_intrin(): ib = tvm.tir.ir_builder.create() @@ -147,45 +108,6 @@ def check_llvm(): check_llvm() -@tvm.testing.requires_llvm -def test_llvm_add_pipeline(): - nn = 1024 - n = tvm.runtime.convert(nn) - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - AA = te.compute((n,), lambda *i: A(*i), name="A") - BB = te.compute((n,), lambda *i: B(*i), name="B") - T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T") - C = te.compute(A.shape, lambda *i: T(*i), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - xo1, xo2 = s[C].split(xo, factor=13) - s[C].parallel(xo2) - s[C].pragma(xo1, "parallel_launch_point") - s[C].pragma(xo2, "parallel_stride_pattern") - s[C].pragma(xo2, "parallel_barrier_when_finish") - s[C].vectorize(xi) - - def check_llvm(): - # Specifically allow offset to test codepath when offset is available - Ab = tvm.tir.decl_buffer( - A.shape, A.dtype, elem_offset=te.size_var("Aoffset"), offset_factor=8, name="A" - ) - binds = {A: Ab} - # BUILD and invoke the kernel. - f = tvm.build(s, [A, B, C], "llvm", binds=binds) - ctx = tvm.cpu(0) - # launch the kernel. - n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - f(a, b, c) - tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) - - check_llvm() - - @tvm.testing.requires_llvm def test_llvm_persist_parallel(): n = 128 diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh index 622646b76189..0aaf9fc86664 100755 --- a/tests/scripts/task_python_unittest.sh +++ b/tests/scripts/task_python_unittest.sh @@ -23,7 +23,9 @@ source tests/scripts/setup-pytest-env.sh # cleanup pycache find . 
-type f -path "*.pyc" | xargs rm -f +make cython3 +TVM_FFI=ctypes python3 -m pytest tests/python/all-platform-minimal-test +TVM_FFI=cython python3 -m pytest tests/python/all-platform-minimal-test TVM_FFI=ctypes python3 -m pytest tests/python/unittest -make cython3 TVM_FFI=cython python3 -m pytest tests/python/unittest From 5b03ce728bb545843c370bd3ed5f7342176d4c63 Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Mon, 26 Oct 2020 20:21:02 +0800 Subject: [PATCH 055/258] [BUGFIX] Fix topi matrix multiplication using tensorcore to run faster (#6749) --- python/tvm/topi/cuda/dense_tensorcore.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tvm/topi/cuda/dense_tensorcore.py b/python/tvm/topi/cuda/dense_tensorcore.py index 8c7d7cc76677..99f28a1fc4e6 100644 --- a/python/tvm/topi/cuda/dense_tensorcore.py +++ b/python/tvm/topi/cuda/dense_tensorcore.py @@ -199,6 +199,8 @@ def _schedule_dense_tensorcore(cfg, s, C): bb, bbii = s[CS].split(bb, factor=warp_row_tiles) oo, ooii = s[CS].split(oo, factor=warp_col_tiles) s[CS].reorder(bb, oo, bbii, ooii, bbi, ooi) + s[CS].bind(bb, thread_y) + s[CS].bind(oo, thread_z) # Schedule for wmma computation s[CF].compute_at(s[CS], oo) From a4e719f7b23cdf51b0db63fa8e34d04d37bd7525 Mon Sep 17 00:00:00 2001 From: shibuiwilliam Date: Mon, 26 Oct 2020 21:21:27 +0900 Subject: [PATCH 056/258] [Fix,Conda] update conda download url (#6760) Co-authored-by: Shibui Yusuke --- docker/install/ubuntu_install_conda.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_conda.sh b/docker/install/ubuntu_install_conda.sh index ef059ce42aa0..6f6019340293 100755 --- a/docker/install/ubuntu_install_conda.sh +++ b/docker/install/ubuntu_install_conda.sh @@ -20,7 +20,7 @@ set -e set -u set -o pipefail -cd /tmp && wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh +cd /tmp && wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh chmod +x Miniconda3-latest-Linux-x86_64.sh /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda rm /tmp/Miniconda3-latest-Linux-x86_64.sh From 1e1a5e34a8182293d2ef9f56277944a68fa20aaf Mon Sep 17 00:00:00 2001 From: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Date: Mon, 26 Oct 2020 20:22:09 +0800 Subject: [PATCH 057/258] [ARITH] iter_affine_map bug fix, stride generalize (#6753) --- include/tvm/arith/iter_affine_map.h | 8 ++ src/arith/iter_affine_map.cc | 98 ++++++++++++++----- .../unittest/test_arith_iter_affine_map.py | 24 +++++ 3 files changed, 105 insertions(+), 25 deletions(-) diff --git a/include/tvm/arith/iter_affine_map.h b/include/tvm/arith/iter_affine_map.h index 00f8cf6ee9f0..e2e081d2be89 100644 --- a/include/tvm/arith/iter_affine_map.h +++ b/include/tvm/arith/iter_affine_map.h @@ -48,7 +48,9 @@ #ifndef TVM_ARITH_ITER_AFFINE_MAP_H_ #define TVM_ARITH_ITER_AFFINE_MAP_H_ +#include #include +#include namespace tvm { namespace arith { @@ -187,6 +189,12 @@ class IterSplitExpr : public IterMapExpr { * \param source The source expression. */ TVM_DLL explicit IterSplitExpr(IterMark source); + /*! + * \brief constructor from just source. + * \param source The source expression. + * \param scale The additional scaling factor. + */ + TVM_DLL explicit IterSplitExpr(IterMark source, PrimExpr scale); /*! * \brief constructor * \param source The source expression. 
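[Editor's note: before the implementation diff, here is a small usage sketch of
the stride generalization this patch adds. It mirrors the unit tests appended
at the end of the patch; the `detect_iter_map` call and the Var→Range domain
dict are taken from those tests, the rest is illustrative.]

```python
# Sketch of the newly supported stride pattern (mirrors the added tests).
import tvm
from tvm import tir

x = tir.Var("x", "int32")
y = tir.Var("y", "int32")
dom = {x: tvm.ir.Range(0, 3), y: tvm.ir.Range(0, 2)}

# x*4 + y*2 = (x*2 + y)*2: the common factor 2 becomes the scale of one
# fused IterSplitExpr instead of blocking the fusion.
res = tvm.arith.detect_iter_map([x * 4 + y * 2], dom)
assert len(res) == 1  # a single fused iterator of extent 6, scale 2
```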
diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index 283ffa646567..7896db73d10a 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -51,7 +51,7 @@ TVM_REGISTER_NODE_TYPE(IterMarkNode); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); - p->stream << "IterMark(" << op->source << ", extent=" << op->extent; + p->stream << "IterMark(" << op->source << ", extent=" << op->extent << ")"; }); IterSplitExpr::IterSplitExpr(IterMark source) { @@ -65,6 +65,17 @@ IterSplitExpr::IterSplitExpr(IterMark source) { data_ = std::move(n); } +IterSplitExpr::IterSplitExpr(IterMark source, PrimExpr scale) { + auto n = make_object(); + auto one = make_const(source->source->dtype, 1); + n->dtype = source->source->dtype; + n->source = std::move(source); + n->extent = n->source->extent; + n->lower_factor = one; + n->scale = std::move(scale); + data_ = std::move(n); +} + IterSplitExpr::IterSplitExpr(IterMark source, PrimExpr lower_factor, PrimExpr extent, PrimExpr scale) { auto n = make_object(); @@ -87,7 +98,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); p->stream << "IterSplit(" << op->source << ", lower_factor=" << op->lower_factor - << ", extent=" << op->extent << ", scale=" << op->scale; + << ", extent=" << op->extent << ", scale=" << op->scale << ")"; }); IterSumExpr::IterSumExpr(Array args, PrimExpr base) { @@ -197,11 +208,11 @@ class IterMapRewriter : public ExprMutator { // All the splits that refers to the itermark covers its extent. // The splits do not overlap with each other. collector.Collect(indices); - for (IterMark mark : collector.visited_) { - if (TryNormalizeSplits(mark, collector.mark2splits_[mark]).size() == 0) return false; + for (const IterMark& mark : collector.visited_) { + if (TryNormalizeSplits(mark, collector.mark2splits_[mark]).empty()) return false; } // all input marks must be visited - for (auto mark : input_marks_) { + for (const auto& mark : input_marks_) { if (collector.visited_.count(mark) == 0) return false; } return true; @@ -217,7 +228,7 @@ class IterMapRewriter : public ExprMutator { } // Normal mutation without normalization. - PrimExpr DirectMutate(PrimExpr expr) { return ExprMutator::VisitExpr(expr); } + PrimExpr DirectMutate(const PrimExpr& expr) { return ExprMutator::VisitExpr(expr); } PrimExpr VisitExpr_(const VarNode* op) final; PrimExpr VisitExpr_(const AddNode* op) final; @@ -232,8 +243,8 @@ class IterMapRewriter : public ExprMutator { size_t operator()(const IterSumExpr& value) const { // for now only hash on source index. size_t hash = value->args.size(); - for (size_t i = 0; i < value->args.size(); ++i) { - hash = support::HashCombine(hash, std::hash()(value->args[i]->source.get())); + for (const auto& arg : value->args) { + hash = support::HashCombine(hash, std::hash()(arg->source.get())); } return hash; } @@ -246,7 +257,7 @@ class IterMapRewriter : public ExprMutator { if (!equal(lhs->base, rhs->base)) return false; for (size_t i = 0; i < lhs->args.size(); ++i) { auto lvalue = lhs->args[i]; - auto rvalue = lhs->args[i]; + auto rvalue = rhs->args[i]; if (!lvalue->source.same_as(rvalue->source)) return false; if (!equal(lvalue->lower_factor, rvalue->lower_factor)) return false; if (!equal(lvalue->scale, rvalue->scale)) return false; @@ -330,7 +341,7 @@ class IterMapRewriter : public ExprMutator { * \param expr The input expr. 
* \return The transformed IterSumExpr. */ - IterSumExpr ToIterSumExpr(PrimExpr expr) { + static IterSumExpr ToIterSumExpr(const PrimExpr& expr) { if (const auto* op = expr.as()) { return GetRef(op); } else if (const auto* op = expr.as()) { @@ -343,6 +354,11 @@ class IterMapRewriter : public ExprMutator { // Try to normalize IterSum into a fused IterMark // return a corresponding splitexpr if needed. + // IterSum = x1*c1 + x2*c2 + ... + xn*cn + // = (x1*s1 + x2*s2 + ... + xn)*cn + // = y*cn (IterMark y => x1*s1 + x2*s2 + ... + xn) + // = [IterSplit(IterMark(y), scale=cn)] + // return a corresponding IterSplitExpr if needed. Optional TryFuseIters(IterSumExpr expr) { if (!is_zero(expr->base)) return NullOpt; if (expr->args.size() == 1) return expr->args[0]; @@ -351,10 +367,22 @@ class IterMapRewriter : public ExprMutator { std::vector iters; iters.reserve(expr->args.size()); // canonicalize the expression + // find the base scale first + Optional base_scale = NullOpt; + size_t base_index = 0; + for (size_t i = 0; i < expr->args.size(); ++i) { + if (const auto* op = expr->args[i]->scale.as()) { + if (!base_scale || op->value < base_scale.value()->value) { + base_scale = GetRef(op); + base_index = i; + } + } + } + if (!base_scale) return NullOpt; // check if it can be remapped into a fused pattern. - PrimExpr expected_scale = make_const(expr->base->dtype, 1); + PrimExpr expected_scale = base_scale.value(); for (size_t i = 0; i < expr->args.size(); ++i) { - size_t j = 0; + size_t j = i == 0 ? base_index : 0; for (; j < expr->args.size(); ++j) { if (!visited[j] && CanProveEqual(expr->args[j]->scale, expected_scale)) break; } @@ -362,20 +390,22 @@ class IterMapRewriter : public ExprMutator { return NullOpt; } visited[j] = true; - iters.push_back(expr->args[j]); + auto arg = expr->args[j]; + arg.CopyOnWrite()->scale = div(expr->args[j]->scale, base_scale.value()); + iters.push_back(arg); expected_scale *= expr->args[j]->extent; } // update the iterator to use the canonicalized form expr.CopyOnWrite()->args = Array(iters.rbegin(), iters.rend()); auto it = sum_fuse_map_.find(expr); if (it != sum_fuse_map_.end()) return it->second; - auto mark = IterMark(expr, expected_scale); - IterSplitExpr split(mark); + auto mark = IterMark(expr, div(expected_scale, base_scale.value())); + IterSplitExpr split(mark, base_scale.value()); sum_fuse_map_[expr] = split; return split; } - bool CanProveDivisible(PrimExpr lhs, PrimExpr rhs) { + bool CanProveDivisible(const PrimExpr& lhs, const PrimExpr& rhs) { const auto* clhs = lhs.as(); const auto* crhs = rhs.as(); if (clhs && crhs) return clhs->value % crhs->value == 0; @@ -408,9 +438,9 @@ class IterMapRewriter : public ExprMutator { } } - static void AddToLhs(IterSumExprNode* lhs, IterSumExpr rhs, int sign) { - for (size_t i = 0; i < rhs->args.size(); ++i) { - AddToLhs(lhs, rhs->args[i], sign); + static void AddToLhs(IterSumExprNode* lhs, const IterSumExpr& rhs, int sign) { + for (const auto& arg : rhs->args) { + AddToLhs(lhs, arg, sign); } if (sign > 0) { lhs->base += rhs->base; @@ -419,7 +449,7 @@ class IterMapRewriter : public ExprMutator { } } - static void MulToLhs(IterSumExprNode* lhs, PrimExpr rhs) { + static void MulToLhs(IterSumExprNode* lhs, const PrimExpr& rhs) { for (size_t i = 0; i < lhs->args.size(); ++i) { IterSplitExpr lvalue = lhs->args[i]; lvalue.CopyOnWrite()->scale *= rhs; @@ -480,7 +510,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const AddNode* op) { } // canonical form simplification. 
- IterSumExpr ret = ToIterSumExpr(std::move(a)); + IterSumExpr ret = ToIterSumExpr(a); if (!b->IsInstance()) { ret.CopyOnWrite()->base += b; @@ -516,7 +546,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const SubNode* op) { } // canonical form simplification. - IterSumExpr ret = ToIterSumExpr(std::move(a)); + IterSumExpr ret = ToIterSumExpr(a); if (!b->IsInstance()) { ret.CopyOnWrite()->base -= b; @@ -574,13 +604,16 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) { } PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs) { + // floordiv(x*scale, rhs) if (is_one(rhs)) return std::move(lhs); if (!is_one(lhs->scale)) { if (CanProveDivisible(lhs->scale, rhs)) { + // floordiv(x*c1*c2, c2) = x*c1, c1=scale/rhs lhs.CopyOnWrite()->scale = floordiv(lhs->scale, rhs); return std::move(lhs); } else { if (CanProveDivisible(rhs, lhs->scale)) { + // floordiv(x*c1, c1*c2) = floordiv(x, c2), c2=rhs/scale rhs = floordiv(rhs, lhs->scale); lhs.CopyOnWrite()->scale = make_const(rhs->dtype, 1); } else { @@ -591,7 +624,16 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs) { } } + // We handle scale!=1 in above code, hence we only consider floordiv(x, rhs) below + // where x=floormod(floordiv(iter, lower_factor), extent) if (CanProveDivisible(lhs->extent, rhs)) { + // floordiv(floormod(floordiv(iter, lower_factor), c1c2), c1) + // = floordiv(floormod(y, c1c2), c1), where y=floordiv(iter, lower_factor) + // = floordiv(floormod(sc1c2+tc1+u, c1c2), c1), where y=sc1c2+tc1+u, tlower_factor *= rhs; ptr_lhs->extent = analyzer_->Simplify(floordiv(ptr_lhs->extent, rhs)); @@ -631,7 +673,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) { } if (a->IsInstance()) { - IterSumExpr ret = Downcast(std::move(a)); + IterSumExpr ret = Downcast(a); if (auto opt = TryFuseIters(ret)) { return SplitFloorDivConst(opt.value(), b); } else { @@ -646,13 +688,16 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) { } PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs) { + // floormod(x*scale, rhs) if (is_one(rhs)) return make_zero(lhs->dtype); if (!is_one(lhs->scale)) { + // floormod(x*c1*c2, c1) = 0 if (CanProveDivisible(lhs->scale, rhs)) { return make_zero(lhs->dtype); } else { if (CanProveDivisible(rhs, lhs->scale)) { - rhs = floormod(rhs, lhs->scale); + // floormod(x*c1, c1*c2) = (floormod(x, c2)) * c1, where c2 = rhs/scale + rhs = floordiv(rhs, lhs->scale); } else { // mark as unresolved. 
       ++unresolved_count_;
       return floormod(lhs, rhs);
     }
   }
 
+  // floormod(x, rhs) where x=floormod(floordiv(iter, lower_factor), extent)
   if (CanProveDivisible(lhs->extent, rhs)) {
+    // floormod(floormod(floordiv(iter, lower_factor), c1c2), c1)
+    // = floormod(floordiv(iter, lower_factor), c1), where c1=rhs
     lhs.CopyOnWrite()->extent = rhs;
     return std::move(lhs);
   } else {
@@ -699,7 +747,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) {
   }
 
   if (a->IsInstance<IterMapExprNode>()) {
-    IterSumExpr ret = Downcast<IterSumExpr>(std::move(a));
+    IterSumExpr ret = Downcast<IterSumExpr>(a);
     if (auto opt = TryFuseIters(ret)) {
       return SplitFloorModConst(opt.value(), b);
     } else {
diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py
index 9fb098831a71..620540cc9841 100644
--- a/tests/python/unittest/test_arith_iter_affine_map.py
+++ b/tests/python/unittest/test_arith_iter_affine_map.py
@@ -78,6 +78,9 @@ def test_fuse():
     x = tvm.tir.Var("x", "int32")
     y = tvm.tir.Var("y", "int32")
     c = tvm.tir.SizeVar("c", "int32")
+    c0 = tvm.tir.SizeVar("c0", "int32")
+    c1 = tvm.tir.SizeVar("c1", "int32")
+    c2 = tvm.tir.SizeVar("c2", "int32")
 
     res = tvm.arith.detect_iter_map([y * 3 + 1 + c + x], var_dom([(x, 3), (y, 4)]))
     assert len(res) == 1
@@ -104,6 +107,16 @@ def test_fuse():
     res = tvm.arith.detect_iter_map([y * 4 + x], var_dom([(x, 3), (y, 4)]))
     assert len(res) == 0
 
+    # simple stride pattern
+    res = tvm.arith.detect_iter_map([x * 4 + y * 2], var_dom([(x, 3), (y, 2)]))
+    assert len(res) == 1
+    assert_iter_sum_pattern(res[0], 6, 0, scale=2)
+
+    # simple stride pattern with symbolic
+    res = tvm.arith.detect_iter_map([x * 2 * c0 + y * 2], var_dom([(x, 3), (y, c0)]))
+    assert len(res) == 1
+    assert_iter_sum_pattern(res[0], 3 * c0, 0, scale=2)
+
 
 def test_split():
     x = tvm.tir.Var("x", "int32")
@@ -137,6 +150,17 @@ def test_split():
     assert_iter_sum_pattern(res[0], c1, 0)
     assert_iter_sum_pattern(res[1], c0, 0)
 
+    res = tvm.arith.detect_iter_map([fld(x * 2, 4), flm(x * 2, 4)], var_dom([(x, 8)]))
+
+    assert len(res) == 2
+    assert_iter_sum_pattern(res[0], 4, 0, scale=1)
+    assert_iter_sum_pattern(res[1], 2, 0, scale=2)
+
+    res = tvm.arith.detect_iter_map([fld(x * 2, 4) * 4 + flm(x * 2, 4)], var_dom([(x, 8)]))
+
+    assert len(res) == 1
+    assert_iter_sum_pattern(res[0], 8, 0, scale=2)
+
 
 def test_compound():
     x = tvm.tir.Var("x", "int32"), 10

From b3f974705ab8b0fe1ffc6d045ced11764ae458a9 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Mon, 26 Oct 2020 11:03:25 -0700
Subject: [PATCH 058/258] [FIX,PYLINT] Fix pylint errors on MacOS with Python
 3.8 (#6746)

These errors do not seem to show up in CI, but they show up locally.
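[Editor's note: a minimal sketch of the ga_tuner.py fix below, for illustration
only. The point is that the pylint suppression comment must sit on the physical
line pylint flags, which after black's reformatting is the subscript inside the
call parentheses; the numbers here are hypothetical.]

```python
# Illustration of the ga_tuner.py change: keep the suppression on the line
# pylint actually flags (the subscript), not on the closing parenthesis.
import numpy as np

dims = [4, 8, 2]
j = np.random.randint(len(dims))
value = np.random.randint(
    dims[j]  # pylint: disable=invalid-sequence-index
)
```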
--- python/tvm/autotvm/tuner/ga_tuner.py | 4 ++-- python/tvm/relay/frontend/onnx.py | 2 +- python/tvm/relay/frontend/pytorch.py | 1 + python/tvm/relay/frontend/qnn_torch.py | 1 + python/tvm/relay/frontend/tensorflow_parser.py | 2 +- python/tvm/relay/testing/darknet.py | 2 +- 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py index 165d5d3dc5c2..58251992ce54 100644 --- a/python/tvm/autotvm/tuner/ga_tuner.py +++ b/python/tvm/autotvm/tuner/ga_tuner.py @@ -129,8 +129,8 @@ def update(self, inputs, results): while knob2point(tmp_gene, self.dims) in self.visited: j = np.random.randint(len(self.dims)) tmp_gene[j] = np.random.randint( - self.dims[j] - ) # pylint: disable=invalid-sequence-index + self.dims[j] # pylint: disable=invalid-sequence-index + ) next_genes.append(tmp_gene) self.visited.add(knob2point(tmp_gene, self.dims)) else: diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 2d21156f8f7b..ccf644e82c57 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -2689,7 +2689,7 @@ def from_onnx(model, shape=None, dtype="float32", opset=None, freeze_params=Fals # try use onnx's own model checker before converting any model try: onnx.checker.check_model(model) - except onnx.onnx_cpp2py_export.checker.ValidationError as e: + except onnx.onnx_cpp2py_export.checker.ValidationError as e: # pylint: disable=c-extension-no-member # the checker is a bit violent about errors, so simply print warnings here warnings.warn(str(e)) except ImportError: diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index c41d6802edd9..52761647d15b 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -2738,6 +2738,7 @@ def _get_convert_map(prelude, default_dtype): def _run_jit_passes(graph): """ The inline pass is necessary to unwrap prim::CallMethod """ + # pylint: disable=c-extension-no-member import torch if is_version_greater_than("1.5.0"): diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py index ca67391cebc7..3f8d495511dd 100644 --- a/python/tvm/relay/frontend/qnn_torch.py +++ b/python/tvm/relay/frontend/qnn_torch.py @@ -285,6 +285,7 @@ def _add_output_quant_params_to_scalar_op(node, graph, input_scale, input_zero_p %7 and %8 are newly created output scale and zp constant nodes """ + # pylint: disable=c-extension-no-member import torch operator = node.kind() diff --git a/python/tvm/relay/frontend/tensorflow_parser.py b/python/tvm/relay/frontend/tensorflow_parser.py index a176e12192c4..3c1d342ac248 100644 --- a/python/tvm/relay/frontend/tensorflow_parser.py +++ b/python/tvm/relay/frontend/tensorflow_parser.py @@ -147,7 +147,7 @@ def _load_saved_model(self): saved_model_tags, ) - with ops.Graph().as_default(): + with ops.Graph().as_default(): # pylint: disable=not-context-manager output_graph_def = graph_pb2.GraphDef() with open(output_graph_filename, "rb") as f: output_graph_def.ParseFromString(f.read()) diff --git a/python/tvm/relay/testing/darknet.py b/python/tvm/relay/testing/darknet.py index a62a91f66c41..c0468b7ef692 100644 --- a/python/tvm/relay/testing/darknet.py +++ b/python/tvm/relay/testing/darknet.py @@ -24,8 +24,8 @@ """ from __future__ import division import numpy as np -from cffi import FFI import cv2 +from cffi import FFI def convert_image(image): From d80b94ed4e6ff313e6f273c821f689b1339824fe Mon Sep 17 00:00:00 2001 From: Tianqi 
Chen
Date: Mon, 26 Oct 2020 15:42:47 -0400
Subject: [PATCH 059/258] [VERSION] Enhance version.py to support git-describe.
 (#6757)

* [VERSION] Enhance version.py to support git-describe.

This PR enhances version.py with a --git-describe option which allows
it to generate a git-describe-based version tag for potential dev-related
nightly packaging during the development cycle.

The behavior of the normal release remains the same. Note that the
version.py still modifies the files in place and we only recommend
using it during a clean clone based workflow.

The setup.py is also updated to take advantage of the generated version.
Note that the git info is already captured by the c++ side in a previous
PR. The tool is mainly used to create PEP compatible python wheels.

* Update per comment
---
 python/setup.py  |  12 ++++
 version.py       | 160 +++++++++++++++++++++++++++++++++++++++++------
 web/package.json |   2 +-
 3 files changed, 155 insertions(+), 19 deletions(-)

diff --git a/python/setup.py b/python/setup.py
index 092800c2314d..24022e4ec7b5 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -60,7 +60,19 @@ def get_lib_path():
     return libs, version
 
 
+def git_describe_version(original_version):
+    """Get git describe version."""
+    ver_py = os.path.join(CURRENT_DIR, "..", "version.py")
+    libver = {"__file__": ver_py}
+    exec(compile(open(ver_py, "rb").read(), ver_py, "exec"), libver, libver)
+    _, gd_version = libver["git_describe_version"]()
+    if gd_version != original_version and "--inplace" not in sys.argv:
+        print("Use git describe based version %s" % gd_version)
+    return gd_version
+
+
 LIB_LIST, __version__ = get_lib_path()
+__version__ = git_describe_version(__version__)
 
 
 def config_cython():
diff --git a/version.py b/version.py
index 0338d13661e0..2678c5ce42d4 100644
--- a/version.py
+++ b/version.py
@@ -22,19 +22,102 @@
 List of affected files:
 - tvm-root/python/tvm/_ffi/libinfo.py
 - tvm-root/include/tvm/runtime/c_runtime_api.h
-- tvm-root/conda/tvm/meta.yaml
-- tvm-root/conda/tvm-libs/meta.yaml
+- tvm-root/conda/recipe/meta.yaml
+- tvm-root/web/package.json
 """
 import os
 import re
+import argparse
+import logging
+import subprocess
 
-# current version
+# Modify the following two settings during release
+# ---------------------------------------------------
+# Current version
 # We use the version of the incoming release for code
 # that is under development
 __version__ = "0.8.dev0"
+# Most recent tag, used for git describe validation
+# set this value to be the most recent release tag
+# before this development cycle.
+__most_recent_tag__ = "v0.7.0"
+# ---------------------------------------------------
+
+
+def py_str(cstr):
+    return cstr.decode("utf-8")
+
+
+def git_describe_version():
+    """Get PEP-440 compatible public and local version using git describe.
+
+    Returns
+    -------
+    pub_ver: str
+        Public version.
+
+    local_ver: str
+        Local version (with additional label appended to pub_ver).
+
+    Note
+    ----
+    We follow PEP 440's convention of public and local versions.
+
+    Here are some examples:
+
+    - pub_ver = '0.7.0', local_ver = '0.7.0':
+      We are at the 0.7.0 release.
+    - pub_ver = '0.8.dev94', local_ver = '0.8.dev94+g0d07a329e':
+      We are at the 0.8 development cycle.
+      The current source contains 94 additional commits
+      after the most recent tag (v0.7.0),
+      the git short hash tag of the current commit is 0d07a329e.
+ """ + cmd = ["git", "describe", "--tags"] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + (out, _) = proc.communicate() + + if proc.returncode != 0: + msg = py_str(out) + if msg.find("not a git repository") != -1: + return __version__, __version__ + logging.warning("git describe error: %", msg) + return __version__, __version__ + describe = py_str(out).strip() + arr_info = describe.split("-") + + if not arr_info[0].endswith(__most_recent_tag__): + logging.warning( + "%s does not match most recent tag %s, fallback to %s", + describe, + __most_recent_tag__, + __version__, + ) + return __version__, __version__ + + # Remove the v prefix, mainly to be robust + # to the case where v is not presented as well. + if arr_info[0].startswith("v"): + arr_info[0] = arr_info[0][1:] + + # hit the exact tag + if len(arr_info) == 1: + return arr_info[0], arr_info[0] + + if len(arr_info) != 3: + logging.warning("Invalid output from git describe %s", describe) + return __version__, __version__ + + dev_pos = __version__.find(".dev") + pub_ver = "%s.dev%s" % (__version__[:dev_pos], arr_info[1]) + local_ver = "%s+%s" % (pub_ver, arr_info[2]) + return pub_ver, local_ver + + # Implementations -def update(file_name, pattern, repl): +def update(file_name, pattern, repl, dry_run=False): update = [] hit_counter = 0 need_update = False @@ -46,7 +129,7 @@ def update(file_name, pattern, repl): if result[0] != repl: l = re.sub(pattern, repl, l) need_update = True - print("%s: %s->%s" % (file_name, result[0], repl)) + print("%s: %s -> %s" % (file_name, result[0], repl)) else: print("%s: version is already %s" % (file_name, repl)) @@ -54,33 +137,74 @@ def update(file_name, pattern, repl): if hit_counter != 1: raise RuntimeError("Cannot find version in %s" % file_name) - if need_update: + if need_update and not dry_run: with open(file_name, "w") as output_file: for l in update: output_file.write(l) -def main(): +def sync_version(pub_ver, local_ver, dry_run): + """Synchronize version.""" proj_root = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - # python path + + # python uses the PEP-440: local version update( os.path.join(proj_root, "python", "tvm", "_ffi", "libinfo.py"), - r"(?<=__version__ = \")[.0-9a-z]+", - __version__, + r"(?<=__version__ = \")[.0-9a-z\+]+", + local_ver, + dry_run, ) + # Use public version for other parts for now + # Note that full git hash is already available in libtvm # C++ header update( os.path.join(proj_root, "include", "tvm", "runtime", "c_runtime_api.h"), - '(?<=TVM_VERSION ")[.0-9a-z]+', - __version__, + r'(?<=TVM_VERSION ")[.0-9a-z\+]+', + pub_ver, + dry_run, ) # conda - for path in ["recipe"]: - update( - os.path.join(proj_root, "conda", path, "meta.yaml"), - "(?<=version = ')[.0-9a-z]+", - __version__, - ) + update( + os.path.join(proj_root, "conda", "recipe", "meta.yaml"), + r"(?<=version = ')[.0-9a-z\+]+", + pub_ver, + dry_run, + ) + # web + # change to pre-release convention by npm + dev_pos = pub_ver.find(".dev") + npm_ver = pub_ver if dev_pos == -1 else "%s.0-%s" % (pub_ver[:dev_pos], pub_ver[dev_pos + 1 :]) + update( + os.path.join(proj_root, "web", "package.json"), + r'(?<="version": ")[.0-9a-z\+]+', + npm_ver, + dry_run, + ) + + +def main(): + logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser(description="Detect and sychnronize version.") + parser.add_argument( + "--print-version", + action="store_true", + help="Print version to the command line. 
No changes is applied to files.", + ) + parser.add_argument( + "--git-describe", + action="store_true", + help="Use git describe to generate development version.", + ) + parser.add_argument("--dry-run", action="store_true") + + opt = parser.parse_args() + pub_ver, local_ver = __version__, __version__ + if opt.git_describe: + pub_ver, local_ver = git_describe_version() + if opt.print_version: + print(local_ver) + else: + sync_version(pub_ver, local_ver, opt.dry_run) if __name__ == "__main__": diff --git a/web/package.json b/web/package.json index 1f52a0781e60..dafccb0a8648 100644 --- a/web/package.json +++ b/web/package.json @@ -2,7 +2,7 @@ "name": "tvmjs", "displayName": "TVM Wasm JS runtime", "license": "Apache-2.0", - "version": "0.7.0", + "version": "0.8.0-dev0", "scripts": { "prepwasm": "make && python3 tests/python/prepare_test_libs.py", "build": "tsc -b && make rmtypedep", From 8991282cd2eec92719c4d82ce6351404d3fa813f Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Mon, 26 Oct 2020 20:42:37 +0000 Subject: [PATCH 060/258] [TVMC] 'tvmc run' --rpc-tracker and --rpc-tracker fail due to argparse misconfiguration (#6762) to be identified as a list of strings, rathat than the expected string type. Co-authored-by: Giuseppe Rossini Co-authored-by: Giuseppe Rossini --- python/tvm/driver/tvmc/runner.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py index d86d4db795dc..a4abe8c31f56 100644 --- a/python/tvm/driver/tvmc/runner.py +++ b/python/tvm/driver/tvmc/runner.py @@ -86,12 +86,10 @@ def add_run_parser(subparsers): ) parser.add_argument( "--rpc-key", - nargs=1, help="the RPC tracker key of the target device", ) parser.add_argument( "--rpc-tracker", - nargs=1, help="hostname (required) and port (optional, defaults to 9090) of the RPC tracker, " "e.g. 
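For context (illustration only, not part of the original patch), a standalone sketch of the argparse behavior the fix above addresses: with nargs=1 the parsed value is a one-element list, while omitting nargs yields the plain string the rest of tvmc expects:

    import argparse

    # Hypothetical parser mirroring the old and new flag definitions.
    parser = argparse.ArgumentParser()
    parser.add_argument("--rpc-key", nargs=1)    # old definition: parses to a list
    parser.add_argument("--rpc-tracker")         # new definition: parses to a string
    args = parser.parse_args(["--rpc-key", "android", "--rpc-tracker", "192.168.0.100:9999"])
    print(args.rpc_key)        # ['android']            -- a list, not a string
    print(args.rpc_tracker)    # '192.168.0.100:9999'   -- a plain string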
'192.168.0.100:9999'", ) From 4b1028ca09deb1658ee0f9cc2f2e7d991ed90f4a Mon Sep 17 00:00:00 2001 From: Robert Kimball Date: Mon, 26 Oct 2020 15:55:43 -0700 Subject: [PATCH 061/258] More CHECK to ICHECK (#6758) * Address apps, docs, and nnvm directories * Catch some that were missed * crt has it's own logging.h * Fix missing include --- apps/cpp_rpc/main.cc | 2 +- apps/cpp_rpc/rpc_env.cc | 2 +- apps/cpp_rpc/rpc_server.cc | 18 +- apps/cpp_rpc/rpc_tracker_client.h | 14 +- apps/extension/src/tvm_ext.cc | 6 +- apps/howto_deploy/cpp_deploy.cc | 6 +- apps/ios_rpc/tvmrpc/TVMRuntime.mm | 2 +- apps/ios_rpc/tvmrpc/ViewController.mm | 4 +- docs/contribute/error_handling.rst | 4 +- docs/dev/convert_layout.rst | 2 +- docs/dev/pass_infra.rst | 6 +- docs/dev/relay_bring_your_own_codegen.rst | 12 +- nnvm/include/nnvm/graph.h | 4 +- nnvm/include/nnvm/layout.h | 40 +-- nnvm/include/nnvm/op.h | 12 +- nnvm/include/nnvm/tuple.h | 4 +- nnvm/src/core/graph.cc | 10 +- nnvm/src/core/op.cc | 2 +- nnvm/src/core/pass.cc | 2 +- nnvm/src/core/symbolic.cc | 22 +- nnvm/src/pass/correct_layout.cc | 12 +- nnvm/src/pass/gradient.cc | 16 +- nnvm/src/pass/graph_algorithm.h | 10 +- nnvm/src/pass/infer_shape_type.cc | 24 +- nnvm/src/pass/place_device.cc | 12 +- nnvm/src/pass/plan_memory.cc | 4 +- nnvm/src/pass/print_graph_ir.cc | 2 +- nnvm/src/pass/saveload_json.cc | 18 +- nnvm/tests/cpp/op_test.cc | 2 +- nnvm/tests/cpp/tuple_test.cc | 8 +- tests/cpp/arith_simplify_test.cc | 8 +- tests/cpp/attrs_test.cc | 14 +- tests/cpp/auto_scheduler_test.cc | 48 ++-- tests/cpp/build_module_test.cc | 22 +- tests/cpp/container_test.cc | 288 +++++++++---------- tests/cpp/expr_test.cc | 6 +- tests/cpp/ir_functor_test.cc | 48 ++-- tests/cpp/object_protocol_test.cc | 36 +-- tests/cpp/packed_func_test.cc | 138 ++++----- tests/cpp/parallel_for_test.cc | 15 +- tests/cpp/pattern_match_test.cc | 114 ++++---- tests/cpp/relay_build_module_test.cc | 24 +- tests/cpp/relay_pass_type_infer_test.cc | 4 +- tests/cpp/relay_transform_sequential_test.cc | 10 +- tests/cpp/target_test.cc | 36 +-- tests/cpp/tir_analysis_side_effect.cc | 10 +- 46 files changed, 552 insertions(+), 551 deletions(-) diff --git a/apps/cpp_rpc/main.cc b/apps/cpp_rpc/main.cc index 9bbbea92f41c..e381dd2b261b 100644 --- a/apps/cpp_rpc/main.cc +++ b/apps/cpp_rpc/main.cc @@ -139,7 +139,7 @@ string GetCmdOption(int argc, char* argv[], string option, bool key = false) { return cmd; } // We assume "=" is the end of option. 
- CHECK_EQ(*option.rbegin(), '='); + ICHECK_EQ(*option.rbegin(), '='); cmd = arg.substr(arg.find('=') + 1); return cmd; } diff --git a/apps/cpp_rpc/rpc_env.cc b/apps/cpp_rpc/rpc_env.cc index 967274fd88a2..5b351725b1f1 100644 --- a/apps/cpp_rpc/rpc_env.cc +++ b/apps/cpp_rpc/rpc_env.cc @@ -116,7 +116,7 @@ RPCEnv::RPCEnv() { std::string bin; std::ifstream fs(file_name, std::ios::in | std::ios::binary); - CHECK(!fs.fail()) << "Cannot open " << file_name; + ICHECK(!fs.fail()) << "Cannot open " << file_name; fs.seekg(0, std::ios::end); size_t size = static_cast(fs.tellg()); fs.seekg(0, std::ios::beg); diff --git a/apps/cpp_rpc/rpc_server.cc b/apps/cpp_rpc/rpc_server.cc index 592a6db6d2ef..16939456451b 100644 --- a/apps/cpp_rpc/rpc_server.cc +++ b/apps/cpp_rpc/rpc_server.cc @@ -245,7 +245,7 @@ class RPCServer { support::TCPSocket conn = listen_sock_.Accept(addr); int code = kRPCMagic; - CHECK_EQ(conn.RecvAll(&code, sizeof(code)), sizeof(code)); + ICHECK_EQ(conn.RecvAll(&code, sizeof(code)), sizeof(code)); if (code != kRPCMagic) { conn.Close(); LOG(FATAL) << "Client connected is not TVM RPC server"; @@ -253,7 +253,7 @@ class RPCServer { } int keylen = 0; - CHECK_EQ(conn.RecvAll(&keylen, sizeof(keylen)), sizeof(keylen)); + ICHECK_EQ(conn.RecvAll(&keylen, sizeof(keylen)), sizeof(keylen)); const char* CLIENT_HEADER = "client:"; const char* SERVER_HEADER = "server:"; @@ -265,10 +265,10 @@ class RPCServer { continue; } - CHECK_NE(keylen, 0); + ICHECK_NE(keylen, 0); std::string remote_key; remote_key.resize(keylen); - CHECK_EQ(conn.RecvAll(&remote_key[0], keylen), keylen); + ICHECK_EQ(conn.RecvAll(&remote_key[0], keylen), keylen); std::stringstream ssin(remote_key); std::string arg0; @@ -280,16 +280,16 @@ class RPCServer { if (arg0 != expect_header) { code = kRPCMismatch; - CHECK_EQ(conn.SendAll(&code, sizeof(code)), sizeof(code)); + ICHECK_EQ(conn.SendAll(&code, sizeof(code)), sizeof(code)); conn.Close(); LOG(WARNING) << "Mismatch key from" << addr->AsString(); continue; } else { code = kRPCSuccess; - CHECK_EQ(conn.SendAll(&code, sizeof(code)), sizeof(code)); + ICHECK_EQ(conn.SendAll(&code, sizeof(code)), sizeof(code)); keylen = int(server_key.length()); - CHECK_EQ(conn.SendAll(&keylen, sizeof(keylen)), sizeof(keylen)); - CHECK_EQ(conn.SendAll(server_key.c_str(), keylen), keylen); + ICHECK_EQ(conn.SendAll(&keylen, sizeof(keylen)), sizeof(keylen)); + ICHECK_EQ(conn.SendAll(server_key.c_str(), keylen), keylen); LOG(INFO) << "Connection success " << addr->AsString(); #ifndef __ANDROID__ ssin >> *opts; @@ -325,7 +325,7 @@ class RPCServer { size_t pos = opts.rfind(option); if (pos != std::string::npos) { const std::string cmd = opts.substr(pos + option.size()); - CHECK(support::IsNumber(cmd)) << "Timeout is not valid"; + ICHECK(support::IsNumber(cmd)) << "Timeout is not valid"; return std::stoi(cmd); } return 0; diff --git a/apps/cpp_rpc/rpc_tracker_client.h b/apps/cpp_rpc/rpc_tracker_client.h index cdfb64780ba6..1497ab3251be 100644 --- a/apps/cpp_rpc/rpc_tracker_client.h +++ b/apps/cpp_rpc/rpc_tracker_client.h @@ -74,9 +74,9 @@ class TrackerClient { tracker_sock_ = ConnectWithRetry(); int code = kRPCTrackerMagic; - CHECK_EQ(tracker_sock_.SendAll(&code, sizeof(code)), sizeof(code)); - CHECK_EQ(tracker_sock_.RecvAll(&code, sizeof(code)), sizeof(code)); - CHECK_EQ(code, kRPCTrackerMagic) << tracker_addr_.c_str() << " is not RPC Tracker"; + ICHECK_EQ(tracker_sock_.SendAll(&code, sizeof(code)), sizeof(code)); + ICHECK_EQ(tracker_sock_.RecvAll(&code, sizeof(code)), sizeof(code)); + ICHECK_EQ(code, 
kRPCTrackerMagic) << tracker_addr_.c_str() << " is not RPC Tracker"; std::ostringstream ss; ss << "[" << static_cast(TrackerCode::kUpdateInfo) << ", {\"key\": \"server:" << key_ @@ -85,7 +85,7 @@ class TrackerClient { // Receive status and validate std::string remote_status = tracker_sock_.RecvBytes(); - CHECK_EQ(std::stoi(remote_status), static_cast(TrackerCode::kSuccess)); + ICHECK_EQ(std::stoi(remote_status), static_cast(TrackerCode::kSuccess)); } } /*! @@ -117,7 +117,7 @@ class TrackerClient { // Receive status and validate std::string remote_status = tracker_sock_.RecvBytes(); - CHECK_EQ(std::stoi(remote_status), static_cast(TrackerCode::kSuccess)); + ICHECK_EQ(std::stoi(remote_status), static_cast(TrackerCode::kSuccess)); } else { *matchkey = key_; } @@ -167,7 +167,7 @@ class TrackerClient { tracker_sock_.SendBytes(ss.str()); std::string remote_status = tracker_sock_.RecvBytes(); - CHECK_EQ(std::stoi(remote_status), static_cast(TrackerCode::kSuccess)); + ICHECK_EQ(std::stoi(remote_status), static_cast(TrackerCode::kSuccess)); unmatch_period_count = 0; } continue; @@ -199,7 +199,7 @@ class TrackerClient { auto period = (std::chrono::duration_cast( std::chrono::system_clock::now() - tbegin)) .count(); - CHECK(period < timeout) << "Failed to connect to server" << addr.AsString(); + ICHECK(period < timeout) << "Failed to connect to server" << addr.AsString(); LOG(WARNING) << "Cannot connect to tracker " << addr.AsString() << " retry in " << retry_period << " seconds."; std::this_thread::sleep_for(std::chrono::seconds(retry_period)); diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc index 87cb69b4f4ce..be431bab68d1 100644 --- a/apps/extension/src/tvm_ext.cc +++ b/apps/extension/src/tvm_ext.cc @@ -75,12 +75,12 @@ class NDSubClass : public tvm::runtime::NDArray { NDSubClass AddWith(const NDSubClass& other) const { SubContainer* a = static_cast(get_mutable()); SubContainer* b = static_cast(other.get_mutable()); - CHECK(a != nullptr && b != nullptr); + ICHECK(a != nullptr && b != nullptr); return NDSubClass(a->additional_info_ + b->additional_info_); } int get_additional_info() const { SubContainer* self = static_cast(get_mutable()); - CHECK(self != nullptr); + ICHECK(self != nullptr); return self->additional_info_; } using ContainerType = SubContainer; @@ -146,7 +146,7 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev").set_body([](TVMArgs args, TVMRetValue* TVM_REGISTER_GLOBAL("tvm_ext.nd_create").set_body([](TVMArgs args, TVMRetValue* rv) { int additional_info = args[0]; *rv = NDSubClass(additional_info); - CHECK_EQ(rv->type_code(), kTVMNDArrayHandle); + ICHECK_EQ(rv->type_code(), kTVMNDArrayHandle); }); TVM_REGISTER_GLOBAL("tvm_ext.nd_add_two").set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc index fdb55a51480a..829241d31a6d 100644 --- a/apps/howto_deploy/cpp_deploy.cc +++ b/apps/howto_deploy/cpp_deploy.cc @@ -31,7 +31,7 @@ void Verify(tvm::runtime::Module mod, std::string fname) { // Get the function from the module. tvm::runtime::PackedFunc f = mod.GetFunction(fname); - CHECK(f != nullptr); + ICHECK(f != nullptr); // Allocate the DLPack data structures. // // Note that we use TVM runtime API to allocate the DLTensor in this example. 
@@ -64,7 +64,7 @@ void Verify(tvm::runtime::Module mod, std::string fname) { f(x, y); // Print out the output for (int i = 0; i < shape[0]; ++i) { - CHECK_EQ(static_cast(y->data)[i], i + 1.0f); + ICHECK_EQ(static_cast(y->data)[i], i + 1.0f); } LOG(INFO) << "Finish verification..."; TVMArrayFree(x); @@ -112,7 +112,7 @@ void DeployGraphRuntime() { for (int i = 0; i < 2; ++i) { for (int j = 0; j < 2; ++j) { - CHECK_EQ(static_cast(y->data)[i * 2 + j], i * 2 + j + 1); + ICHECK_EQ(static_cast(y->data)[i * 2 + j], i * 2 + j + 1); } } } diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm index 61a4668cdd91..fbe4850e1b57 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm @@ -118,7 +118,7 @@ void LaunchSyncServer() { std::ifstream fs(name, std::ios::in); std::string url, key; int port; - CHECK(fs >> url >> port >> key) << "Invalid RPC config file " << name; + ICHECK(fs >> url >> port >> key) << "Invalid RPC config file " << name; RPCConnect(url, port, "server:" + key, TVMArgs(nullptr, nullptr, 0))->ServerLoop(); } diff --git a/apps/ios_rpc/tvmrpc/ViewController.mm b/apps/ios_rpc/tvmrpc/ViewController.mm index 6c618c48096f..910c650aedc1 100644 --- a/apps/ios_rpc/tvmrpc/ViewController.mm +++ b/apps/ios_rpc/tvmrpc/ViewController.mm @@ -80,7 +80,7 @@ - (void)onReadAvailable { } else { initialized_ = true; self.statusLabel.text = @"Proxy connected."; - CHECK(handler_ != nullptr); + ICHECK(handler_ != nullptr); } } const int kBufferSize = 4 << 10; @@ -158,7 +158,7 @@ - (void)open { [outputStream_ open]; [inputStream_ open]; handler_ = tvm::runtime::CreateServerEventHandler(outputStream_, key_, "%toinit"); - CHECK(handler_ != nullptr); + ICHECK(handler_ != nullptr); self.infoText.text = @""; self.statusLabel.text = @"Connecting..."; } diff --git a/docs/contribute/error_handling.rst b/docs/contribute/error_handling.rst index 8f71ee61aeb6..d31b401ea654 100644 --- a/docs/contribute/error_handling.rst +++ b/docs/contribute/error_handling.rst @@ -37,14 +37,14 @@ raise an error of the corresponding type. Note that you do not have to add a new type :py:class:`tvm.error.TVMError` will be raised by default when there is no error type prefix in the message. -This mechanism works for both ``LOG(FATAL)`` and ``CHECK`` macros. +This mechanism works for both ``LOG(FATAL)`` and ``ICHECK`` macros. The following code gives an example on how to do so. .. code:: c // src/api_test.cc void ErrorTest(int x, int y) { - CHECK_EQ(x, y) << "ValueError: expect x and y to be equal." + ICHECK_EQ(x, y) << "ValueError: expect x and y to be equal." if (x == 1) { LOG(FATAL) << "InternalError: cannot reach here"; } diff --git a/docs/dev/convert_layout.rst b/docs/dev/convert_layout.rst index 07ebc2048dd3..6c9890f69d85 100644 --- a/docs/dev/convert_layout.rst +++ b/docs/dev/convert_layout.rst @@ -157,7 +157,7 @@ First example is for layout agnostic operators. These operators do not have any Layout ret; if (new_in_layouts.defined()) { - CHECK_GE(new_in_layouts.size(), 1); + ICHECK_GE(new_in_layouts.size(), 1); ret = new_in_layouts[0]; } else { for (size_t i = 0; i < old_in_layouts.size(); ++i) { diff --git a/docs/dev/pass_infra.rst b/docs/dev/pass_infra.rst index 1427608a4574..898e51793a44 100644 --- a/docs/dev/pass_infra.rst +++ b/docs/dev/pass_infra.rst @@ -276,12 +276,12 @@ order that they were appended to the pass list. 
const PassContext& pass_ctx) const { Module mod = module; for (const Pass& pass : passes) { - CHECK(pass.defined()) << "Found undefined pass for optimization."; + ICHECK(pass.defined()) << "Found undefined pass for optimization."; const PassInfo& pass_info = pass->Info(); if (!PassEnabled(pass_info)) continue; for (const auto& it : pass_info->required) { const auto* name = it.as(); - CHECK(name); + ICHECK(name); mod = GetPass(name->value)(mod, pass_ctx); } mod = pass(mod, pass_ctx); @@ -306,7 +306,7 @@ pass is registered with an API endpoint as we will show later. using tvm::runtime::Registry; std::string fpass_name = "relay._transform." + pass_name; const auto* f = Registry::Get(fpass_name); - CHECK(f != nullptr) << "Cannot find " << fpass_name + ICHECK(f != nullptr) << "Cannot find " << fpass_name << "to create the pass " << pass_name; return (*f)(); } diff --git a/docs/dev/relay_bring_your_own_codegen.rst b/docs/dev/relay_bring_your_own_codegen.rst index f4ee58a6902b..a4d4ebd60b88 100644 --- a/docs/dev/relay_bring_your_own_codegen.rst +++ b/docs/dev/relay_bring_your_own_codegen.rst @@ -296,7 +296,7 @@ As mentioned in the previous step, in addition to the subgraph input and output // This example only supports single output. auto type_node = call->checked_type().as(); - CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) + ICHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) << "Only support single output tensor with float type"; // Generate a unique buffer name. @@ -410,7 +410,7 @@ Implement GenCFunc .. code-block:: c++ void GenCFunc(const Function& func) { - CHECK(func.defined()) << "Input error: expect a Relay function."; + ICHECK(func.defined()) << "Input error: expect a Relay function."; // Record the external symbol for runtime lookup. auto sid = GetExtSymbol(func); @@ -474,7 +474,7 @@ This function creates a runtime module for the external library. In this example // Create a CSourceModule const auto* pf = runtime::Registry::Get("module.csource_module_create"); - CHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module"; + ICHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module"; return (*pf)(code_stream_.str(), "cc"); } @@ -556,7 +556,7 @@ In this section, our goal is to implement the following customized TVM runtime m ExampleJsonCodeGen codegen(ref); std::string code = codegen.gen(); // Note 1 const auto* pf = runtime::Registry::Get("module.examplejson_module_create"); // Note 2 - CHECK(pf != nullptr) << "Cannot find ExampleJson module to create the external runtime module"; + ICHECK(pf != nullptr) << "Cannot find ExampleJson module to create the external runtime module"; return (*pf)(code); } TVM_REGISTER_GLOBAL("relay.ext.examplejsoncompiler").set_body_typed(ExampleJsonCompiler); @@ -785,7 +785,7 @@ After the construction, we should have the above class variables ready. We then // Copy input tensors to corresponding data entries. for (auto i = 0; i < args.size(); ++i) { - CHECK(args[i].type_code() == kNDArrayContainer || args[i].type_code() == kArrayHandle) + ICHECK(args[i].type_code() == kNDArrayContainer || args[i].type_code() == kArrayHandle) << "Expect NDArray or DLTensor as inputs\n"; if (args[i].type_code() == kArrayHandle) { DLTensor* arg = args[i]; @@ -800,7 +800,7 @@ After the construction, we should have the above class variables ready. 
We then for (const auto& it : this->graph_[this->curr_subgraph_]) { this->Run(it.id, it.inputs, it.output); } - CHECK_GT(graph_.count(this->curr_subgraph_), 0U); + ICHECK_GT(graph_.count(this->curr_subgraph_), 0U); // Copy the output from a data entry back to TVM runtime argument. auto out_idx = graph_[this->curr_subgraph_].back().output; diff --git a/nnvm/include/nnvm/graph.h b/nnvm/include/nnvm/graph.h index 475494e62c4d..6f624b758fa9 100644 --- a/nnvm/include/nnvm/graph.h +++ b/nnvm/include/nnvm/graph.h @@ -229,7 +229,7 @@ inline void DFSVisit(const std::vector& heads, FVisit fvisit); template inline const T& Graph::GetAttr(const std::string& attr_name) const { auto it = attrs.find(attr_name); - CHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph"; + ICHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph"; return nnvm::unsafe_get(*it->second); } @@ -241,7 +241,7 @@ inline bool Graph::HasAttr(const std::string& attr_name) const { template inline T Graph::MoveCopyAttr(const std::string& attr_name) { auto it = attrs.find(attr_name); - CHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph"; + ICHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph"; std::shared_ptr sptr = it->second; attrs.erase(it); if (sptr.unique()) { diff --git a/nnvm/include/nnvm/layout.h b/nnvm/include/nnvm/layout.h index e2e99784c99e..6c46f9de9e0f 100644 --- a/nnvm/include/nnvm/layout.h +++ b/nnvm/include/nnvm/layout.h @@ -220,7 +220,7 @@ class Layout { for (size_t i = pos; i < pos + len; ++i) { if (is_subdim(layout_simplified_[i])) { auto block_size = this->subsizeof(layout_simplified_[i]); - CHECK_GT(block_size, 0); + ICHECK_GT(block_size, 0); new_layout << block_size; } new_layout << layout_simplified_[i]; @@ -235,7 +235,7 @@ class Layout { for (int64_t i = this->ndim() - 1; i >= 0; --i) { if (is_subdim(layout_simplified_[i])) { auto block_size = this->subsizeof(layout_simplified_[i]); - CHECK_GT(block_size, 0); + ICHECK_GT(block_size, 0); new_layout << block_size; } new_layout << layout_simplified_[i]; @@ -251,13 +251,13 @@ class Layout { * \return A newly constructed Layout object. */ inline Layout split(LayoutDim dim, size_t target_pos, uint32_t size) const { - CHECK(target_pos <= this->ndim()) + ICHECK(target_pos <= this->ndim()) << "Invalid split position " << target_pos << " for layout " << name_; - CHECK(is_superdim(dim)) << "Cannot split a sub-dimension " << dim; - CHECK(this->contains(dim)) << "Axis " << dim << " does not exist in " << name_; - CHECK(!this->contains(to_subdim(dim))) + ICHECK(is_superdim(dim)) << "Cannot split a sub-dimension " << dim; + ICHECK(this->contains(dim)) << "Axis " << dim << " does not exist in " << name_; + ICHECK(!this->contains(to_subdim(dim))) << "Dimension " << dim << " has already been split in " << name_; - CHECK(size > 0) << "Invalid split size " << size; + ICHECK(size > 0) << "Invalid split size " << size; std::ostringstream new_layout; for (size_t i = 0; i <= this->ndim(); ++i) { if (i == target_pos) { @@ -293,11 +293,11 @@ class Layout { * \return the description of the dimension. 
*/ inline std::string at(size_t i) const { - CHECK_LT(i, this->ndim()) << "position " << i << " exceeds ndim=" << this->ndim(); + ICHECK_LT(i, this->ndim()) << "position " << i << " exceeds ndim=" << this->ndim(); std::ostringstream repr; if (is_subdim(layout_simplified_[i])) { auto factor = subsizeof(layout_simplified_[i]); - CHECK_GT(factor, 0); + ICHECK_GT(factor, 0); repr << factor; } repr << layout_simplified_[i]; @@ -328,7 +328,7 @@ class Layout { * Return -1 if \p dim is not in the layout or the layout is undefined. */ inline int64_t subsizeof(LayoutDim dim) const { - CHECK(is_superdim(dim) || is_subdim(dim)) << "Invalid dim " << dim; + ICHECK(is_superdim(dim) || is_subdim(dim)) << "Invalid dim " << dim; if (!this->defined() || !this->contains(to_subdim(dim))) { return -1; } @@ -409,34 +409,34 @@ class Layout { const LayoutDim c = layout.at(i); if (is_superdim(c)) { int pos = c - 'A'; - CHECK_EQ(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor - << " before dimension " << c; - CHECK_EQ(superdim_pos_[pos], -1) + ICHECK_EQ(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor + << " before dimension " << c; + ICHECK_EQ(superdim_pos_[pos], -1) << "Invalid layout " << layout << ": duplicate dimension " << c; superdim_pos_[pos] = curr++; layout_simplified_.push_back(c); } else if (is_subdim(c)) { int pos = c - 'a'; - CHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor - << " for dimension " << c; - CHECK_EQ(subdim_pos_[pos], -1) + ICHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor + << " for dimension " << c; + ICHECK_EQ(subdim_pos_[pos], -1) << "Invalid layout " << layout << ": duplicate dimension " << c; - CHECK_EQ(subdim_size_[pos], -1) + ICHECK_EQ(subdim_size_[pos], -1) << "Invalid layout " << layout << ": duplicate dimension " << c; subdim_pos_[pos] = curr++; subdim_size_[pos] = factor; layout_simplified_.push_back(c); factor = 0; } else if (c >= '0' && c <= '9') { - CHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number."; + ICHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number."; factor = factor * 10 + c - '0'; } else { LOG(FATAL) << "Invalid layout " << layout; } } - CHECK(!layout_simplified_.empty()) << "Invalid layout " << layout; + ICHECK(!layout_simplified_.empty()) << "Invalid layout " << layout; for (LayoutDim dim : layout_simplified_) { - CHECK(is_superdim(dim) || superdim_pos_[dim - 'a'] >= 0) + ICHECK(is_superdim(dim) || superdim_pos_[dim - 'a'] >= 0) << "Invalid layout " << layout << ": missing axis " << static_cast(dim - 'a' + 'A'); } } diff --git a/nnvm/include/nnvm/op.h b/nnvm/include/nnvm/op.h index d5794d88f705..272f70610715 100644 --- a/nnvm/include/nnvm/op.h +++ b/nnvm/include/nnvm/op.h @@ -467,7 +467,7 @@ inline const OpMap& Op::GetAttr(const std::string& key) { template inline Op& Op::set_attr( // NOLINT(*) const std::string& attr_name, const ValueType& value, int plevel) { - CHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0"; + ICHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0"; // update the attribute map of the key by creating new empty if needed. UpdateAttrMap(attr_name, [this, attr_name, value, plevel](any* pmap) { // the callback is in lockscope so is threadsafe. 
@@ -476,7 +476,7 @@ inline Op& Op::set_attr( // NOLINT(*) pm.attr_name_ = attr_name; *pmap = std::move(pm); } - CHECK(pmap->type() == typeid(OpMap)) + ICHECK(pmap->type() == typeid(OpMap)) << "Attribute " << attr_name << " of operator " << this->name << " is registered as inconsistent types" << " previously " << pmap->type().name() << " current " << typeid(OpMap).name(); @@ -486,8 +486,8 @@ inline Op& Op::set_attr( // NOLINT(*) vec.resize(index_ + 1, std::make_pair(ValueType(), 0)); } std::pair& p = vec[index_]; - CHECK(p.second != plevel) << "Attribute " << attr_name << " of operator " << this->name - << " is already registered with same plevel=" << plevel; + ICHECK(p.second != plevel) << "Attribute " << attr_name << " of operator " << this->name + << " is already registered with same plevel=" << plevel; if (p.second < plevel) { vec[index_] = std::make_pair(value, plevel); } @@ -562,9 +562,9 @@ inline bool OpMap::contains(const Op* op) const { template inline const ValueType& OpMap::operator[](const Op* op) const { - CHECK(op != nullptr); + ICHECK(op != nullptr); const uint32_t idx = op->index_; - CHECK(idx < data_.size() && data_[idx].second) + ICHECK(idx < data_.size() && data_[idx].second) << "Attribute " << attr_name_ << " has not been registered for Operator " << op->name; return data_[idx].first; } diff --git a/nnvm/include/nnvm/tuple.h b/nnvm/include/nnvm/tuple.h index c6d6125aa194..af800e77dd07 100644 --- a/nnvm/include/nnvm/tuple.h +++ b/nnvm/include/nnvm/tuple.h @@ -435,7 +435,7 @@ class TShape : public Tuple { */ template inline mshadow::Shape get() const { - CHECK_EQ(dim, static_cast(ndim())) + ICHECK_EQ(dim, static_cast(ndim())) << "dimension do not match target dimension " << dim << " vs " << ndim(); const dim_t* d = this->data(); mshadow::Shape s; @@ -467,7 +467,7 @@ class TShape : public Tuple { * \return the flat 3d shape */ inline mshadow::Shape<3> FlatTo3D(size_t axis_begin, size_t axis_end) const { - CHECK(axis_end >= axis_begin); + ICHECK(axis_end >= axis_begin); mshadow::Shape<3> s; if (ndim() == 0) return mshadow::Shape3(0, 0, 0); const dim_t* d = this->data(); diff --git a/nnvm/src/core/graph.cc b/nnvm/src/core/graph.cc index e5042802906c..81dc9bc35992 100644 --- a/nnvm/src/core/graph.cc +++ b/nnvm/src/core/graph.cc @@ -54,7 +54,7 @@ static void SubgraphSanityCheck(const std::vector>& subg nnvm::Node* node = n.get(); // if the node is visited, but on a different level, then check failed // if check failed here or before, we stop doing anything, but raise an error - CHECK(!node2level.count(node) || node2level[node] == level) + ICHECK(!node2level.count(node) || node2level[node] == level) << "A subgraph should not depend on the outputs of nodes on higher levels"; // otherwise, this node belongs to the current level node2level[node] = level; @@ -76,9 +76,9 @@ IndexedGraph::IndexedGraph(const Graph& g) { DFSVisit(g.outputs, [this, &inputs_rptr, &control_rptr, &subgraphs](const ObjectPtr& n) { const auto& is_ghost = Op::GetAttr("TIsGhost"); if (!n->is_variable() && is_ghost.get(n->op(), false)) return; - CHECK_LT(nodes_.size(), std::numeric_limits::max()); + ICHECK_LT(nodes_.size(), std::numeric_limits::max()); uint32_t nid = static_cast(nodes_.size()); - CHECK(n); + ICHECK(n); for (const auto& subgraph : n->attrs.subgraphs) subgraphs.push_back(subgraph); // nodes_ IndexedGraph::Node new_node; @@ -96,7 +96,7 @@ IndexedGraph::IndexedGraph(const Graph& g) { // input entries for (const auto& e : n->inputs) { auto it = node2index_.find(e.node.get()); - CHECK(it != 
node2index_.end() && it->first == e.node.get()); + ICHECK(it != node2index_.end() && it->first == e.node.get()); input_entries_.emplace_back(NodeEntry{it->second, e.index, e.version}); } inputs_rptr.push_back(input_entries_.size()); @@ -104,7 +104,7 @@ IndexedGraph::IndexedGraph(const Graph& g) { for (const auto& nptr : n->control_deps) { if (!nptr->is_variable() && is_ghost.get(nptr->op(), false)) continue; auto it = node2index_.find(nptr.get()); - CHECK(it != node2index_.end()) << "control dep not found in graph"; + ICHECK(it != node2index_.end()) << "control dep not found in graph"; control_deps_.push_back(it->second); } control_rptr.push_back(control_deps_.size()); diff --git a/nnvm/src/core/op.cc b/nnvm/src/core/op.cc index 08a11dff9a02..7f5d1999780d 100644 --- a/nnvm/src/core/op.cc +++ b/nnvm/src/core/op.cc @@ -70,7 +70,7 @@ Op& Op::add_alias(const std::string& alias) { // NOLINT(*) // find operator by name const Op* Op::Get(const std::string& name) { const Op* op = dmlc::Registry::Find(name); - CHECK(op != nullptr) << "Operator " << name << " is not registered"; + ICHECK(op != nullptr) << "Operator " << name << " is not registered"; return op; } diff --git a/nnvm/src/core/pass.cc b/nnvm/src/core/pass.cc index 974cd2b35918..9966d3d42300 100644 --- a/nnvm/src/core/pass.cc +++ b/nnvm/src/core/pass.cc @@ -45,7 +45,7 @@ Graph ApplyPasses(Graph g, const std::vector& pass) { std::vector fpass; for (auto& name : pass) { auto* reg = dmlc::Registry::Find(name); - CHECK(reg != nullptr) << "Cannot find pass " << name << " in the registry"; + ICHECK(reg != nullptr) << "Cannot find pass " << name << " in the registry"; fpass.push_back(reg); } diff --git a/nnvm/src/core/symbolic.cc b/nnvm/src/core/symbolic.cc index 12b8675d0bd7..18d31dd3a937 100644 --- a/nnvm/src/core/symbolic.cc +++ b/nnvm/src/core/symbolic.cc @@ -58,7 +58,7 @@ inline void UpdateNodeVersion(Node* n) { if (fmutate_inputs.count(n->op()) != 0) { for (uint32_t i : fmutate_inputs[n->op()](n->attrs)) { NodeEntry& e = n->inputs[i]; - CHECK(e.node->is_variable()) << "Mutation target can only be Variable"; + ICHECK(e.node->is_variable()) << "Mutation target can only be Variable"; // increase the version of the variable. e.version = ++nnvm::get(e.node->attrs.parsed).version; } @@ -186,7 +186,7 @@ void Symbol::Print(std::ostream& os) const { Symbol Symbol::operator[](size_t index) const { size_t nreturn = outputs.size(); - CHECK_LT(index, nreturn) << "Symbol only accept nonnegative index"; + ICHECK_LT(index, nreturn) << "Symbol only accept nonnegative index"; if (nreturn == 1) { return *this; } else { @@ -298,13 +298,13 @@ void Symbol::Compose(const array_view& args, for (size_t i = 0; i < args.size(); ++i) { // If the argument isn't a graph, it should have only one output. 
if (garg_idx.empty() || std::find(garg_idx.begin(), garg_idx.end(), i) == garg_idx.end()) - CHECK_EQ(args[i]->outputs.size(), 1U) + ICHECK_EQ(args[i]->outputs.size(), 1U) << "Argument " << i << " is a tuple, single value is required"; } for (const auto& kv : kwargs) { if (garg_names.empty() || std::find(garg_names.begin(), garg_names.end(), kv.first) == garg_names.end()) - CHECK_EQ(kv.second->outputs.size(), 1U) + ICHECK_EQ(kv.second->outputs.size(), 1U) << "Keyword Argument " << kv.first << " is a tuple, single value is required"; } // assign new name @@ -325,7 +325,7 @@ void Symbol::Compose(const array_view& args, sym = arg_vec[idx]; } else { auto it = kwarg_map.find(arg_names[idx]); - CHECK(it != kwarg_map.end()); + ICHECK(it != kwarg_map.end()); sym = it->second; kwarg_map.erase(it); } @@ -346,7 +346,7 @@ void Symbol::Compose(const array_view& args, if (n_req != kVarg) { n->inputs.resize(n_req); - CHECK_LE(arg_vec.size(), n_req) + ICHECK_LE(arg_vec.size(), n_req) << "Incorrect number of arguments, requires " << n_req << ", provided " << arg_vec.size(); for (size_t i = 0; i < arg_vec.size(); ++i) { n->inputs[i] = arg_vec[i]->outputs[0]; @@ -378,7 +378,7 @@ void Symbol::Compose(const array_view& args, } } } else { - CHECK_EQ(kwarg_map.size(), 0U) << "Variable length function do not accept kwargs"; + ICHECK_EQ(kwarg_map.size(), 0U) << "Variable length function do not accept kwargs"; n->inputs.reserve(arg_vec.size()); for (const Symbol* s : arg_vec) { n->inputs.push_back(s->outputs[0]); @@ -396,7 +396,7 @@ void Symbol::Compose(const array_view& args, } } else { // general composition - CHECK_EQ(args.size(), 0U) << "General composition only support kwargs for now"; + ICHECK_EQ(args.size(), 0U) << "General composition only support kwargs for now"; size_t nmatched = 0; size_t arg_counter = 0; std::unordered_map replace_map; @@ -456,7 +456,7 @@ void Symbol::Compose(const array_view& args, // update outputs in case the composed variable is part of outputs. for (size_t i = 0; i < outputs.size(); ++i) { if (outputs[i].node->is_variable()) { - CHECK_EQ(args.size(), 0) << "Variable composition only supports keyword arguments"; + ICHECK_EQ(args.size(), 0) << "Variable composition only supports keyword arguments"; const auto it = kwargs.find(outputs[i].node->attrs.name); if (it != kwargs.end()) outputs[i] = it->second->outputs[0]; } @@ -473,7 +473,7 @@ Symbol Symbol::operator()(const array_view& args, } void Symbol::AddControlDeps(const Symbol& src) { - CHECK_EQ(outputs.size(), 1U) << "AddControlDeps only works for nongrouped symbol"; + ICHECK_EQ(outputs.size(), 1U) << "AddControlDeps only works for nongrouped symbol"; Node* n = outputs[0].node.get(); for (const NodeEntry& sp : src.outputs) { n->control_deps.push_back(sp.node); @@ -517,7 +517,7 @@ Symbol Symbol::GetChildren() const { void Symbol::SetAttrs(const std::vector >& attrs) { Node* node = outputs[0].node.get(); for (const NodeEntry& e : outputs) { - CHECK(node == e.node.get()) << "Symbol.SetAttrs only works for non-grouped symbol"; + ICHECK(node == e.node.get()) << "Symbol.SetAttrs only works for non-grouped symbol"; } for (const auto& kv : attrs) { if (kv.first == "name") { diff --git a/nnvm/src/pass/correct_layout.cc b/nnvm/src/pass/correct_layout.cc index b9024a56d143..3a8cc16511ff 100644 --- a/nnvm/src/pass/correct_layout.cc +++ b/nnvm/src/pass/correct_layout.cc @@ -64,7 +64,7 @@ nnvm::Graph CorrectLayout(nnvm::Graph src) { if (new_node->is_variable()) { // Variable node. No operator. Only one output entry. 
auto input_iter = std::find(idx.input_nodes().cbegin(), idx.input_nodes().cend(), nid); - CHECK(input_iter != idx.input_nodes().cend()); + ICHECK(input_iter != idx.input_nodes().cend()); int64_t input_id = std::distance(idx.input_nodes().cbegin(), input_iter); if (src.HasAttr("layout_inputs")) { new_layouts[new_node.get()] = { @@ -83,11 +83,11 @@ nnvm::Graph CorrectLayout(nnvm::Graph src) { for (size_t i = 0; i < num_inputs; ++i) { const IndexedGraph::NodeEntry& input_entry = inode.inputs[i]; const ObjectPtr& new_input_node = mirror_vec[input_entry.node_id]; - CHECK(new_input_node != nullptr); + ICHECK(new_input_node != nullptr); // fill inputs by previous node (DFS order) inferred layouts. const auto& layouts_iter = new_layouts.find(new_input_node.get()); - CHECK(layouts_iter != new_layouts.end()); + ICHECK(layouts_iter != new_layouts.end()); request_ilayouts[i] = layouts_iter->second[input_entry.index]; } // layouts produced by previous node. @@ -108,10 +108,10 @@ nnvm::Graph CorrectLayout(nnvm::Graph src) { if (op_correct_layout.count(new_node->op())) { const auto& flayout = op_correct_layout[new_node->op()]; - CHECK(flayout(new_node->attrs, &request_ilayouts, &last_request_ilayouts, &produce_olayouts)) + ICHECK(flayout(new_node->attrs, &request_ilayouts, &last_request_ilayouts, &produce_olayouts)) << "Layout infer fail"; - CHECK_EQ(request_ilayouts.size(), num_inputs); - CHECK_EQ(produce_olayouts.size(), num_outputs); + ICHECK_EQ(request_ilayouts.size(), num_inputs); + ICHECK_EQ(produce_olayouts.size(), num_outputs); } // update new layouts diff --git a/nnvm/src/pass/gradient.cc b/nnvm/src/pass/gradient.cc index 1df3af7ffaaf..902a968b102d 100644 --- a/nnvm/src/pass/gradient.cc +++ b/nnvm/src/pass/gradient.cc @@ -85,10 +85,10 @@ Graph Gradient(Graph src) { using MirrorFun = std::function; using AttrHintFun = std::function; - CHECK_NE(src.attrs.count("grad_ys"), 0U) << "Gradient require grad_ys to be presented."; - CHECK_NE(src.attrs.count("grad_ys_out_grad"), 0U) + ICHECK_NE(src.attrs.count("grad_ys"), 0U) << "Gradient require grad_ys to be presented."; + ICHECK_NE(src.attrs.count("grad_ys_out_grad"), 0U) << "Gradient require grad_ys_out_grad to be presented."; - CHECK_NE(src.attrs.count("grad_xs"), 0U) << "Gradient require grad_xs to be presented."; + ICHECK_NE(src.attrs.count("grad_xs"), 0U) << "Gradient require grad_xs to be presented."; const std::vector& ys = src.GetAttr >("grad_ys"); const std::vector& ys_out_grad = src.GetAttr >("grad_ys_out_grad"); @@ -124,7 +124,7 @@ Graph Gradient(Graph src) { topo_order.push_back(node); }); - CHECK_EQ(ys.size(), ys_out_grad.size()); + ICHECK_EQ(ys.size(), ys_out_grad.size()); for (size_t i = 0; i < ys.size(); ++i) { NodeEntry ograd = ys_out_grad[i]; output_grads[ys[i].node.get()][ys[i].index].grads = {ograd}; @@ -132,7 +132,7 @@ Graph Gradient(Graph src) { // Check that all xs are reachable from ys for (size_t i = 0; i < xs.size(); ++i) { - CHECK(output_grads.find(xs[i].node.get()) != output_grads.end()) + ICHECK(output_grads.find(xs[i].node.get()) != output_grads.end()) << "Cannot differentiate with respect to the " << i + 1 << "-th variable " << "because it is unreachable from the outputs."; } @@ -182,7 +182,7 @@ Graph Gradient(Graph src) { // Check for FGradient if (grad_fun_map.contains(ptr->op())) { input_grads = grad_fun_map[ptr->op()](fwd_node, out_agg_grads); - CHECK_EQ((*rit)->inputs.size(), input_grads.size()) + ICHECK_EQ((*rit)->inputs.size(), input_grads.size()) << "Gradient function not returning enough gradient"; } else if 
(CheckGradAllZero(out_agg_grads, zero_ops)) { for (size_t i = 0; i < fwd_node->num_inputs(); ++i) { @@ -206,9 +206,9 @@ Graph Gradient(Graph src) { LOG(FATAL) << "Operator " << fwd_node->op()->name << " is non-differentiable " << "because it didn't register FGradient attribute."; } - for (const auto& nodeEntry : input_grads) CHECK(nodeEntry.node); + for (const auto& nodeEntry : input_grads) ICHECK(nodeEntry.node); auto git = input_grads.begin(); - CHECK((*rit)->inputs.size() <= input_grads.size()); + ICHECK((*rit)->inputs.size() <= input_grads.size()); for (auto it = (*rit)->inputs.begin(); it != (*rit)->inputs.end(); ++it, ++git) { auto& output_grad_entry = output_grads[it->node.get()][it->index]; // if any of the backward op can do shape inference, the hint is not necessary. diff --git a/nnvm/src/pass/graph_algorithm.h b/nnvm/src/pass/graph_algorithm.h index b305c08bc05f..4620079a0ab2 100644 --- a/nnvm/src/pass/graph_algorithm.h +++ b/nnvm/src/pass/graph_algorithm.h @@ -45,7 +45,7 @@ namespace pass { inline uint32_t FindBestPath(const IndexedGraph& graph, const std::vector& node_reward, std::vector* path) { const uint32_t num_nodes = static_cast(graph.num_nodes()); - CHECK_EQ(num_nodes, node_reward.size()); + ICHECK_EQ(num_nodes, node_reward.size()); std::vector best_reward(node_reward.size(), 0); std::vector next_node(node_reward.size(), num_nodes); @@ -73,7 +73,7 @@ inline uint32_t FindBestPath(const IndexedGraph& graph, const std::vectorpush_back(nid); reward += node_reward[nid]; } - CHECK_EQ(reward, best_solution); + ICHECK_EQ(reward, best_solution); return best_solution; } @@ -90,8 +90,8 @@ inline uint32_t FindBestPath(const IndexedGraph& graph, const std::vector node_importance, uint32_t max_ncolor, std::vector* color) { - CHECK_NE(max_ncolor, 0U); - CHECK_EQ(graph.num_nodes(), node_importance.size()); + ICHECK_NE(max_ncolor, 0U); + ICHECK_EQ(graph.num_nodes(), node_importance.size()); color->clear(); color->resize(graph.num_nodes(), max_ncolor); @@ -105,7 +105,7 @@ inline uint32_t ColorNodeGroup(const IndexedGraph& graph, std::vector if (reward == 0) break; for (uint32_t nid : path) { if (node_importance[nid] != 0) { - CHECK_EQ(color->at(nid), max_ncolor); + ICHECK_EQ(color->at(nid), max_ncolor); color->at(nid) = cindex; // make the importance 0 after color is decided. node_importance[nid] = 0; diff --git a/nnvm/src/pass/infer_shape_type.cc b/nnvm/src/pass/infer_shape_type.cc index fde1691ee96a..859c5b385c4a 100644 --- a/nnvm/src/pass/infer_shape_type.cc +++ b/nnvm/src/pass/infer_shape_type.cc @@ -49,7 +49,7 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name, if (ret.attrs.count(input_name) != 0) { const AttrVector& shape_args = ret.GetAttr(input_name); - CHECK_LE(shape_args.size(), idx.input_nodes().size()) + ICHECK_LE(shape_args.size(), idx.input_nodes().size()) << "More provided shapes than number of arguments."; for (size_t i = 0; i < shape_args.size(); ++i) { rshape[idx.entry_id(idx.input_nodes()[i], 0)] = shape_args[i]; @@ -88,22 +88,22 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name, const uint32_t num_outputs = inode.source->num_outputs(); if (inode.source->is_variable()) { // Variable node. No operator. Only one output entry. 
- CHECK(inode.source->op() == nullptr); - CHECK_EQ(num_outputs, 1U); + ICHECK(inode.source->op() == nullptr); + ICHECK_EQ(num_outputs, 1U); const uint32_t out_ent_id = idx.entry_id(nid, 0); if (shape_attr_key.length() != 0 && fis_none(rshape[out_ent_id])) { auto it = inode.source->attrs.dict.find(shape_attr_key); if (it != inode.source->attrs.dict.end()) { std::istringstream is(it->second); - CHECK(is >> rshape[out_ent_id]) << "Invalid attribute"; + ICHECK(is >> rshape[out_ent_id]) << "Invalid attribute"; } } } else if (is_backward.get(inode.source->op(), false) && inode.control_deps.size()) { - CHECK_GE(inode.control_deps.size(), 1U) + ICHECK_GE(inode.control_deps.size(), 1U) << "BackwardOp need to have control_deps to its forward op"; const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; ObjectPtr fwd_ptr = inode.source->control_deps[0]; - CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; + ICHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; // use gradient function to find out the correspondence. std::vector ograd(fwd_ptr->num_outputs()); for (size_t i = 0; i < ograd.size(); ++i) { @@ -119,18 +119,18 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name, if (fis_none(rshape[eid])) { rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { - CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) + ICHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) << "Backward shape inconsistent with the forward shape"; } if (igrad_node == nullptr) { igrad_node = igrad[i].node.get(); } else { - CHECK(igrad_node == igrad[i].node.get()); + ICHECK(igrad_node == igrad[i].node.get()); } } } // out grad entries - CHECK(igrad_node != nullptr) + ICHECK(igrad_node != nullptr) << "Cannot find matching backward op for " << inode.source->attrs.name; for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { const NodeEntry& e = igrad_node->inputs[i]; @@ -164,9 +164,9 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name, throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what()); } } else { - CHECK(!last_iter) << "Attribute " << infer_name << " is not registered by op " - << inode.source->op()->name - << " we are not able to complete the inference because of this"; + ICHECK(!last_iter) << "Attribute " << infer_name << " is not registered by op " + << inode.source->op()->name + << " we are not able to complete the inference because of this"; } } // Save to the result map. 
diff --git a/nnvm/src/pass/place_device.cc b/nnvm/src/pass/place_device.cc index d45658ae24ab..4a9d93465de8 100644 --- a/nnvm/src/pass/place_device.cc +++ b/nnvm/src/pass/place_device.cc @@ -33,11 +33,11 @@ namespace { // simply logic to place device according to device_group hint // insert copy node when there is Graph PlaceDevice(Graph src) { - CHECK(src.attrs.count("device_group_attr_key")) + ICHECK(src.attrs.count("device_group_attr_key")) << "Need graph attribute \"device_group_attr_key\" in PlaceDevice"; - CHECK(src.attrs.count("device_assign_map")) + ICHECK(src.attrs.count("device_assign_map")) << "Need graph attribute \"device_assign_map\" in PlaceDevice"; - CHECK(src.attrs.count("device_copy_op")) + ICHECK(src.attrs.count("device_copy_op")) << "Need graph attribute \"device_copy_op\" in PlaceDevice"; std::string device_group_attr_key = src.GetAttr("device_group_attr_key"); const Op* copy_op = Op::Get(src.GetAttr("device_copy_op")); @@ -48,7 +48,7 @@ Graph PlaceDevice(Graph src) { // copy on write semanatics if (src.attrs.count("device") != 0) { device = src.MoveCopyAttr("device"); - CHECK_EQ(device.size(), idx.num_nodes()); + ICHECK_EQ(device.size(), idx.num_nodes()); } else { device.resize(idx.num_nodes(), -1); } @@ -60,7 +60,7 @@ Graph PlaceDevice(Graph src) { if (it != inode.source->attrs.dict.end()) { const std::string& device_group = it->second; auto dit = device_assign_map.find(device_group); - CHECK(dit != device_assign_map.end()) + ICHECK(dit != device_assign_map.end()) << "The device assignment not found for group " << device_group; device[nid] = dit->second; } else { @@ -139,7 +139,7 @@ Graph PlaceDevice(Graph src) { } } if (inode.source->is_variable()) { - CHECK(!need_mutate) << "consistency check"; + ICHECK(!need_mutate) << "consistency check"; } if (need_mutate) { ObjectPtr new_node = Node::Create(); diff --git a/nnvm/src/pass/plan_memory.cc b/nnvm/src/pass/plan_memory.cc index 7d478c646a1f..931dbbd8d24c 100644 --- a/nnvm/src/pass/plan_memory.cc +++ b/nnvm/src/pass/plan_memory.cc @@ -112,7 +112,7 @@ class GraphAllocator { } // release a memory space. 
void Release(StorageID id, uint32_t node_id) { - CHECK_NE(id, kBadStorageID); + ICHECK_NE(id, kBadStorageID); if (id == kExternalStorageID || id == kDynamicStorageID) return; StorageEntry* e = data_[id].get(); e->released_by_node = node_id; @@ -219,7 +219,7 @@ size_t AllocMemory(const Graph& ret, const IndexedGraph& idx, std::vector identity; if (finplace_identity.count(inode.source->op()) != 0) { identity = finplace_identity[inode.source->op()](inode.source->attrs); - CHECK_EQ(identity.size(), inplace_pairs.size()) + ICHECK_EQ(identity.size(), inplace_pairs.size()) << "FInplaceOption and FInplaceIdentity returned vectors of different " << "size for operator " << inode.source->op()->name; } else { diff --git a/nnvm/src/pass/print_graph_ir.cc b/nnvm/src/pass/print_graph_ir.cc index 4fe92e665961..6604d810f288 100644 --- a/nnvm/src/pass/print_graph_ir.cc +++ b/nnvm/src/pass/print_graph_ir.cc @@ -41,7 +41,7 @@ AttrPrinter GetVectorPrinter_(const T& vec) { AttrPrinter GetVectorPrinter(const Graph& graph, const std::string& key) { auto it = graph.attrs.find(key); - CHECK(it != graph.attrs.end()) << "Cannot find " << key << " in graph attr"; + ICHECK(it != graph.attrs.end()) << "Cannot find " << key << " in graph attr"; const any& value = *(it->second); if (value.type() == typeid(std::vector)) { return GetVectorPrinter_(nnvm::get >(value)); diff --git a/nnvm/src/pass/saveload_json.cc b/nnvm/src/pass/saveload_json.cc index 3916da43618d..dbd8ee0f83d4 100644 --- a/nnvm/src/pass/saveload_json.cc +++ b/nnvm/src/pass/saveload_json.cc @@ -72,13 +72,13 @@ struct JSONNode { } void Load(dmlc::JSONReader* reader) { reader->BeginArray(); - CHECK(reader->NextArrayItem()) << "invalid json format"; + ICHECK(reader->NextArrayItem()) << "invalid json format"; reader->Read(&node_id); - CHECK(reader->NextArrayItem()) << "invalid json format"; + ICHECK(reader->NextArrayItem()) << "invalid json format"; reader->Read(&index); if (reader->NextArrayItem()) { reader->Read(&version); - CHECK(!reader->NextArrayItem()) << "invalid json format"; + ICHECK(!reader->NextArrayItem()) << "invalid json format"; } else { version = 0; } @@ -226,12 +226,12 @@ std::shared_ptr JSONGraph2Symbol(const JSONGraph& jgraph, bool no_parse) for (const JSONNode& n : jgraph.nodes) { n.node->inputs.reserve(n.inputs.size()); for (const JSONNode::Entry& e : n.inputs) { - CHECK(e.node_id < jgraph.nodes.size()); + ICHECK(e.node_id < jgraph.nodes.size()); n.node->inputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version}); } n.node->control_deps.reserve(n.control_deps.size()); for (uint32_t nid : n.control_deps) { - CHECK(nid < jgraph.nodes.size()); + ICHECK(nid < jgraph.nodes.size()); n.node->control_deps.push_back(jgraph.nodes[nid].node); } for (const JSONGraph& subgraph : n.subgraphs) { @@ -252,13 +252,13 @@ std::shared_ptr JSONGraph2Symbol(const JSONGraph& jgraph, bool no_parse) } // consistency check for (uint32_t nid : jgraph.arg_nodes) { - CHECK(nid < jgraph.nodes.size()); - CHECK(jgraph.nodes[nid].node->is_variable()); + ICHECK(nid < jgraph.nodes.size()); + ICHECK(jgraph.nodes[nid].node->is_variable()); } std::shared_ptr symbol = std::make_shared(); symbol->outputs.reserve(jgraph.heads.size()); for (const JSONNode::Entry& e : jgraph.heads) { - CHECK(e.node_id < jgraph.nodes.size()); + ICHECK(e.node_id < jgraph.nodes.size()); symbol->outputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version}); } return symbol; @@ -266,7 +266,7 @@ std::shared_ptr JSONGraph2Symbol(const JSONGraph& jgraph, bool no_parse) 
// Load a graph from JSON file. Graph LoadJSON(Graph src) { - CHECK_NE(src.attrs.count("json"), 0U) << "Load JSON require json to be presented."; + ICHECK_NE(src.attrs.count("json"), 0U) << "Load JSON require json to be presented."; const std::string& json_str = nnvm::get(*src.attrs.at("json")); bool no_parse = false; if (src.attrs.count("load_json_no_parse")) { diff --git a/nnvm/tests/cpp/op_test.cc b/nnvm/tests/cpp/op_test.cc index 2ebd14688f46..39a998a4eebe 100644 --- a/nnvm/tests/cpp/op_test.cc +++ b/nnvm/tests/cpp/op_test.cc @@ -35,7 +35,7 @@ TEST(Op, GetAttr) { auto add = Op::Get("add"); auto nick = Op::GetAttr("nick_name"); - CHECK_EQ(nick[add], "plus"); + ICHECK_EQ(nick[add], "plus"); } int main(int argc, char** argv) { diff --git a/nnvm/tests/cpp/tuple_test.cc b/nnvm/tests/cpp/tuple_test.cc index 2c2c307aadce..e28ecd89f6fa 100644 --- a/nnvm/tests/cpp/tuple_test.cc +++ b/nnvm/tests/cpp/tuple_test.cc @@ -28,18 +28,18 @@ TEST(Tuple, Basic) { Tuple y{1, 2, 3, 5, 6}; x = std::move(y); - CHECK_EQ(x.ndim(), 5); + ICHECK_EQ(x.ndim(), 5); Tuple z{1, 2, 3, 5, 6}; std::ostringstream os; os << z; - CHECK_EQ(os.str(), "[1,2,3,5,6]"); + ICHECK_EQ(os.str(), "[1,2,3,5,6]"); std::istringstream is(os.str()); is >> y; - CHECK_EQ(x, y); + ICHECK_EQ(x, y); Tuple ss{1, 2, 3}; TShape s = ss; s = std::move(ss); - CHECK((s == TShape{1, 2, 3})); + ICHECK((s == TShape{1, 2, 3})); } int main(int argc, char** argv) { diff --git a/tests/cpp/arith_simplify_test.cc b/tests/cpp/arith_simplify_test.cc index 341d9f8df062..829e2689887e 100644 --- a/tests/cpp/arith_simplify_test.cc +++ b/tests/cpp/arith_simplify_test.cc @@ -27,11 +27,11 @@ TEST(Simplify, MinMax) { auto x = tvm::te::var("x"); auto e1 = (tvm::max(x, 1) - tvm::max(x, 1)); auto e1s = ana.canonical_simplify(e1); - CHECK(tvm::tir::is_zero(e1s)); + ICHECK(tvm::tir::is_zero(e1s)); auto e2 = (x * tvm::min(x, 1)) - (x * tvm::min(x, 1)); auto e2s = ana.canonical_simplify(e2); - CHECK(tvm::tir::is_zero(e2s)); + ICHECK(tvm::tir::is_zero(e2s)); } TEST(Simplify, Mul) { @@ -39,7 +39,7 @@ TEST(Simplify, Mul) { auto x = tvm::te::var("x"); auto e = (x * x) - (x * x); auto es = ana.canonical_simplify(e); - CHECK(tvm::tir::is_zero(es)); + ICHECK(tvm::tir::is_zero(es)); } TEST(Simplify, Mod) { @@ -51,7 +51,7 @@ TEST(Simplify, Mod) { // and therefore, the constant folding will be attempted in CanonicalSimplify auto mod = ana.canonical_simplify(tvm::tir::Mod(x, y)); auto es = ana.canonical_simplify(mod - x); - CHECK(tvm::tir::is_zero(es)); + ICHECK(tvm::tir::is_zero(es)); } int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/tests/cpp/attrs_test.cc b/tests/cpp/attrs_test.cc index 7b301bd13f68..4d6de60b9706 100644 --- a/tests/cpp/attrs_test.cc +++ b/tests/cpp/attrs_test.cc @@ -65,21 +65,21 @@ TEST(Attrs, Basic) { LOG(FATAL) << "bad"; } catch (const tvm::AttrError& e) { std::string what = e.what(); - CHECK(what.find("expr : PrimExpr, default=1") != std::string::npos); - CHECK(what.find("axisx") != std::string::npos); + ICHECK(what.find("expr : PrimExpr, default=1") != std::string::npos); + ICHECK(what.find("axisx") != std::string::npos); } n->InitBySeq("learning_rate", PrimExpr(1), "expr", 128, "name", "xx"); - CHECK_EQ(n->learning_rate, 1.0); + ICHECK_EQ(n->learning_rate, 1.0); n->InitBySeq("name", "xxx", "expr", 128); - CHECK_EQ(n->name, "xxx"); - CHECK_EQ(n->axis, 10); - CHECK_EQ(n->expr.as()->value, 128); + ICHECK_EQ(n->name, "xxx"); + ICHECK_EQ(n->axis, 10); + ICHECK_EQ(n->expr.as()->value, 128); // Check docstring std::ostringstream os; 
n->PrintDocString(os); LOG(INFO) << "docstring\n" << os.str(); - CHECK(os.str().find("expr : PrimExpr, default=1") != std::string::npos); + ICHECK(os.str().find("expr : PrimExpr, default=1") != std::string::npos); } int main(int argc, char** argv) { diff --git a/tests/cpp/auto_scheduler_test.cc b/tests/cpp/auto_scheduler_test.cc index aacc3b154463..5e4533733d2e 100644 --- a/tests/cpp/auto_scheduler_test.cc +++ b/tests/cpp/auto_scheduler_test.cc @@ -43,8 +43,8 @@ tvm::Array conv2d_nchw_bn_relu_func(int N, int H, int W, int CI int OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) / strides + 1; const auto& conv = topi::conv2d_nchw(data, kernel, padding, padding, strides, strides); - CHECK(conv->shape[2].as()->value == OH); - CHECK(conv->shape[3].as()->value == OW); + ICHECK(conv->shape[2].as()->value == OH); + ICHECK(conv->shape[3].as()->value == OW); const auto& bias_add = compute( {N, CO, OH, OW}, [&](Var i, Var j, Var k, Var l) { return conv[i][j][k][l] + bias[j][0][0]; }, @@ -76,9 +76,9 @@ TEST(ComputeDAG, AccessAnalyzer) { std::set needs_multi_level_tiling = {conv}; for (size_t stage_id = 0; stage_id < dag->ops.size(); stage_id++) { if (needs_multi_level_tiling.count(stage_id)) { - CHECK(dag->access_analyzer.NeedsMultiLevelTiling(dag->ops[stage_id])); + ICHECK(dag->access_analyzer.NeedsMultiLevelTiling(dag->ops[stage_id])); } else { - CHECK(!dag->access_analyzer.NeedsMultiLevelTiling(dag->ops[stage_id])); + ICHECK(!dag->access_analyzer.NeedsMultiLevelTiling(dag->ops[stage_id])); } } @@ -86,37 +86,37 @@ TEST(ComputeDAG, AccessAnalyzer) { bn_scale, bn_mul, bn_offset, bn_add, relu}; for (size_t stage_id = 0; stage_id < dag->ops.size(); stage_id++) { if (is_simple_access.count(stage_id)) { - CHECK(dag->access_analyzer.IsSimpleAccess(dag->ops[stage_id])); + ICHECK(dag->access_analyzer.IsSimpleAccess(dag->ops[stage_id])); } else { - CHECK(!dag->access_analyzer.IsSimpleAccess(dag->ops[stage_id])); + ICHECK(!dag->access_analyzer.IsSimpleAccess(dag->ops[stage_id])); } } std::set is_strictly_inlinable = {bias_add, bn_mul, bn_add, relu}; for (size_t stage_id = 0; stage_id < dag->ops.size(); stage_id++) { if (is_strictly_inlinable.count(stage_id)) { - CHECK(dag->access_analyzer.IsStrictlyInlineable(dag->ops[stage_id])); + ICHECK(dag->access_analyzer.IsStrictlyInlineable(dag->ops[stage_id])); } else { - CHECK(!dag->access_analyzer.IsStrictlyInlineable(dag->ops[stage_id])); + ICHECK(!dag->access_analyzer.IsStrictlyInlineable(dag->ops[stage_id])); } } std::set is_output = {relu}; for (size_t stage_id = 0; stage_id < dag->ops.size(); stage_id++) { if (is_output.count(stage_id)) { - CHECK(dag->access_analyzer.IsOutput(dag->ops[stage_id])); + ICHECK(dag->access_analyzer.IsOutput(dag->ops[stage_id])); } else { - CHECK(!dag->access_analyzer.IsOutput(dag->ops[stage_id])); + ICHECK(!dag->access_analyzer.IsOutput(dag->ops[stage_id])); } } - CHECK_EQ(dag->access_analyzer.GetNumCommonOuterIterator(dag->ops[conv], dag->ops[bias_add]), 4); - CHECK_EQ(dag->access_analyzer.GetNumCommonOuterIterator(dag->ops[conv], dag->ops[relu]), 4); - CHECK_EQ(dag->access_analyzer.GetNumCommonOuterIterator(dag->ops[data], dag->ops[relu]), 1); + ICHECK_EQ(dag->access_analyzer.GetNumCommonOuterIterator(dag->ops[conv], dag->ops[bias_add]), 4); + ICHECK_EQ(dag->access_analyzer.GetNumCommonOuterIterator(dag->ops[conv], dag->ops[relu]), 4); + ICHECK_EQ(dag->access_analyzer.GetNumCommonOuterIterator(dag->ops[data], dag->ops[relu]), 1); - CHECK(dag->access_analyzer.ElementWiseMatch(dag->ops[conv], dag->ops[bias_add])); - 
CHECK(dag->access_analyzer.ElementWiseMatch(dag->ops[conv], dag->ops[relu])); - CHECK(!dag->access_analyzer.ElementWiseMatch(dag->ops[data], dag->ops[padding])); + ICHECK(dag->access_analyzer.ElementWiseMatch(dag->ops[conv], dag->ops[bias_add])); + ICHECK(dag->access_analyzer.ElementWiseMatch(dag->ops[conv], dag->ops[relu])); + ICHECK(!dag->access_analyzer.ElementWiseMatch(dag->ops[data], dag->ops[padding])); std::unordered_set op_set; { @@ -126,8 +126,8 @@ TEST(ComputeDAG, AccessAnalyzer) { {bn_offset, bn_add}, {bn_add, relu}}; for (const auto& pair : consumer_list) { op_set = dag->access_analyzer.GetConsumers(s0, s0->stages[pair.first]->op); - CHECK_EQ(op_set.size(), 1); - CHECK_EQ((*op_set.begin()), s0->stages[pair.second]->op); + ICHECK_EQ(op_set.size(), 1); + ICHECK_EQ((*op_set.begin()), s0->stages[pair.second]->op); } std::vector>> producer_list = {{padding, {data}}, {conv, {padding, kernel}}, @@ -137,9 +137,9 @@ TEST(ComputeDAG, AccessAnalyzer) { {relu, {bn_add}}}; for (const auto& pair : producer_list) { op_set = dag->access_analyzer.GetProducers(s0, s0->stages[pair.first]->op); - CHECK_EQ(op_set.size(), pair.second.size()); + ICHECK_EQ(op_set.size(), pair.second.size()); for (const auto& target : pair.second) { - CHECK(op_set.count(s0->stages[target]->op)); + ICHECK(op_set.count(s0->stages[target]->op)); } } } @@ -152,8 +152,8 @@ TEST(ComputeDAG, AccessAnalyzer) { std::vector> consumer_list = {{data, conv}, {kernel, conv}, {conv, relu}}; for (const auto& pair : consumer_list) { op_set = dag->access_analyzer.GetConsumers(s0, s0->stages[pair.first]->op); - CHECK_EQ(op_set.size(), 1); - CHECK_EQ((*op_set.begin()), s0->stages[pair.second]->op); + ICHECK_EQ(op_set.size(), 1); + ICHECK_EQ((*op_set.begin()), s0->stages[pair.second]->op); } std::vector>> producer_list = {{padding, {data}}, {conv, {padding, kernel}}, @@ -163,9 +163,9 @@ TEST(ComputeDAG, AccessAnalyzer) { {relu, {bn_add}}}; for (const auto& pair : producer_list) { op_set = dag->access_analyzer.GetDirectProducers(s0->stages[pair.first]->op); - CHECK_EQ(op_set.size(), pair.second.size()); + ICHECK_EQ(op_set.size(), pair.second.size()); for (const auto& target : pair.second) { - CHECK(op_set.count(s0->stages[target]->op)); + ICHECK(op_set.count(s0->stages[target]->op)); } } } diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index 62c37f827cd5..ed50e3c86e85 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -56,14 +56,14 @@ TEST(BuildModule, Basic) { auto module = build(lowered, target, Target()); auto mali_target = Target("opencl -model=Mali-T860MP4@800Mhz -device=mali"); - CHECK_EQ(mali_target->kind->name, "opencl"); - CHECK_EQ(mali_target->keys.size(), 3); - CHECK_EQ(mali_target->keys[0], "mali"); - CHECK_EQ(mali_target->keys[1], "opencl"); - CHECK_EQ(mali_target->keys[2], "gpu"); - CHECK_EQ(mali_target->GetAttr("device").value(), "mali"); - CHECK_EQ(mali_target->GetAttr("model").value(), "Mali-T860MP4@800Mhz"); - CHECK_EQ(mali_target->GetAttr("max_num_threads").value(), 256); + ICHECK_EQ(mali_target->kind->name, "opencl"); + ICHECK_EQ(mali_target->keys.size(), 3); + ICHECK_EQ(mali_target->keys[0], "mali"); + ICHECK_EQ(mali_target->keys[1], "opencl"); + ICHECK_EQ(mali_target->keys[2], "gpu"); + ICHECK_EQ(mali_target->GetAttr("device").value(), "mali"); + ICHECK_EQ(mali_target->GetAttr("model").value(), "Mali-T860MP4@800Mhz"); + ICHECK_EQ(mali_target->GetAttr("max_num_threads").value(), 256); } TEST(BuildModule, Heterogeneous) { @@ -122,7 +122,7 @@ TEST(BuildModule, 
Heterogeneous) { auto module = build(inputs, Target()); // Assertion for build. - CHECK_EQ(module->imports().size(), 1); + ICHECK_EQ(module->imports().size(), 1); // Execute the graph and check the correctness. // Setup graph json. @@ -177,7 +177,7 @@ TEST(BuildModule, Heterogeneous) { // test FFI for module. auto test_ffi = PackedFunc([](TVMArgs args, TVMRetValue* rv) { int tcode = args[1]; - CHECK_EQ(args[0].type_code(), tcode); + ICHECK_EQ(args[0].type_code(), tcode); }); test_ffi(runtime::Module(mod), static_cast(kTVMModuleHandle)); @@ -196,7 +196,7 @@ TEST(BuildModule, Heterogeneous) { // Check correctness. for (int i = 0; i < n; ++i) { - CHECK_LT(std::fabs(p_out[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5); + ICHECK_LT(std::fabs(p_out[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5); } } diff --git a/tests/cpp/container_test.cc b/tests/cpp/container_test.cc index d1d6ffb7ce76..35fd5b1c45b1 100644 --- a/tests/cpp/container_test.cc +++ b/tests/cpp/container_test.cc @@ -160,15 +160,15 @@ TEST(Array, Mutate) { Array list{x, z, z}; auto list2 = list; list.Set(1, x); - CHECK(list[1].same_as(x)); - CHECK(list2[1].same_as(z)); + ICHECK(list[1].same_as(x)); + ICHECK(list2[1].same_as(z)); } TEST(Array, Iterator) { using namespace tvm; Array array{1, 2, 3}; std::vector vector(array.begin(), array.end()); - CHECK(vector[1].as()->value == 2); + ICHECK(vector[1].as()->value == 2); } TEST(Array, PushPop) { @@ -277,10 +277,10 @@ TEST(Map, Expr) { auto z = max(x + 1 + 2, 100); auto zz = z + 1; Map dict{{x, z}, {z, 2}}; - CHECK(dict.size() == 2); - CHECK(dict[x].same_as(z)); - CHECK(dict.count(z)); - CHECK(!dict.count(zz)); + ICHECK(dict.size() == 2); + ICHECK(dict[x].same_as(z)); + ICHECK(dict.count(z)); + ICHECK(!dict.count(zz)); } TEST(Map, Str) { @@ -288,8 +288,8 @@ TEST(Map, Str) { Var x("x"); auto z = max(x + 1 + 2, 100); Map dict{{"x", z}, {"z", 2}}; - CHECK(dict.size() == 2); - CHECK(dict["x"].same_as(z)); + ICHECK(dict.size() == 2); + ICHECK(dict["x"].same_as(z)); } TEST(Map, Mutate) { @@ -298,19 +298,19 @@ TEST(Map, Mutate) { auto z = max(x + 1 + 2, 100); Map dict{{x, z}, {z, 2}}; auto zz = z + 1; - CHECK(dict[x].same_as(z)); + ICHECK(dict[x].same_as(z)); dict.Set(x, zz); auto dict2 = dict; - CHECK(dict2.count(z) == 1); + ICHECK(dict2.count(z) == 1); dict.Set(zz, x); - CHECK(dict2.count(zz) == 0); - CHECK(dict.count(zz) == 1); + ICHECK(dict2.count(zz) == 0); + ICHECK(dict.count(zz) == 1); auto it = dict.find(zz); - CHECK(it != dict.end() && (*it).second.same_as(x)); + ICHECK(it != dict.end() && (*it).second.same_as(x)); it = dict2.find(zz); - CHECK(it == dict2.end()); + ICHECK(it == dict2.end()); } TEST(Map, Iterator) { @@ -319,17 +319,17 @@ TEST(Map, Iterator) { Map map1{{a, b}}; std::unordered_map map2(map1.begin(), map1.end()); - CHECK(map2[a].as()->value == 2); + ICHECK(map2[a].as()->value == 2); } TEST(Map, Insert) { using namespace tvm; auto check = [](const Map& result, std::unordered_map expected) { - CHECK_EQ(result.size(), expected.size()); + ICHECK_EQ(result.size(), expected.size()); for (const auto& kv : result) { - CHECK(expected.count(kv.first)); - CHECK_EQ(expected[kv.first], kv.second.operator int64_t()); + ICHECK(expected.count(kv.first)); + ICHECK_EQ(expected[kv.first], kv.second.operator int64_t()); expected.erase(kv.first); } }; @@ -348,10 +348,10 @@ TEST(Map, Insert) { TEST(Map, Erase) { auto check = [](const Map& result, std::unordered_map expected) { - CHECK_EQ(result.size(), expected.size()); + ICHECK_EQ(result.size(), expected.size()); for (const auto& kv : result) { - 
CHECK(expected.count(kv.first)); - CHECK_EQ(expected[kv.first], kv.second.operator int64_t()); + ICHECK(expected.count(kv.first)); + ICHECK_EQ(expected[kv.first], kv.second.operator int64_t()); expected.erase(kv.first); } }; @@ -373,8 +373,8 @@ TEST(String, MoveFromStd) { string expect = source; String s(std::move(source)); string copy = (string)s; - CHECK_EQ(copy, expect); - CHECK_EQ(source.size(), 0); + ICHECK_EQ(copy, expect); + ICHECK_EQ(source.size(), 0); } TEST(String, CopyFromStd) { @@ -383,26 +383,26 @@ TEST(String, CopyFromStd) { string expect = source; String s{source}; string copy = (string)s; - CHECK_EQ(copy, expect); - CHECK_EQ(source.size(), expect.size()); + ICHECK_EQ(copy, expect); + ICHECK_EQ(source.size(), expect.size()); } TEST(String, Assignment) { using namespace std; String s{string{"hello"}}; s = string{"world"}; - CHECK_EQ(s == "world", true); + ICHECK_EQ(s == "world", true); string s2{"world2"}; s = std::move(s2); - CHECK_EQ(s == "world2", true); + ICHECK_EQ(s == "world2", true); } TEST(String, empty) { using namespace std; String s{"hello"}; - CHECK_EQ(s.empty(), false); + ICHECK_EQ(s.empty(), false); s = std::string(""); - CHECK_EQ(s.empty(), true); + ICHECK_EQ(s.empty(), true); } TEST(String, Comparisons) { @@ -412,24 +412,24 @@ TEST(String, Comparisons) { String s{source}; String m{mismatch}; - CHECK_EQ(s == source, true); - CHECK_EQ(s == mismatch, false); - CHECK_EQ(s == source.data(), true); - CHECK_EQ(s == mismatch.data(), false); + ICHECK_EQ(s == source, true); + ICHECK_EQ(s == mismatch, false); + ICHECK_EQ(s == source.data(), true); + ICHECK_EQ(s == mismatch.data(), false); - CHECK_EQ(s < m, source < mismatch); - CHECK_EQ(s > m, source > mismatch); - CHECK_EQ(s <= m, source <= mismatch); - CHECK_EQ(s >= m, source >= mismatch); - CHECK_EQ(s == m, source == mismatch); - CHECK_EQ(s != m, source != mismatch); + ICHECK_EQ(s < m, source < mismatch); + ICHECK_EQ(s > m, source > mismatch); + ICHECK_EQ(s <= m, source <= mismatch); + ICHECK_EQ(s >= m, source >= mismatch); + ICHECK_EQ(s == m, source == mismatch); + ICHECK_EQ(s != m, source != mismatch); - CHECK_EQ(m < s, mismatch < source); - CHECK_EQ(m > s, mismatch > source); - CHECK_EQ(m <= s, mismatch <= source); - CHECK_EQ(m >= s, mismatch >= source); - CHECK_EQ(m == s, mismatch == source); - CHECK_EQ(m != s, mismatch != source); + ICHECK_EQ(m < s, mismatch < source); + ICHECK_EQ(m > s, mismatch > source); + ICHECK_EQ(m <= s, mismatch <= source); + ICHECK_EQ(m >= s, mismatch >= source); + ICHECK_EQ(m == s, mismatch == source); + ICHECK_EQ(m != s, mismatch != source); } // Check '\0' handling @@ -439,11 +439,11 @@ TEST(String, null_byte_handling) { string v1 = "hello world"; size_t v1_size = v1.size(); v1[5] = '\0'; - CHECK_EQ(v1[5], '\0'); - CHECK_EQ(v1.size(), v1_size); + ICHECK_EQ(v1[5], '\0'); + ICHECK_EQ(v1.size(), v1_size); String str_v1{v1}; - CHECK_EQ(str_v1.compare(v1), 0); - CHECK_EQ(str_v1.size(), v1_size); + ICHECK_EQ(str_v1.compare(v1), 0); + ICHECK_EQ(str_v1.size(), v1_size); // Ensure bytes after '\0' are taken into account for mismatches. 
string v2 = "aaa one"; @@ -452,12 +452,12 @@ TEST(String, null_byte_handling) { v3[3] = '\0'; String str_v2{v2}; String str_v3{v3}; - CHECK_EQ(str_v2.compare(str_v3), -1); - CHECK_EQ(str_v2.size(), 7); + ICHECK_EQ(str_v2.compare(str_v3), -1); + ICHECK_EQ(str_v2.size(), 7); // strcmp won't be able to detect the mismatch - CHECK_EQ(strcmp(v2.data(), v3.data()), 0); + ICHECK_EQ(strcmp(v2.data(), v3.data()), 0); // string::compare can handle \0 since it knows size - CHECK_LT(v2.compare(v3), 0); + ICHECK_LT(v2.compare(v3), 0); // If there is mismatch before '\0', should still handle it. string v4 = "acc one"; @@ -466,12 +466,12 @@ TEST(String, null_byte_handling) { v5[3] = '\0'; String str_v4{v4}; String str_v5{v5}; - CHECK_GT(str_v4.compare(str_v5), 0); - CHECK_EQ(str_v4.size(), 7); + ICHECK_GT(str_v4.compare(str_v5), 0); + ICHECK_EQ(str_v4.size(), 7); // strcmp is able to detect the mismatch - CHECK_GT(strcmp(v4.data(), v5.data()), 0); + ICHECK_GT(strcmp(v4.data(), v5.data()), 0); // string::compare can handle \0 since it knows size - CHECK_GT(v4.compare(v5), 0); + ICHECK_GT(v4.compare(v5), 0); } TEST(String, compare_same_memory_region_different_size) { @@ -479,11 +479,11 @@ TEST(String, compare_same_memory_region_different_size) { string source = "a string"; String str_source{source}; char* memory = const_cast(str_source.data()); - CHECK_EQ(str_source.compare(memory), 0); + ICHECK_EQ(str_source.compare(memory), 0); // This changes the string size memory[2] = '\0'; // memory is logically shorter now - CHECK_GT(str_source.compare(memory), 0); + ICHECK_GT(str_source.compare(memory), 0); } TEST(String, compare) { @@ -500,55 +500,55 @@ TEST(String, compare) { String str_mismatch4{mismatch4}; // compare with string - CHECK_EQ(str_source.compare(source), 0); - CHECK(str_source == source); - CHECK(source == str_source); - CHECK(str_source <= source); - CHECK(source <= str_source); - CHECK(str_source >= source); - CHECK(source >= str_source); - CHECK_LT(str_source.compare(mismatch1), 0); - CHECK(str_source < mismatch1); - CHECK(mismatch1 != str_source); - CHECK_GT(str_source.compare(mismatch2), 0); - CHECK(str_source > mismatch2); - CHECK(mismatch2 < str_source); - CHECK_GT(str_source.compare(mismatch3), 0); - CHECK(str_source > mismatch3); - CHECK_LT(str_source.compare(mismatch4), 0); - CHECK(str_source < mismatch4); - CHECK(mismatch4 > str_source); + ICHECK_EQ(str_source.compare(source), 0); + ICHECK(str_source == source); + ICHECK(source == str_source); + ICHECK(str_source <= source); + ICHECK(source <= str_source); + ICHECK(str_source >= source); + ICHECK(source >= str_source); + ICHECK_LT(str_source.compare(mismatch1), 0); + ICHECK(str_source < mismatch1); + ICHECK(mismatch1 != str_source); + ICHECK_GT(str_source.compare(mismatch2), 0); + ICHECK(str_source > mismatch2); + ICHECK(mismatch2 < str_source); + ICHECK_GT(str_source.compare(mismatch3), 0); + ICHECK(str_source > mismatch3); + ICHECK_LT(str_source.compare(mismatch4), 0); + ICHECK(str_source < mismatch4); + ICHECK(mismatch4 > str_source); // compare with char* - CHECK_EQ(str_source.compare(source.data()), 0); - CHECK(str_source == source.data()); - CHECK(source.data() == str_source); - CHECK(str_source <= source.data()); - CHECK(source <= str_source.data()); - CHECK(str_source >= source.data()); - CHECK(source >= str_source.data()); - CHECK_LT(str_source.compare(mismatch1.data()), 0); - CHECK(str_source < mismatch1.data()); - CHECK(str_source != mismatch1.data()); - CHECK(mismatch1.data() != str_source); - 
CHECK_GT(str_source.compare(mismatch2.data()), 0); - CHECK(str_source > mismatch2.data()); - CHECK(mismatch2.data() < str_source); - CHECK_GT(str_source.compare(mismatch3.data()), 0); - CHECK(str_source > mismatch3.data()); - CHECK_LT(str_source.compare(mismatch4.data()), 0); - CHECK(str_source < mismatch4.data()); - CHECK(mismatch4.data() > str_source); + ICHECK_EQ(str_source.compare(source.data()), 0); + ICHECK(str_source == source.data()); + ICHECK(source.data() == str_source); + ICHECK(str_source <= source.data()); + ICHECK(source <= str_source.data()); + ICHECK(str_source >= source.data()); + ICHECK(source >= str_source.data()); + ICHECK_LT(str_source.compare(mismatch1.data()), 0); + ICHECK(str_source < mismatch1.data()); + ICHECK(str_source != mismatch1.data()); + ICHECK(mismatch1.data() != str_source); + ICHECK_GT(str_source.compare(mismatch2.data()), 0); + ICHECK(str_source > mismatch2.data()); + ICHECK(mismatch2.data() < str_source); + ICHECK_GT(str_source.compare(mismatch3.data()), 0); + ICHECK(str_source > mismatch3.data()); + ICHECK_LT(str_source.compare(mismatch4.data()), 0); + ICHECK(str_source < mismatch4.data()); + ICHECK(mismatch4.data() > str_source); // compare with String - CHECK_LT(str_source.compare(str_mismatch1), 0); - CHECK(str_source < str_mismatch1); - CHECK_GT(str_source.compare(str_mismatch2), 0); - CHECK(str_source > str_mismatch2); - CHECK_GT(str_source.compare(str_mismatch3), 0); - CHECK(str_source > str_mismatch3); - CHECK_LT(str_source.compare(str_mismatch4), 0); - CHECK(str_source < str_mismatch4); + ICHECK_LT(str_source.compare(str_mismatch1), 0); + ICHECK(str_source < str_mismatch1); + ICHECK_GT(str_source.compare(str_mismatch2), 0); + ICHECK(str_source > str_mismatch2); + ICHECK_GT(str_source.compare(str_mismatch3), 0); + ICHECK(str_source > str_mismatch3); + ICHECK_LT(str_source.compare(str_mismatch4), 0); + ICHECK(str_source < str_mismatch4); } TEST(String, c_str) { @@ -557,8 +557,8 @@ TEST(String, c_str) { string mismatch = "mismatch"; String s{source}; - CHECK_EQ(std::strcmp(s.c_str(), source.data()), 0); - CHECK_NE(std::strcmp(s.c_str(), mismatch.data()), 0); + ICHECK_EQ(std::strcmp(s.c_str(), source.data()), 0); + ICHECK_NE(std::strcmp(s.c_str(), mismatch.data()), 0); } TEST(String, hash) { @@ -575,8 +575,8 @@ TEST(String, hash) { map[k1] = v1; map[k2] = v2; - CHECK_EQ(map[k1], v1); - CHECK_EQ(map[k2], v2); + ICHECK_EQ(map[k1], v1); + ICHECK_EQ(map[k2], v2); } TEST(String, Cast) { @@ -597,11 +597,11 @@ TEST(String, Concat) { String res4 = s1 + "world"; String res5 = "world" + s1; - CHECK_EQ(res1.compare("helloworld"), 0); - CHECK_EQ(res2.compare("helloworld"), 0); - CHECK_EQ(res3.compare("worldhello"), 0); - CHECK_EQ(res4.compare("helloworld"), 0); - CHECK_EQ(res5.compare("worldhello"), 0); + ICHECK_EQ(res1.compare("helloworld"), 0); + ICHECK_EQ(res2.compare("helloworld"), 0); + ICHECK_EQ(res3.compare("worldhello"), 0); + ICHECK_EQ(res4.compare("helloworld"), 0); + ICHECK_EQ(res5.compare("worldhello"), 0); } TEST(Optional, Composition) { @@ -609,71 +609,71 @@ TEST(Optional, Composition) { Optional opt1 = String("xyz"); Optional opt2 = String("xyz1"); // operator bool - CHECK(!opt0); - CHECK(opt1); + ICHECK(!opt0); + ICHECK(opt1); // comparison op - CHECK(opt0 != "xyz"); - CHECK(opt1 == "xyz"); - CHECK(opt1 != nullptr); - CHECK(opt0 == nullptr); - CHECK(opt0.value_or("abc") == "abc"); - CHECK(opt1.value_or("abc") == "xyz"); - CHECK(opt0 != opt1); - CHECK(opt1 == Optional(String("xyz"))); - CHECK(opt0 == Optional(nullptr)); + ICHECK(opt0 != "xyz"); 
+ ICHECK(opt1 == "xyz"); + ICHECK(opt1 != nullptr); + ICHECK(opt0 == nullptr); + ICHECK(opt0.value_or("abc") == "abc"); + ICHECK(opt1.value_or("abc") == "xyz"); + ICHECK(opt0 != opt1); + ICHECK(opt1 == Optional(String("xyz"))); + ICHECK(opt0 == Optional(nullptr)); opt0 = opt1; - CHECK(opt0 == opt1); - CHECK(opt0.value().same_as(opt1.value())); + ICHECK(opt0 == opt1); + ICHECK(opt0.value().same_as(opt1.value())); opt0 = std::move(opt2); - CHECK(opt0 != opt2); + ICHECK(opt0 != opt2); } TEST(Optional, IntCmp) { Integer val(CallingConv::kDefault); Optional opt = Integer(0); - CHECK(0 == static_cast(CallingConv::kDefault)); - CHECK(val == CallingConv::kDefault); - CHECK(opt == CallingConv::kDefault); + ICHECK(0 == static_cast(CallingConv::kDefault)); + ICHECK(val == CallingConv::kDefault); + ICHECK(opt == CallingConv::kDefault); // check we can handle implicit 0 to nullptr conversion. Optional opt1(nullptr); - CHECK(opt1 != 0); - CHECK(opt1 != false); - CHECK(!(opt1 == 0)); + ICHECK(opt1 != 0); + ICHECK(opt1 != false); + ICHECK(!(opt1 == 0)); } TEST(Optional, PackedCall) { auto tf = [](Optional s, bool isnull) { if (isnull) { - CHECK(s == nullptr); + ICHECK(s == nullptr); } else { - CHECK(s != nullptr); + ICHECK(s != nullptr); } return s; }; auto func = TypedPackedFunc(Optional, bool)>(tf); - CHECK(func(String("xyz"), false) == "xyz"); - CHECK(func(Optional(nullptr), true) == nullptr); + ICHECK(func(String("xyz"), false) == "xyz"); + ICHECK(func(Optional(nullptr), true) == nullptr); auto pf = [](TVMArgs args, TVMRetValue* rv) { Optional s = args[0]; bool isnull = args[1]; if (isnull) { - CHECK(s == nullptr); + ICHECK(s == nullptr); } else { - CHECK(s != nullptr); + ICHECK(s != nullptr); } *rv = s; }; auto packedfunc = PackedFunc(pf); - CHECK(packedfunc("xyz", false).operator String() == "xyz"); - CHECK(packedfunc("xyz", false).operator Optional() == "xyz"); - CHECK(packedfunc(nullptr, true).operator Optional() == nullptr); + ICHECK(packedfunc("xyz", false).operator String() == "xyz"); + ICHECK(packedfunc("xyz", false).operator Optional() == "xyz"); + ICHECK(packedfunc(nullptr, true).operator Optional() == nullptr); // test FFI convention. 
auto test_ffi = PackedFunc([](TVMArgs args, TVMRetValue* rv) { int tcode = args[1]; - CHECK_EQ(args[0].type_code(), tcode); + ICHECK_EQ(args[0].type_code(), tcode); }); String s = "xyz"; auto nd = NDArray::Empty({0, 1}, DataType::Float(32), DLContext{kDLCPU, 0}); diff --git a/tests/cpp/expr_test.cc b/tests/cpp/expr_test.cc index a5d47dd4d989..99ff26dc0b58 100644 --- a/tests/cpp/expr_test.cc +++ b/tests/cpp/expr_test.cc @@ -30,8 +30,8 @@ TEST(Expr, Basic) { PrimExpr zz = Downcast(tmp); std::ostringstream os; os << z; - CHECK(zz.same_as(z)); - CHECK(os.str() == "max(((x + 1) + 2), 100)"); + ICHECK(zz.same_as(z)); + ICHECK(os.str() == "max(((x + 1) + 2), 100)"); } TEST(ExprNodeRef, Basic) { @@ -40,7 +40,7 @@ TEST(ExprNodeRef, Basic) { Var x("x"); PrimExpr z = max(x + 1 + 2, 100); const tir::MaxNode* op = z.as(); - CHECK(GetRef(op).same_as(z)); + ICHECK(GetRef(op).same_as(z)); } int main(int argc, char** argv) { diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc index de06a0e7189f..683caaa7c5de 100644 --- a/tests/cpp/ir_functor_test.cc +++ b/tests/cpp/ir_functor_test.cc @@ -35,8 +35,8 @@ TEST(IRF, Basic) { NodeFunctor f; f.set_dispatch([](const ObjectRef& n, int b) { return b; }); f.set_dispatch([](const ObjectRef& n, int b) { return b + 2; }); - CHECK_EQ(f(x, 2), 2); - CHECK_EQ(f(z, 2), 4); + ICHECK_EQ(f(x, 2), 2); + ICHECK_EQ(f(z, 2), 4); } TEST(IRF, CountVar) { @@ -49,7 +49,7 @@ TEST(IRF, CountVar) { tir::PostOrderVisit(z, [&n_var](const ObjectRef& n) { if (n.as()) ++n_var; }); - CHECK_EQ(n_var, 2); + ICHECK_EQ(n_var, 2); } TEST(IRF, ExprTransform) { @@ -67,8 +67,8 @@ TEST(IRF, ExprTransform) { } }; MyExprFunctor f; - CHECK_EQ(f(x, 2), 2); - CHECK_EQ(f(z, 2), 3); + ICHECK_EQ(f(x, 2), 2); + ICHECK_EQ(f(z, 2), 3); try { f(z - 1, 2); LOG(FATAL) << "should fail"; @@ -97,7 +97,7 @@ TEST(IRF, ExprVisit) { }; MyVisitor v; v.VisitStmt(Evaluate(z)); - CHECK_EQ(v.count, 1); + ICHECK_EQ(v.count, 1); } TEST(IRF, StmtVisitor) { @@ -118,7 +118,7 @@ TEST(IRF, StmtVisitor) { return Allocate(buffer, DataType::Float(32), {z, z}, const_true(), body); }; v(fmaketest()); - CHECK_EQ(v.count, 3); + ICHECK_EQ(v.count, 3); } TEST(IRF, StmtMutator) { @@ -159,14 +159,14 @@ TEST(IRF, StmtMutator) { Array arr{std::move(body), body2, body2}; auto* arrptr = arr.get(); arr.MutateByApply([&](Stmt s) { return v(std::move(s)); }); - CHECK(arr.get() == arrptr); + ICHECK(arr.get() == arrptr); // inplace update body - CHECK(arr[0].as()->extents[1].same_as(x)); - CHECK(arr[0].as()->extents.get() == extentptr); + ICHECK(arr[0].as()->extents[1].same_as(x)); + ICHECK(arr[0].as()->extents.get() == extentptr); // copy because there is additional refs - CHECK(!arr[0].as()->body.same_as(bref)); - CHECK(arr[0].as()->body.as()->value.same_as(x)); - CHECK(bref.as()->value.as()); + ICHECK(!arr[0].as()->body.same_as(bref)); + ICHECK(arr[0].as()->body.as()->value.same_as(x)); + ICHECK(bref.as()->value.as()); } { Array arr{fmakealloc()}; @@ -174,29 +174,29 @@ TEST(IRF, StmtMutator) { Array arr2 = arr; auto* arrptr = arr.get(); arr.MutateByApply([&](Stmt s) { return v(std::move(s)); }); - CHECK(arr.get() != arrptr); - CHECK(arr[0].as()->extents[1].same_as(x)); - CHECK(!arr2[0].as()->extents[1].same_as(x)); + ICHECK(arr.get() != arrptr); + ICHECK(arr[0].as()->extents[1].same_as(x)); + ICHECK(!arr2[0].as()->extents[1].same_as(x)); // mutate but no content change. 
arr2 = arr; arr.MutateByApply([&](Stmt s) { return v(std::move(s)); }); - CHECK(arr2.get() == arr.get()); + ICHECK(arr2.get() == arr.get()); } { Array arr{fmakeif()}; arr.MutateByApply([&](Stmt s) { return v(std::move(s)); }); - CHECK(arr[0].as()->else_case.as()->value.same_as(x)); + ICHECK(arr[0].as()->else_case.as()->value.same_as(x)); // mutate but no content change. auto arr2 = arr; arr.MutateByApply([&](Stmt s) { return v(std::move(s)); }); - CHECK(arr2.get() == arr.get()); + ICHECK(arr2.get() == arr.get()); } { auto body = Evaluate(Call(DataType::Int(32), builtin::call_extern(), {StringImm("xyz"), x + 1})); auto res = v(std::move(body)); - CHECK(res.as()->value.as()->args[1].same_as(x)); + ICHECK(res.as()->value.as()->args[1].same_as(x)); } { Stmt body = fmakealloc(); @@ -209,9 +209,9 @@ TEST(IRF, StmtMutator) { body = SeqStmt({body, body2}); body = v(std::move(body)); // the seq get flattened - CHECK(body.as()->size() == 3); - CHECK(body.as()->seq[0].as()->extents.get() == extentptr); - CHECK(body.as()->seq[1].get() == ref2); + ICHECK(body.as()->size() == 3); + ICHECK(body.as()->seq[0].as()->extents.get() == extentptr); + ICHECK(body.as()->seq[1].get() == ref2); } { @@ -225,7 +225,7 @@ TEST(IRF, StmtMutator) { body = SeqStmt({body, body2}); body = v(std::move(body)); // the seq get flattened - CHECK(body.as()->seq[0].as()->extents.get() != extentptr); + ICHECK(body.as()->seq[0].as()->extents.get() != extentptr); } } diff --git a/tests/cpp/object_protocol_test.cc b/tests/cpp/object_protocol_test.cc index 0df802497434..aaf9ee4af271 100644 --- a/tests/cpp/object_protocol_test.cc +++ b/tests/cpp/object_protocol_test.cc @@ -72,28 +72,28 @@ TEST(ObjectHierachy, Basic) { using namespace tvm::test; ObjectRef refA(make_object()); - CHECK_EQ(refA->type_index(), ObjA::RuntimeTypeIndex()); - CHECK(refA.as() != nullptr); - CHECK(refA.as() != nullptr); - CHECK(refA.as() != nullptr); - CHECK(refA.as() == nullptr); - CHECK(refA.as() == nullptr); + ICHECK_EQ(refA->type_index(), ObjA::RuntimeTypeIndex()); + ICHECK(refA.as() != nullptr); + ICHECK(refA.as() != nullptr); + ICHECK(refA.as() != nullptr); + ICHECK(refA.as() == nullptr); + ICHECK(refA.as() == nullptr); ObjectRef refAA(make_object()); - CHECK_EQ(refAA->type_index(), ObjAA::RuntimeTypeIndex()); - CHECK(refAA.as() != nullptr); - CHECK(refAA.as() != nullptr); - CHECK(refAA.as() != nullptr); - CHECK(refAA.as() != nullptr); - CHECK(refAA.as() == nullptr); + ICHECK_EQ(refAA->type_index(), ObjAA::RuntimeTypeIndex()); + ICHECK(refAA.as() != nullptr); + ICHECK(refAA.as() != nullptr); + ICHECK(refAA.as() != nullptr); + ICHECK(refAA.as() != nullptr); + ICHECK(refAA.as() == nullptr); ObjectRef refB(make_object()); - CHECK_EQ(refB->type_index(), ObjB::RuntimeTypeIndex()); - CHECK(refB.as() != nullptr); - CHECK(refB.as() != nullptr); - CHECK(refB.as() == nullptr); - CHECK(refB.as() == nullptr); - CHECK(refB.as() != nullptr); + ICHECK_EQ(refB->type_index(), ObjB::RuntimeTypeIndex()); + ICHECK(refB.as() != nullptr); + ICHECK(refB.as() != nullptr); + ICHECK(refB.as() == nullptr); + ICHECK(refB.as() == nullptr); + ICHECK(refB.as() != nullptr); } int main(int argc, char** argv) { diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index 523df9891332..53a3f40388cb 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -34,16 +34,16 @@ TEST(PackedFunc, Basic) { DLTensor a; Var v = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { - CHECK(args.num_args == 3); - CHECK(args.values[0].v_float64 == 1.0); - 
CHECK(args.type_codes[0] == kDLFloat); - CHECK(args.values[1].v_handle == &a); - CHECK(args.type_codes[1] == kTVMDLTensorHandle); - CHECK(args.values[2].v_handle == &x); - CHECK(args.type_codes[2] == kTVMOpaqueHandle); + ICHECK(args.num_args == 3); + ICHECK(args.values[0].v_float64 == 1.0); + ICHECK(args.type_codes[0] == kDLFloat); + ICHECK(args.values[1].v_handle == &a); + ICHECK(args.type_codes[1] == kTVMDLTensorHandle); + ICHECK(args.values[2].v_handle == &x); + ICHECK(args.type_codes[2] == kTVMOpaqueHandle); *rv = Var("a"); })(1.0, &a, handle); - CHECK(v->name_hint == "a"); + ICHECK(v->name_hint == "a"); } TEST(PackedFunc, Node) { @@ -52,13 +52,13 @@ TEST(PackedFunc, Node) { using namespace tvm::runtime; Var x; Var t = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { - CHECK(args.num_args == 1); - CHECK(args[0].IsObjectRef()); + ICHECK(args.num_args == 1); + ICHECK(args[0].IsObjectRef()); Var b = args[0]; - CHECK(x.same_as(b)); + ICHECK(x.same_as(b)); *rv = b; })(x); - CHECK(t.same_as(x)); + ICHECK(t.same_as(x)); } TEST(PackedFunc, NDArray) { @@ -66,38 +66,38 @@ TEST(PackedFunc, NDArray) { using namespace tvm::runtime; auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); reinterpret_cast(x->data)[0] = 10.0f; - CHECK(x.use_count() == 1); + ICHECK(x.use_count() == 1); PackedFunc forward([&](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; }); NDArray ret = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { NDArray y = args[0]; DLTensor* ptr = args[0]; - CHECK(ptr == x.operator->()); - CHECK(x.same_as(y)); - CHECK(x.use_count() == 2); + ICHECK(ptr == x.operator->()); + ICHECK(x.same_as(y)); + ICHECK(x.use_count() == 2); *rv = forward(y); })(x); - CHECK(ret.use_count() == 2); - CHECK(ret.same_as(x)); + ICHECK(ret.use_count() == 2); + ICHECK(ret.same_as(x)); } TEST(PackedFunc, str) { using namespace tvm; using namespace tvm::runtime; PackedFunc([&](TVMArgs args, TVMRetValue* rv) { - CHECK(args.num_args == 1); + ICHECK(args.num_args == 1); std::string x = args[0]; - CHECK(x == "hello"); + ICHECK(x == "hello"); String y = args[0]; - CHECK(y == "hello"); + ICHECK(y == "hello"); *rv = x; })("hello"); PackedFunc([&](TVMArgs args, TVMRetValue* rv) { - CHECK(args.num_args == 1); + ICHECK(args.num_args == 1); runtime::String s = args[0]; - CHECK(s == "hello"); + ICHECK(s == "hello"); })(runtime::String("hello")); } @@ -111,13 +111,13 @@ TEST(PackedFunc, func) { // TVMArgValue -> Arguments as function *rv = f(args[1]).operator int(); })(addone, 1); - CHECK_EQ(r0, 2); + ICHECK_EQ(r0, 2); int r1 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { // TVMArgValue -> TVMRetValue *rv = args[1]; })(2, 100); - CHECK_EQ(r1, 100); + ICHECK_EQ(r1, 100); int r2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { // re-assignment @@ -125,7 +125,7 @@ TEST(PackedFunc, func) { // TVMRetValue -> Function argument *rv = addone(args[0].operator PackedFunc()(args[1], 1)); })(addone, 100); - CHECK_EQ(r2, 102); + ICHECK_EQ(r2, 102); } TEST(PackedFunc, Expr) { @@ -141,7 +141,7 @@ TEST(PackedFunc, Expr) { // TVMArgValue -> Arguments as function *rv = f(args[1]).operator int(); })(addone, 1); - CHECK_EQ(r0, 2); + ICHECK_EQ(r0, 2); } TEST(PackedFunc, Type) { @@ -152,9 +152,9 @@ TEST(PackedFunc, Type) { *rv = x; }); auto get_type2 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; }); - CHECK(get_type("int32").operator DataType() == DataType::Int(32)); - CHECK(get_type("float").operator DataType() == DataType::Float(32)); - CHECK(get_type2("float32x2").operator DataType() == DataType::Float(32, 
2)); + ICHECK(get_type("int32").operator DataType() == DataType::Int(32)); + ICHECK(get_type("float").operator DataType() == DataType::Float(32)); + ICHECK(get_type2("float32x2").operator DataType() == DataType::Float(32, 2)); } TEST(TypedPackedFunc, HighOrder) { @@ -170,12 +170,12 @@ TEST(TypedPackedFunc, HighOrder) { return x; }; auto add = [](int x, int y) { return x + y; }; - CHECK_EQ(ftyped(Int2Func(add), 1)(2), 3); + ICHECK_EQ(ftyped(Int2Func(add), 1)(2), 3); PackedFunc f = ftyped(Int2Func(add), 1); - CHECK_EQ(f(3).operator int(), 4); + ICHECK_EQ(f(3).operator int(), 4); // call the type erased version. Int1Func f1 = ftyped.packed()(Int2Func(add), 1); - CHECK_EQ(f1(3), 4); + ICHECK_EQ(f1(3), 4); } TEST(TypedPackedFunc, Deduce) { @@ -202,54 +202,54 @@ TEST(PackedFunc, ObjectConversion) { auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); // assign null rv = ObjectRef(); - CHECK_EQ(rv.type_code(), kTVMNullptr); + ICHECK_EQ(rv.type_code(), kTVMNullptr); // Can assign NDArray to ret type rv = x; - CHECK_EQ(rv.type_code(), kTVMNDArrayHandle); + ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle); // Even if we assign base type it still shows as NDArray rv = ObjectRef(x); - CHECK_EQ(rv.type_code(), kTVMNDArrayHandle); + ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle); // Check convert back - CHECK(rv.operator NDArray().same_as(x)); - CHECK(rv.operator ObjectRef().same_as(x)); - CHECK(!rv.IsObjectRef()); + ICHECK(rv.operator NDArray().same_as(x)); + ICHECK(rv.operator ObjectRef().same_as(x)); + ICHECK(!rv.IsObjectRef()); auto pf1 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args[0].type_code(), kTVMNDArrayHandle); - CHECK(args[0].operator NDArray().same_as(x)); - CHECK(args[0].operator ObjectRef().same_as(x)); - CHECK(args[1].operator ObjectRef().get() == nullptr); - CHECK(args[1].operator NDArray().get() == nullptr); - CHECK(args[1].operator Module().get() == nullptr); - CHECK(args[1].operator Array().get() == nullptr); - CHECK(!args[0].IsObjectRef()); + ICHECK_EQ(args[0].type_code(), kTVMNDArrayHandle); + ICHECK(args[0].operator NDArray().same_as(x)); + ICHECK(args[0].operator ObjectRef().same_as(x)); + ICHECK(args[1].operator ObjectRef().get() == nullptr); + ICHECK(args[1].operator NDArray().get() == nullptr); + ICHECK(args[1].operator Module().get() == nullptr); + ICHECK(args[1].operator Array().get() == nullptr); + ICHECK(!args[0].IsObjectRef()); }); pf1(x, ObjectRef()); pf1(ObjectRef(x), NDArray()); // testcases for modules auto* pf = tvm::runtime::Registry::Get("runtime.SourceModuleCreate"); - CHECK(pf != nullptr); + ICHECK(pf != nullptr); Module m = (*pf)("", "xyz"); rv = m; - CHECK_EQ(rv.type_code(), kTVMModuleHandle); + ICHECK_EQ(rv.type_code(), kTVMModuleHandle); // Even if we assign base type it still shows as NDArray rv = ObjectRef(m); - CHECK_EQ(rv.type_code(), kTVMModuleHandle); + ICHECK_EQ(rv.type_code(), kTVMModuleHandle); // Check convert back - CHECK(rv.operator Module().same_as(m)); - CHECK(rv.operator ObjectRef().same_as(m)); - CHECK(!rv.IsObjectRef()); + ICHECK(rv.operator Module().same_as(m)); + ICHECK(rv.operator ObjectRef().same_as(m)); + ICHECK(!rv.IsObjectRef()); auto pf2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { - CHECK_EQ(args[0].type_code(), kTVMModuleHandle); - CHECK(args[0].operator Module().same_as(m)); - CHECK(args[0].operator ObjectRef().same_as(m)); - CHECK(args[1].operator ObjectRef().get() == nullptr); - CHECK(args[1].operator NDArray().get() == nullptr); - CHECK(args[1].operator Module().get() == nullptr); - 
CHECK(!args[0].IsObjectRef()); + ICHECK_EQ(args[0].type_code(), kTVMModuleHandle); + ICHECK(args[0].operator Module().same_as(m)); + ICHECK(args[0].operator ObjectRef().same_as(m)); + ICHECK(args[1].operator ObjectRef().get() == nullptr); + ICHECK(args[1].operator NDArray().get() == nullptr); + ICHECK(args[1].operator Module().get() == nullptr); + ICHECK(!args[0].IsObjectRef()); }); pf2(m, ObjectRef()); pf2(ObjectRef(m), Module()); @@ -261,7 +261,7 @@ TEST(TypedPackedFunc, RValue) { { auto inspect = [](TVMArgs args, TVMRetValue* rv) { for (int i = 0; i < args.size(); ++i) { - CHECK_EQ(args[0].type_code(), kTVMObjectRValueRefArg); + ICHECK_EQ(args[0].type_code(), kTVMObjectRValueRefArg); } }; PackedFunc finspect(inspect); @@ -270,37 +270,37 @@ TEST(TypedPackedFunc, RValue) { { auto f = [](tir::Var x, bool move) { if (move) { - CHECK(x.unique()); + ICHECK(x.unique()); } else { - CHECK(!x.unique()); + ICHECK(!x.unique()); } - CHECK(x->name_hint == "x"); + ICHECK(x->name_hint == "x"); return x; }; TypedPackedFunc tf(f); tir::Var var("x"); - CHECK(var.unique()); + ICHECK(var.unique()); tf(var, false); // move the result to the function. tir::Var ret = tf(std::move(var), true); - CHECK(!var.defined()); + ICHECK(!var.defined()); } { // pass child class. auto f = [](PrimExpr x, bool move) { if (move) { - CHECK(x.unique()); + ICHECK(x.unique()); } else { - CHECK(!x.unique()); + ICHECK(!x.unique()); } return x; }; TypedPackedFunc tf(f); tir::Var var("x"); - CHECK(var.unique()); + ICHECK(var.unique()); tf(var, false); tf(std::move(var), true); // auto conversion. diff --git a/tests/cpp/parallel_for_test.cc b/tests/cpp/parallel_for_test.cc index 82e95f9ab46e..bf5fe94b83ff 100644 --- a/tests/cpp/parallel_for_test.cc +++ b/tests/cpp/parallel_for_test.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -34,7 +35,7 @@ TEST(ParallelFor, Basic) { } parallel_for(0, 10, [&b](int i) { b[i] = i; }); for (int i = 0; i < 10; i++) { - CHECK_EQ(a[i], b[i]); + ICHECK_EQ(a[i], b[i]); } // Check for a large size of parallel @@ -43,7 +44,7 @@ TEST(ParallelFor, Basic) { } parallel_for(0, 1000, [&b](int i) { b[i] = i; }); for (int i = 0; i < 1000; i++) { - CHECK_EQ(a[i], b[i]); + ICHECK_EQ(a[i], b[i]); } // Check for step != 1 @@ -53,7 +54,7 @@ TEST(ParallelFor, Basic) { parallel_for( 0, 1000, [&b](int i) { b[i] *= 2; }, 2); for (int i = 0; i < 1000; i++) { - CHECK_EQ(a[i], b[i]); + ICHECK_EQ(a[i], b[i]); } } @@ -75,7 +76,7 @@ TEST(ParallelFor, NestedWithNormalForLoop) { }); for (int i = 0; i < 500; i++) { for (int j = 0; j < 500; j++) { - CHECK_EQ(a[i][j], b[i][j]); + ICHECK_EQ(a[i][j], b[i][j]); } } @@ -84,7 +85,7 @@ TEST(ParallelFor, NestedWithNormalForLoop) { } for (int i = 0; i < 500; i++) { for (int j = 0; j < 500; j++) { - CHECK_EQ(a[i][j], c[i][j]); + ICHECK_EQ(a[i][j], c[i][j]); } } } @@ -103,7 +104,7 @@ TEST(Parallelfor, NestedWithParallelFor) { } catch (const std::exception& e) { exception = true; } - CHECK(exception); + ICHECK(exception); } TEST(ParallelFor, Exception) { @@ -115,7 +116,7 @@ TEST(ParallelFor, Exception) { } catch (const std::exception& e) { exception = true; } - CHECK(exception); + ICHECK(exception); } int main(int argc, char** argv) { diff --git a/tests/cpp/pattern_match_test.cc b/tests/cpp/pattern_match_test.cc index 5063509e4e35..dfe09406ba52 100644 --- a/tests/cpp/pattern_match_test.cc +++ b/tests/cpp/pattern_match_test.cc @@ -33,91 +33,91 @@ TEST(Pattern, Basic) { // arithmetics auto r = 1 + (y + 1); - CHECK(!(px + (px + px)).Match(r)); - CHECK(!(px + (py + 
py)).Match(r)); - CHECK((px + (py + pz)).Match(r)); + ICHECK(!(px + (px + px)).Match(r)); + ICHECK(!(px + (py + py)).Match(r)); + ICHECK((px + (py + pz)).Match(r)); auto pattern = px + (py + pz); - CHECK(pattern.Match(r)); + ICHECK(pattern.Match(r)); { - CHECK((px + (py + px)).Match(r)); + ICHECK((px + (py + px)).Match(r)); auto rr = (px + py).Eval(); - CHECK(tir::ExprDeepEqual()(rr, 1 + y)); - CHECK(tir::ExprDeepEqual()(px.Eval() + py.Eval(), 1 + y)); + ICHECK(tir::ExprDeepEqual()(rr, 1 + y)); + ICHECK(tir::ExprDeepEqual()(px.Eval() + py.Eval(), 1 + y)); } { - CHECK((px + max(py, px)).Match((x + 1) + max(y, (x + 1)))); - CHECK(tir::ExprDeepEqual()(px.Eval(), x + 1)); + ICHECK((px + max(py, px)).Match((x + 1) + max(y, (x + 1)))); + ICHECK(tir::ExprDeepEqual()(px.Eval(), x + 1)); } - CHECK(!(px + min(py, px)).Match((x + 1) + max(y, (x + 1)))); + ICHECK(!(px + min(py, px)).Match((x + 1) + max(y, (x + 1)))); - CHECK((px + min(py, px)).Match(z + min(y, z))); - CHECK((px + truncdiv(py, px * py)).Match(x + truncdiv(2, x * 2))); - CHECK((px - truncmod(py, px * pz)).Match(x - truncmod(2, x * 2))); - CHECK((px - floormod(py, px * PConst(2))).Match(x - floormod(2, x * 2))); + ICHECK((px + min(py, px)).Match(z + min(y, z))); + ICHECK((px + truncdiv(py, px * py)).Match(x + truncdiv(2, x * 2))); + ICHECK((px - truncmod(py, px * pz)).Match(x - truncmod(2, x * 2))); + ICHECK((px - floormod(py, px * PConst(2))).Match(x - floormod(2, x * 2))); // logicals - CHECK((px == pz).Match(x == 1)); - CHECK((px != pz).Match(x != 1)); - CHECK((px > py).Match(x > y)); - CHECK((px < py).Match(x < y)); - CHECK((px <= py).Match(x <= y)); - CHECK((px >= py).Match(x >= y)); - CHECK((px >= py && px < pz).Match(x >= y && x < z)); - CHECK((!(px > py || px != py)).Match(!(x > y || x != y))); + ICHECK((px == pz).Match(x == 1)); + ICHECK((px != pz).Match(x != 1)); + ICHECK((px > py).Match(x > y)); + ICHECK((px < py).Match(x < y)); + ICHECK((px <= py).Match(x <= y)); + ICHECK((px >= py).Match(x >= y)); + ICHECK((px >= py && px < pz).Match(x >= y && x < z)); + ICHECK((!(px > py || px != py)).Match(!(x > y || x != y))); { - CHECK(select(px >= pz, py, py + pz).Match(tir::Select((x + 1) >= 1, y, y + 1))); - CHECK(tir::ExprDeepEqual()(px.Eval(), x + 1)); + ICHECK(select(px >= pz, py, py + pz).Match(tir::Select((x + 1) >= 1, y, y + 1))); + ICHECK(tir::ExprDeepEqual()(px.Eval(), x + 1)); } // bit intrinsics { - CHECK((px >> pz).Match(x >> 1)); - CHECK(is_const_int(pz.Eval(), 1)); + ICHECK((px >> pz).Match(x >> 1)); + ICHECK(is_const_int(pz.Eval(), 1)); } - CHECK(!(px >> pz).Match(x << 1)); - CHECK((px << pz).Match(x << 1)); - CHECK((px & pz).Match(x & 1)); - CHECK((px | pz).Match(x | 1)); - CHECK((px ^ pz).Match(x ^ 1)); - CHECK((px - (~(py | (px * pz)))).Match(x - (~(2 | (x * 2))))); + ICHECK(!(px >> pz).Match(x << 1)); + ICHECK((px << pz).Match(x << 1)); + ICHECK((px & pz).Match(x & 1)); + ICHECK((px | pz).Match(x | 1)); + ICHECK((px ^ pz).Match(x ^ 1)); + ICHECK((px - (~(py | (px * pz)))).Match(x - (~(2 | (x * 2))))); // select { - CHECK(select(px > pz, py, py + pz).Match(tir::Select(x > 1, y, y + 1))); - CHECK(is_const_int(pz.Eval(), 1)); + ICHECK(select(px > pz, py, py + pz).Match(tir::Select(x > 1, y, y + 1))); + ICHECK(is_const_int(pz.Eval(), 1)); } - CHECK(!select(px > pz, py, py + pz).Match(tir::Select(x > 2, y, y + 1))); - CHECK(!select(px > pz, py, py).Match(tir::Select(x > 2, y, y + 1))); + ICHECK(!select(px > pz, py, py + pz).Match(tir::Select(x > 2, y, y + 1))); + ICHECK(!select(px > pz, py, py).Match(tir::Select(x > 2, y, 
y + 1))); { - CHECK(select(px, py, pz).Match(tir::Select(x > 2, y, y + 1))); - CHECK(tir::ExprDeepEqual()(pz.Eval(), y + 1)); + ICHECK(select(px, py, pz).Match(tir::Select(x > 2, y, y + 1))); + ICHECK(tir::ExprDeepEqual()(pz.Eval(), y + 1)); } // if_then_else { - CHECK(if_then_else(px > pz, py, py + pz).Match(if_then_else(x > 1, y, y + 1))); - CHECK(is_const_int(pz.Eval(), 1)); + ICHECK(if_then_else(px > pz, py, py + pz).Match(if_then_else(x > 1, y, y + 1))); + ICHECK(is_const_int(pz.Eval(), 1)); } // cast pattern { - CHECK(!cast(PConst(DataType::Int(32)), px).Match(tir::Cast(DataType::Float(64), x))); - CHECK(cast(pt, px).Match(tir::Cast(DataType::Float(64), x))); - CHECK(pt.Eval() == DataType::Float(64)); + ICHECK(!cast(PConst(DataType::Int(32)), px).Match(tir::Cast(DataType::Float(64), x))); + ICHECK(cast(pt, px).Match(tir::Cast(DataType::Float(64), x))); + ICHECK(pt.Eval() == DataType::Float(64)); auto zz = cast(pt, px).Eval(); - CHECK((cast(pt, px) - cast(pt, py)) - .Match(tir::Cast(DataType::Float(64), x) - tir::Cast(DataType::Int(64), x))); + ICHECK((cast(pt, px) - cast(pt, py)) + .Match(tir::Cast(DataType::Float(64), x) - tir::Cast(DataType::Int(64), x))); auto expr = tir::Cast(DataType::Int(32), tir::Cast(DataType::Float(64), x)); - CHECK(!(cast(pt, cast(pt, px))).Match(expr)); + ICHECK(!(cast(pt, cast(pt, px))).Match(expr)); } // ramp pattern { - CHECK(ramp(px, PConst(1), planes).Match(tir::Ramp(x, 1, 10))); - CHECK(planes.Eval() == 10); - CHECK(!ramp(px, PConst(1), planes).Match(tir::Ramp(x, 2, 10))); + ICHECK(ramp(px, PConst(1), planes).Match(tir::Ramp(x, 1, 10))); + ICHECK(planes.Eval() == 10); + ICHECK(!ramp(px, PConst(1), planes).Match(tir::Ramp(x, 2, 10))); } // broadcast pattern { - CHECK(broadcast(px, planes).Match(tir::Broadcast(x, 10))); - CHECK(planes.Eval() == 10); - CHECK(broadcast(px * py, planes).Match(tir::Broadcast(x * 10, 10))); + ICHECK(broadcast(px, planes).Match(tir::Broadcast(x, 10))); + ICHECK(planes.Eval() == 10); + ICHECK(broadcast(px * py, planes).Match(tir::Broadcast(x * 10, 10))); } } @@ -129,14 +129,14 @@ TEST(Pattern, IntImm) { { // We can match integer and Var, both of which are // special case container of Expr - CHECK((v * c).Match(tx * 3)); - CHECK_EQ(c.Eval()->value, 3); - CHECK((v * 3).Match(tx * 3)); + ICHECK((v * c).Match(tx * 3)); + ICHECK_EQ(c.Eval()->value, 3); + ICHECK((v * 3).Match(tx * 3)); } // cannot match c to ty - CHECK(!(v * c).Match(tx * ty)); + ICHECK(!(v * c).Match(tx * ty)); // cannot match tx + 1 to v - CHECK(!(v * c).Match((tx + 1) * 3)); + ICHECK(!(v * c).Match((tx + 1) * 3)); } int main(int argc, char** argv) { diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index fcab1b85edd9..3212f9079619 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -41,7 +41,7 @@ TVM_REGISTER_GLOBAL("test.strategy") const Target& target) { FTVMCompute fcompute = [](const Attrs& attrs, const Array& inputs, const Type& out_type) -> Array { - CHECK_EQ(inputs.size(), 2U); + ICHECK_EQ(inputs.size(), 2U); return {topi::add(inputs[0], inputs[1])}; }; FTVMSchedule fschedule = [](const Attrs& attrs, const Array& outs, @@ -116,14 +116,14 @@ TEST(Relay, BuildModule) { Target llvm_tgt = Target("llvm"); targets.Set(0, llvm_tgt); auto relay_mod = tvm::IRModule::FromExpr(func); - CHECK(relay_mod.defined()) << "Module must be defined"; + ICHECK(relay_mod.defined()) << "Module must be defined"; build_f(relay_mod, targets, llvm_tgt); std::string json = json_f(); 
tvm::runtime::Module mod = mod_f(); // run auto ctx = A->ctx; auto pfr = tvm::runtime::Registry::Get("tvm.graph_runtime.create"); - CHECK(mod.defined()) << "Module must be defined"; + ICHECK(mod.defined()) << "Module must be defined"; tvm::runtime::Module run_mod = (*pfr)(json, mod, (int)ctx.device_type, (int)ctx.device_id); auto set_input_f = run_mod.GetFunction("set_input_zero_copy", false); auto run_f = run_mod.GetFunction("run", false); @@ -135,7 +135,7 @@ TEST(Relay, BuildModule) { tvm::runtime::NDArray Y = get_output_f(0); auto pY = (float*)Y->data; for (int i = 0; i < 6; ++i) { - CHECK_LT(fabs(pY[i] - (i + (i + 1) + (i + 2))), 1e-4); + ICHECK_LT(fabs(pY[i] - (i + (i + 1) + (i + 2))), 1e-4); } // mutate the input a bit and run it again for (int i = 0; i < 6; ++i) { @@ -145,7 +145,7 @@ TEST(Relay, BuildModule) { tvm::runtime::NDArray Y2 = get_output_f(0); auto pY2 = (float*)Y2->data; for (int i = 0; i < 6; ++i) { - CHECK_LT(fabs(pY2[i] - (i + (i + 3) + (i + 2))), 1e-4); + ICHECK_LT(fabs(pY2[i] - (i + (i + 3) + (i + 2))), 1e-4); } // attach a different input and run it again auto C2 = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU, 0}); @@ -158,7 +158,7 @@ TEST(Relay, BuildModule) { tvm::runtime::NDArray Y3 = get_output_f(0); auto pY3 = (float*)Y3->data; for (int i = 0; i < 6; ++i) { - CHECK_LT(fabs(pY3[i] - (i + (i + 3) + (i + 4))), 1e-4); + ICHECK_LT(fabs(pY3[i] - (i + (i + 3) + (i + 4))), 1e-4); } } @@ -171,12 +171,12 @@ TEST(Relay, GetExprRefCount) { auto y = relay::Call(relu_op, {x}, tvm::Attrs(), {}); auto z = relay::Call(add_op, {y, x}, tvm::Attrs(), {}); auto ref_count = GetExprRefCount(z); - CHECK(ref_count[a.get()] == 1); - CHECK(ref_count[relu_op.get()] == 2); - CHECK(ref_count[add_op.get()] == 1); - CHECK(ref_count[x.get()] == 2); - CHECK(ref_count[y.get()] == 1); - CHECK(ref_count[z.get()] == 1); + ICHECK(ref_count[a.get()] == 1); + ICHECK(ref_count[relu_op.get()] == 2); + ICHECK(ref_count[add_op.get()] == 1); + ICHECK(ref_count[x.get()] == 2); + ICHECK(ref_count[y.get()] == 1); + ICHECK(ref_count[z.get()] == 1); } int main(int argc, char** argv) { diff --git a/tests/cpp/relay_pass_type_infer_test.cc b/tests/cpp/relay_pass_type_infer_test.cc index cb7330dfab6d..38ac906c6dac 100644 --- a/tests/cpp/relay_pass_type_infer_test.cc +++ b/tests/cpp/relay_pass_type_infer_test.cc @@ -30,7 +30,7 @@ TEST(Relay, SelfReference) { auto tensor_type = relay::TensorType({}, DataType::Bool()); auto x = relay::Var("x", relay::Type()); auto f = relay::Function(tvm::Array{x}, x, relay::Type(), {}); - CHECK(f->IsInstance()); + ICHECK(f->IsInstance()); auto y = relay::Var("y", tensor_type); auto call = relay::Call(f, Array{y}); auto fx = relay::Function(tvm::Array{y}, call, relay::Type(), {}); @@ -39,7 +39,7 @@ TEST(Relay, SelfReference) { auto type_fx = mod->Lookup("main"); auto expected = relay::FuncType(tvm::Array{tensor_type}, tensor_type, {}, {}); - CHECK(tvm::StructuralEqual()(type_fx->checked_type(), expected)); + ICHECK(tvm::StructuralEqual()(type_fx->checked_type(), expected)); } int main(int argc, char** argv) { diff --git a/tests/cpp/relay_transform_sequential_test.cc b/tests/cpp/relay_transform_sequential_test.cc index f052d66fbc5a..289574aef1e2 100644 --- a/tests/cpp/relay_transform_sequential_test.cc +++ b/tests/cpp/relay_transform_sequential_test.cc @@ -40,7 +40,7 @@ TVM_REGISTER_GLOBAL("test.seq.strategy") const Target& target) { relay::FTVMCompute fcompute = [](const Attrs& attrs, const Array& inputs, const Type& out_type) -> Array { - CHECK_EQ(inputs.size(), 
2U); + ICHECK_EQ(inputs.size(), 2U); return {topi::add(inputs[0], inputs[1])}; }; relay::FTVMSchedule fschedule = [](const Attrs& attrs, const Array& outs, @@ -99,11 +99,11 @@ TEST(Relay, Sequential) { mod = seq(mod); } - CHECK(mod.defined()); + ICHECK(mod.defined()); auto entry_func = mod->GetGlobalVar("main"); - CHECK(entry_func.defined()); + ICHECK(entry_func.defined()); relay::Function f = Downcast(mod->Lookup("main")); - CHECK(f.defined()); + ICHECK(f.defined()); // Expected function auto c1 = relay::Constant(c_data); @@ -118,7 +118,7 @@ TEST(Relay, Sequential) { auto mod1 = IRModule::FromExpr(expected_func); mod1 = relay::transform::InferType()(mod1); auto expected = mod1->Lookup("main"); - CHECK(tvm::StructuralEqual()(f, expected)); + ICHECK(tvm::StructuralEqual()(f, expected)); } int main(int argc, char** argv) { diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc index b4c53ab84520..3d528f821059 100644 --- a/tests/cpp/target_test.cc +++ b/tests/cpp/target_test.cc @@ -37,7 +37,7 @@ TEST(TargetKind, GetAttrMap) { auto map = tvm::TargetKind::GetAttrMap("Attr1"); auto target_kind = tvm::TargetKind::Get("TestTargetKind").value(); std::string result = map[target_kind]; - CHECK_EQ(result, "Value1"); + ICHECK_EQ(result, "Value1"); } TEST(TargetCreation, NestedConfig) { @@ -54,19 +54,19 @@ TEST(TargetCreation, NestedConfig) { }, }; Target target = Target(config); - CHECK_EQ(target->kind, TargetKind::Get("TestTargetKind").value()); - CHECK_EQ(target->tag, ""); - CHECK(target->keys.empty()); + ICHECK_EQ(target->kind, TargetKind::Get("TestTargetKind").value()); + ICHECK_EQ(target->tag, ""); + ICHECK(target->keys.empty()); Bool my_bool = target->GetAttr("my_bool").value(); - CHECK_EQ(my_bool.operator bool(), true); + ICHECK_EQ(my_bool.operator bool(), true); Array your_names = target->GetAttr>("your_names").value(); - CHECK_EQ(your_names.size(), 2U); - CHECK_EQ(your_names[0], "junru"); - CHECK_EQ(your_names[1], "jian"); + ICHECK_EQ(your_names.size(), 2U); + ICHECK_EQ(your_names[0], "junru"); + ICHECK_EQ(your_names[1], "jian"); Map her_maps = target->GetAttr>("her_maps").value(); - CHECK_EQ(her_maps.size(), 2U); - CHECK_EQ(her_maps["a"], 1); - CHECK_EQ(her_maps["b"], 2); + ICHECK_EQ(her_maps.size(), 2U); + ICHECK_EQ(her_maps["a"], 1); + ICHECK_EQ(her_maps["b"], 2); } TEST(TargetCreationFail, UnrecognizedConfigOption) { @@ -142,13 +142,13 @@ TEST(TargetCreation, DeduplicateKeys) { {"device", String("arm_cpu")}, }; Target target = Target(config); - CHECK_EQ(target->kind, TargetKind::Get("llvm").value()); - CHECK_EQ(target->tag, ""); - CHECK_EQ(target->keys.size(), 2U); - CHECK_EQ(target->keys[0], "cpu"); - CHECK_EQ(target->keys[1], "arm_cpu"); - CHECK_EQ(target->attrs.size(), 1U); - CHECK_EQ(target->GetAttr("device"), "arm_cpu"); + ICHECK_EQ(target->kind, TargetKind::Get("llvm").value()); + ICHECK_EQ(target->tag, ""); + ICHECK_EQ(target->keys.size(), 2U); + ICHECK_EQ(target->keys[0], "cpu"); + ICHECK_EQ(target->keys[1], "arm_cpu"); + ICHECK_EQ(target->attrs.size(), 1U); + ICHECK_EQ(target->GetAttr("device"), "arm_cpu"); } int main(int argc, char** argv) { diff --git a/tests/cpp/tir_analysis_side_effect.cc b/tests/cpp/tir_analysis_side_effect.cc index 26dedabb9304..022f2cffeda8 100644 --- a/tests/cpp/tir_analysis_side_effect.cc +++ b/tests/cpp/tir_analysis_side_effect.cc @@ -27,11 +27,11 @@ TEST(SimplePasses, SideEffect) { using namespace tvm; auto A = tir::Var("A", DataType::Handle()); auto i = tir::Var("i", DataType::Int(32)); - CHECK(tir::SideEffect(tir::Load(DataType::Float(32), 
A, i, tir::const_true(1))) ==
-      tir::CallEffectKind::kReadState);
-  CHECK(tir::SideEffect(exp(tir::Cast(DataType::Float(32), i + 1))) == tir::CallEffectKind::kPure);
-  CHECK(tir::SideEffect(tir::Call(DataType::Handle(), tir::builtin::tvm_storage_sync(), {})) ==
-      tir::CallEffectKind::kUpdateState);
+  ICHECK(tir::SideEffect(tir::Load(DataType::Float(32), A, i, tir::const_true(1))) ==
+         tir::CallEffectKind::kReadState);
+  ICHECK(tir::SideEffect(exp(tir::Cast(DataType::Float(32), i + 1))) == tir::CallEffectKind::kPure);
+  ICHECK(tir::SideEffect(tir::Call(DataType::Handle(), tir::builtin::tvm_storage_sync(), {})) ==
+         tir::CallEffectKind::kUpdateState);
 }
 
 int main(int argc, char** argv) {

From 684fa0c77d705c1fcdb3da0cfbf6bfa24ae2efde Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Mon, 26 Oct 2020 18:35:24 -0500
Subject: [PATCH 062/258] [LLVM] Add target feature string to function attributes (#6763)

---
 src/target/llvm/codegen_llvm.cc              |  5 +++++
 .../unittest/test_target_codegen_hexagon.py  | 16 ++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 2a7e4644571b..fd55f2418628 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -172,6 +172,11 @@ void CodeGenLLVM::AddFunctionInternal(const PrimFunc& f, bool ret_void) {
   }
 #endif
 
+  llvm::StringRef fs = target_machine_->getTargetFeatureString();
+  if (!fs.empty()) {
+    function_->addFnAttr("target-features", fs);
+  }
+
  if (ret_void) {
     builder_->CreateRetVoid();
   } else {
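For context, the change above copies the configured TargetMachine's feature string onto every emitted function as a "target-features" attribute, which LLVM's instruction selection consults when deciding which instructions it may emit. A minimal sketch of how to observe this from Python (illustrative only; the "-mattr=+fma" value is an arbitrary assumption, and the exact feature names depend on the chosen target):

    import re
    import tvm

    # Any LLVM target configured with a non-empty feature string will do;
    # "+fma" here is just an example feature to request and look for.
    target = tvm.target.Target("llvm -mattr=+fma")
    A = tvm.te.placeholder((64,), name="A")
    B = tvm.te.compute((64,), lambda i: A[i] * 2.0, name="B")
    s = tvm.te.create_schedule(B.op)
    mod = tvm.build(s, [A, B], target=target, name="double")
    # With the patch applied, each function's attribute group should now
    # carry the target feature string.
    assert re.search(r'"target-features"', mod.get_source("ll"))

The new Hexagon test below exercises the same path, looking for +hvx-length128b.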
diff --git a/tests/python/unittest/test_target_codegen_hexagon.py b/tests/python/unittest/test_target_codegen_hexagon.py
index d42693ef16bd..b74d487f3fa7 100644
--- a/tests/python/unittest/test_target_codegen_hexagon.py
+++ b/tests/python/unittest/test_target_codegen_hexagon.py
@@ -63,6 +63,21 @@ def check_add(offload):
     check_add(False)
 
 
+def test_llvm_target_features():
+    if not check_prereq_and_setup():
+        return
+    target = tvm.target.hexagon("v66", hvx=128)
+    # Define some trivial compute
+    A = tvm.te.placeholder((128,), dtype="uint8", name="A")
+    C = tvm.te.compute((128,), lambda i: A[i] + 1, name="C")
+    s = tvm.te.create_schedule(C.op)
+    m = tvm.build(s, [C, A], target=target, target_host=target, name="add_one")
+    llvm_ir = m.get_source("ll")
+    # Make sure we find +hvx-length128b in "attributes".
+    fs = re.findall(r"attributes.*\+hvx-length128b", llvm_ir)
+    assert fs  # Check that it's non-empty
+
+
 def test_alloc_vtcm():
     if not check_prereq_and_setup():
         return
@@ -92,4 +107,5 @@ def test_alloc_vtcm():
 
 if __name__ == "__main__":
     test_basic()
+    test_llvm_target_features()
     test_alloc_vtcm()

From bc54cd59569f6be9fda5f3422db0454f39940d68 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Mon, 26 Oct 2020 16:36:32 -0700
Subject: [PATCH 063/258] [FIX,MICROTVM] Add requires_micro decorators to
 microtvm tests (#6747)

---
 tests/python/unittest/test_micro_artifact.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_micro_artifact.py b/tests/python/unittest/test_micro_artifact.py
index 39fea16c0a02..80d34db6384f 100644
--- a/tests/python/unittest/test_micro_artifact.py
+++ b/tests/python/unittest/test_micro_artifact.py
@@ -20,9 +20,9 @@
 import json
 import os
 import shutil
+import tvm
 
 from tvm.contrib import util
-from tvm.micro import artifact
 
 FILE_LIST = ["label1", "label2", "label12", "unlabelled"]
 
@@ -49,11 +49,14 @@ def build_artifact(artifact_path, immobile=False):
         os.path.join(artifact_path, "sub_dir"), os.path.join(artifact_path, "abs_dir_symlink")
     )
 
+    from tvm.micro import artifact
+
     art = artifact.Artifact(artifact_path, TEST_LABELS, TEST_METADATA, immobile=immobile)
     return art
 
 
+@tvm.testing.requires_micro
 def test_basic_functionality():
     temp_dir = util.tempdir()
     artifact_path = temp_dir.relpath("foo")
@@ -66,7 +69,10 @@ def test_basic_functionality():
         assert art.label_abspath(label) == [os.path.join(artifact_path, p) for p in paths]
 
 
+@tvm.testing.requires_micro
 def test_archive():
+    from tvm.micro import artifact
+
     temp_dir = util.tempdir()
     art = build_artifact(temp_dir.relpath("foo"))
 
@@ -100,7 +106,10 @@ def test_archive():
         assert os.path.exists(os.path.join(unarchive_base_dir, f))
 
 
+@tvm.testing.requires_micro
 def test_metadata_only():
+    from tvm.micro import artifact
+
     temp_dir = util.tempdir()
     base_dir = temp_dir.relpath("foo")
     art = build_artifact(base_dir)

From b845dd3252f1f27e93ee4ec3d8adee75d428447f Mon Sep 17 00:00:00 2001
From: Josh Fromm
Date: Mon, 26 Oct 2020 16:37:05 -0700
Subject: [PATCH 064/258] [Relay] A set of utilities that allows a model to be
 run efficiently on tensorcores. (#6748)

---
 python/tvm/relay/analysis/__init__.py             |   3 +
 python/tvm/relay/analysis/count_layers.py         |  68 +++++++++
 python/tvm/relay/op/nn/_nn.py                     |  14 ++
 python/tvm/relay/op/strategy/cuda.py              |   9 +-
 python/tvm/relay/transform/__init__.py            |   1 +
 python/tvm/relay/transform/recast.py              | 139 ++++++++++++++++++
 python/tvm/relay/transform/transform.py           |  27 ++++
 src/runtime/graph/graph_runtime.cc                |   2 +-
 tests/python/relay/test_layer_count.py            |  34 +++++
 tests/python/relay/test_pass_convert_op_layout.py |  73 +++++++++
 tests/python/relay/test_recast.py                 | 108 ++++++++++++++
 11 files changed, 475 insertions(+), 3 deletions(-)
 create mode 100644 python/tvm/relay/analysis/count_layers.py
 create mode 100644 python/tvm/relay/transform/recast.py
 create mode 100644 tests/python/relay/test_layer_count.py
 create mode 100644 tests/python/relay/test_recast.py
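The pieces below fit together as a pipeline: count_layers reports how deep the convolution chain is, LayoutConfig (added in transform.py, which appears only in the diffstat above) lets ConvertLayout skip selected convolutions, and recast rewrites operator dtypes so the tensorcore strategies in cuda.py can match. A rough usage sketch — the signatures of LayoutConfig and recast are assumptions here, since their definitions are not shown in this hunk:

    from tvm import relay
    from tvm.relay.transform import recast

    # Given an IRModule `mod` containing conv2d layers: skip the first
    # conv2d and convert the rest to the HWNC layout that the new
    # tensorcore schedules expect (skip_layers is an assumed parameter).
    with relay.transform.LayoutConfig(skip_layers=[0]):
        mod = relay.transform.ConvertLayout({"nn.conv2d": ["HWNC", "default"]})(mod)

    # Recast convolutions to int8/int32 so the tensorcore strategy applies
    # (argument order assumed: dtype, out_dtype, then the ops to convert).
    mod = recast(mod, "int8", "int32", ops=["nn.conv2d"])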
import sparse_dense + +# Utilities +from .count_layers import count_layers diff --git a/python/tvm/relay/analysis/count_layers.py b/python/tvm/relay/analysis/count_layers.py new file mode 100644 index 000000000000..93d4f2766284 --- /dev/null +++ b/python/tvm/relay/analysis/count_layers.py @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Utilities that enable counting the number of layers in a graph.""" +import tvm +from tvm import relay +from ..expr_functor import ExprVisitor + + +class LayerCounter(ExprVisitor): + """A visitor pass that computes the deepest chain of specified ops in graph.""" + + def __init__(self, valid_ops): + self.depth_count = 0 + self.deepest_count = 0 + self.valid_ops = [relay.op.get(op) for op in valid_ops] + super().__init__() + + def visit_call(self, call): + if call.op in self.valid_ops: + self.depth_count += 1 + current_count = self.depth_count + self.deepest_count = max(self.deepest_count, current_count) + for arg in call.args: + self.visit(arg) + self.depth_count = current_count + + def count(self): + return self.deepest_count + + +def count_layers(expr, valid_ops): + """Determine the number of layers of specified ops in a graph. + This pass computes only the deepest chain of ops rather than the + total number of ops in a graph. Thus, if there are two parallel + convolutions (for example), they would be considered a single layer. + + Parameters + ---------- + expr : tvm.relay.Expr, tvm.relay.Function, or tvm.ir.IRModule. + The input expression. + + valid_ops: List[str] + A list of the operations that should be included in the count. + + Returns + ------- + layer_count : int + The number of layers of the specified operations found in the graph. + """ + if isinstance(expr, tvm.ir.IRModule): + expr = expr["main"] + count_pass = LayerCounter(valid_ops) + count_pass.visit(expr) + return count_pass.count() diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index e1aabe1e15b5..c9926647989e 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -28,6 +28,7 @@ from ..op import OpPattern from .._tensor import elemwise_shape_func from ..strategy.generic import is_depthwise_conv2d +from ...transform import LayoutConfig # relu reg.register_broadcast_schedule("nn.relu") @@ -164,6 +165,16 @@ def convert_conv2d(attrs, inputs, tinfos, desired_layouts): from tvm import relay data, weight = inputs + + # First check if there is a LayoutConfig scope, and if so, whether + # it indicates we should ignore this layer or not. + layout_config = LayoutConfig.current + if layout_config is not None: + skip_layer = layout_config.check_skip() + if skip_layer: + return relay.nn.conv2d(data, weight, **attrs) + + # Prepare new layout. 
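+    # desired_layouts arrives as (data_layout, kernel_layout); a kernel
+    # layout of "default" asks this handler to pick the canonical kernel
+    # layout for the given data layout (e.g. "HWIO" for "NHWC", and with
+    # this patch "HWOI" for "HWNC").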
new_attrs = dict(attrs) assert len(desired_layouts) == 2, "A desired layout is expected for both of nn.conv2d's inputs" desired_data_layout, desired_kernel_layout = map(str, desired_layouts) @@ -192,6 +203,9 @@ def convert_conv2d(attrs, inputs, tinfos, desired_layouts): else: new_attrs["kernel_layout"] = "HWIO" return relay.nn.conv2d(data, weight, **new_attrs) + elif desired_data_layout == "HWNC": + new_attrs["kernel_layout"] = "HWOI" + return relay.nn.conv2d(data, weight, **new_attrs) raise ValueError("Layout %s is not yet supported." % desired_data_layout) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 7031365251aa..ca44e49ce1dd 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -219,8 +219,13 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): out_channels = oc_chunk * oc_block_factor else: _, _, out_channels, _ = get_const_tuple(kernel.shape) - if topi.cuda.is_shape_tensorcore_direct_qualified( - batch=N, in_channels=in_channels, num_filter=out_channels, in_dtype=data.dtype + + tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"] + if ( + (N % 16 == 0 and in_channels % 16 == 0 and out_channels % 16 == 0) + or (N % 8 == 0 and in_channels % 16 == 0 and out_channels % 32 == 0) + or (N % 32 == 0 and in_channels % 16 == 0 and out_channels % 8 == 0) + and (data.dtype in tensorcore_dtypes and kernel.dtype in tensorcore_dtypes) ): strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore), diff --git a/python/tvm/relay/transform/__init__.py b/python/tvm/relay/transform/__init__.py index 138a36611c6f..1d0ea176b16f 100644 --- a/python/tvm/relay/transform/__init__.py +++ b/python/tvm/relay/transform/__init__.py @@ -18,4 +18,5 @@ """The Relay IR namespace containing transformations.""" # transformation passes from .transform import * +from .recast import recast from . import memory_alloc diff --git a/python/tvm/relay/transform/recast.py b/python/tvm/relay/transform/recast.py new file mode 100644 index 000000000000..05a72676a907 --- /dev/null +++ b/python/tvm/relay/transform/recast.py @@ -0,0 +1,139 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
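A note on the tensorcore qualification check added to conv2d_strategy_cuda
above: Python's `and` binds tighter than `or`, so as written the dtype test
only guards the third shape alternative. A parenthesized sketch of what the
predicate presumably intends, with a hypothetical helper name:

def tensorcore_shape_dtype_ok(N, in_channels, out_channels, data_dtype, kernel_dtype):
    # The dtype check should gate every shape alternative, not just the last.
    tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
    shape_ok = (
        (N % 16 == 0 and in_channels % 16 == 0 and out_channels % 16 == 0)
        or (N % 8 == 0 and in_channels % 16 == 0 and out_channels % 32 == 0)
        or (N % 32 == 0 and in_channels % 16 == 0 and out_channels % 8 == 0)
    )
    return shape_ok and data_dtype in tensorcore_dtypes and kernel_dtype in tensorcore_dtypes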
+"""Relay type recasting pass""" +import tvm +from tvm import relay +from tvm.ir import IRModule +from .transform import InferType +from ..analysis import count_layers +from ..expr_functor import ExprMutator, Call + + +class RecastMutator(ExprMutator): + """Cast operations to the target type.""" + + def __init__(self, dtype, out_dtype, valid_ops, valid_op_count, skip_layers): + self.dtype = dtype + self.out_dtype = out_dtype + self.depth_count = 0 + self.valid_ops = [relay.op.get(op) for op in valid_ops] + self.valid_op_count = valid_op_count + self.skip_layers = skip_layers + # Convert negative indices to positive ones. + for i, layer in enumerate(skip_layers): + if layer < 0: + skip_layers[i] = self.valid_op_count + layer + super().__init__() + + def visit_call(self, call): + # Keep track of our current depth and layer count + # so we can know whether to skip this layer or not. + current_depth = self.depth_count + current_layer = self.valid_op_count - current_depth - 1 + if call.op in self.valid_ops: + self.depth_count += 1 + # Visit current call operation + new_fn = self.visit(call.op) + # Visit current arguments + args = [] + for arg in call.args: + args.append(self.visit(arg)) + self.depth_count = current_depth + + # Downcast this op if its the correct type and not skipped. + if call.op in self.valid_ops and current_layer not in self.skip_layers: + # Recast inputs to specified type. + args = [self.visit(arg) for arg in call.args] + new_args = list() + for arg in args: + new_args.append(relay.cast(arg, dtype=self.dtype)) + + # If out_dtype is in the attributes, we need to update it. + orig_dtype = None + if "out_dtype" in call.attrs.keys(): + new_attr_dict = {} + for attr in call.attrs.keys(): + attr_value = call.attrs[attr] + if isinstance(attr_value, tvm.ir.container.Array): + attr_value = tuple(attr_value) + new_attr_dict[str(attr)] = attr_value + new_attr_dict["out_dtype"] = self.out_dtype + attr_type = str(call.attrs).split("(")[0] + new_attrs = tvm.ir.make_node(attr_type, **new_attr_dict) + if call.attrs["out_dtype"] != "": + orig_dtype = call.attrs["out_dtype"] + else: + new_attrs = call.attrs + + if orig_dtype is None: + # Perform type inference to determine the original type. + new_mod = IRModule.from_expr(call) + new_mod = InferType()(new_mod) + checked_arg = new_mod["main"].body + orig_dtype = checked_arg.checked_type.dtype + # Recast the output for compatibility with other graph operations. + return relay.cast(Call(new_fn, new_args, new_attrs), orig_dtype) + + # Otherwise return the unchanged call. + return Call(new_fn, args, call.attrs) + + +def recast(expr, dtype, out_dtype, ops=None, skip_layers=None): + """Convert the types of operations in a graph to a new value. + Note that this is primarily useful for testing performance of individual + operations at the new datatype. In a real setting, this pass will + almost certainly do a poor job converting from one datatype to another + as it just applies hard casting. For example, when recasting from float + to integer, many small values will simply be set to 0. Although this will + allow autotuning and benchmarking to produce proper timings at the new + data type, the output of the model will of course be heavily impacted. + + Parameters + --------- + expr: tvm.relay.Expr, tvm.relay.Function, or tvm.ir.IRModule + The original function that will have its type changed. + dtype: str + The target type to cast to. + out_dtype: str + The output type to cast to. 
+ ops: List[str] + A list of operations that should have their type changed, + others will be left as is. + skip_layers: List[int] + A list of integers indicating operations that should + not have their type changed, counted starting with the + first valid operation encountered. Negative indices are + allowed and indicate starting at the last layer. + Returns + ------- + output_expr : tvm.relay.Expr, tvm.relay.Function, or tvm.ir.IRModule + The graph after recasting to the specified datatype. + """ + return_mod = False + if isinstance(expr, tvm.ir.IRModule): + expr = expr["main"] + return_mod = True + if ops is None: + ops = ["nn.conv2d"] + if skip_layers is None: + skip_layers = [] + layer_depth = count_layers(expr, ops) + recast_pass = RecastMutator(dtype, out_dtype, ops, layer_depth, skip_layers) + expr = recast_pass.visit(expr) + if return_mod: + return tvm.IRModule.from_expr(expr) + return expr diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index e155f83a7c5d..060547e4c4d7 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -386,6 +386,33 @@ def AlterOpLayout(): return _ffi_api.AlterOpLayout() +class LayoutConfig(object): + """A structure for customizing the ConvertLayout pass.""" + + current = None + + def __init__(self, skip_layers=None): + self.skip_counter = 0 + self.skip_layers = skip_layers if skip_layers is not None else [] + + def check_skip(self): + skip = self.skip_counter in self.skip_layers + self.skip_counter += 1 + return skip + + def reset(self): + self.skip_counter = 0 + self.skip_layers = [] + + def __enter__(self): + self._old_manager = LayoutConfig.current + LayoutConfig.current = self + return self + + def __exit__(self, ptype, value, trace): + LayoutConfig.current = self._old_manager + + def ConvertLayout(desired_layouts): """Given a dest layout, this pass transforms the expr such that most of the ops input data layout is changed to the dest layout. In ideal situation, there are only 2 layout transforms, diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 601c68abdf08..21960d9d4b1b 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -310,7 +310,7 @@ void GraphRuntime::SetupStorage() { ICHECK_GE(storage_id, 0) << "Do not support runtime shape op"; DLDataType t = vtype[i]; size_t bits = t.bits * t.lanes; - ICHECK(bits % 8U == 0U || bits == 1U); + ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U); size_t bytes = ((bits + 7U) / 8U) * size; uint32_t sid = static_cast(storage_id); diff --git a/tests/python/relay/test_layer_count.py b/tests/python/relay/test_layer_count.py new file mode 100644 index 000000000000..f680bb2725f2 --- /dev/null +++ b/tests/python/relay/test_layer_count.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +from tvm.relay.testing import resnet +from tvm.relay.analysis import count_layers + + +def test_layer_count(): + def verify(num_layers): + # Load a resnet with a known number of layers. + mod, _ = resnet.get_workload(num_layers=num_layers) + # Count the number of conv and dense layers. + count = count_layers(mod, valid_ops=["nn.conv2d", "nn.dense"]) + assert count == num_layers + + verify(18) + verify(50) + + +if __name__ == "__main__": + test_layer_count() diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py index 40aef264a335..1fc5d39b9486 100644 --- a/tests/python/relay/test_pass_convert_op_layout.py +++ b/tests/python/relay/test_pass_convert_op_layout.py @@ -1162,6 +1162,78 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) +def test_convert_with_config(): + def before(): + x = relay.var("x", shape=(1, 56, 56, 64)) + weight = relay.var("weight", shape=(3, 3, 64, 64)) + y = relay.nn.conv2d( + x, + weight, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + y = relay.nn.relu(y) + + weight2 = relay.var("weight2", shape=(3, 3, 64, 64)) + y2 = relay.nn.conv2d( + y, + weight2, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + y2 = relay.nn.relu(y2) + + out = relay.Function([x, weight, weight2], y2) + return out + + def expected(): + x = relay.var("x", shape=(1, 56, 56, 64)) + weight = relay.var("weight", shape=(3, 3, 64, 64)) + + weight2 = relay.var("weight2", shape=(3, 3, 64, 64)) + weight2 = relay.layout_transform(weight2, "HWIO", "HWOI") + + y = relay.nn.conv2d( + x, + weight, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + y = relay.nn.relu(y) + y = relay.layout_transform(y, "NHWC", "HWNC") + + y2 = relay.nn.conv2d( + y, + weight2, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="HWNC", + kernel_layout="HWOI", + ) + y2 = relay.nn.relu(y2) + + y2 = relay.layout_transform(y2, "HWNC", "NHWC") + output = relay.Function(relay.analysis.free_vars(y2), y2) + return output + + a = before() + layout_config = relay.transform.LayoutConfig(skip_layers=[0]) + with layout_config: + a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["HWNC", "default"]})) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + + if __name__ == "__main__": test_qnn_binary_no_convert_layout() test_no_convert_layout() @@ -1185,3 +1257,4 @@ def expected(): test_default_keyword() test_different_ops_convert_layout() test_no_desired_layout() + test_convert_with_config() diff --git a/tests/python/relay/test_recast.py b/tests/python/relay/test_recast.py new file mode 100644 index 000000000000..8c5a562ddbba --- /dev/null +++ b/tests/python/relay/test_recast.py @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +from tvm import relay +from tvm.relay.transform import recast + + +def test_recast_simple(): + """Recast a single convolution operator.""" + + def before(): + x = relay.var("x", shape=[8, 8, 8, 8]) + w = relay.var("w", shape=[8, 8, 3, 3]) + c = relay.nn.conv2d(x, w, padding=(1, 1), out_dtype="float32") + return relay.Function([x, w], c) + + def expected(): + x = relay.var("x", shape=[8, 8, 8, 8]) + w = relay.var("w", shape=[8, 8, 3, 3]) + x_int = relay.cast(x, "int8") + w_int = relay.cast(w, "int8") + c = relay.nn.conv2d(x_int, w_int, padding=(1, 1), out_dtype="int32") + c_float = relay.cast(c, "float32") + return relay.Function([x, w], c_float) + + pre = before() + post = recast(pre, "int8", "int32") + expected = expected() + assert tvm.ir.structural_equal(expected, post) + + +def test_recast_medium(): + """Recast a slightly larger graph.""" + + def before(): + x = relay.var("x", shape=[8, 8, 8, 8]) + w = relay.var("w", shape=[8, 8, 3, 3]) + c = relay.nn.conv2d(x, w, padding=(1, 1), out_dtype="float32") + w2 = relay.var("w2", shape=[8, 8, 3, 3]) + c2 = relay.nn.conv2d(c, w2, padding=(1, 1), out_dtype="float32") + return relay.Function([x, w, w2], c2) + + def expected(): + x = relay.var("x", shape=[8, 8, 8, 8]) + w = relay.var("w", shape=[8, 8, 3, 3]) + x_int = relay.cast(x, "int8") + w_int = relay.cast(w, "int8") + c = relay.nn.conv2d(x_int, w_int, padding=(1, 1), out_dtype="int32") + c_float = relay.cast(c, "float32") + w2 = relay.var("w2", shape=[8, 8, 3, 3]) + w2_int = relay.cast(w2, "int8") + c_float_int = relay.cast(c_float, "int8") + c2 = relay.nn.conv2d(c_float_int, w2_int, padding=(1, 1), out_dtype="int32") + c2_float = relay.cast(c2, "float32") + return relay.Function([x, w, w2], c2_float) + + pre = before() + post = recast(pre, "int8", "int32") + expected = expected() + assert tvm.ir.structural_equal(expected, post) + + +def test_recast_skip(): + """Recast a graph using skip layers.""" + + def before(): + x = relay.var("x", shape=[8, 8, 8, 8]) + w = relay.var("w", shape=[8, 8, 3, 3]) + c = relay.nn.conv2d(x, w, padding=(1, 1), out_dtype="float32") + w2 = relay.var("w2", shape=[8, 8, 3, 3]) + c2 = relay.nn.conv2d(c, w2, padding=(1, 1), out_dtype="float32") + return relay.Function([x, w, w2], c2) + + def expected(): + x = relay.var("x", shape=[8, 8, 8, 8]) + w = relay.var("w", shape=[8, 8, 3, 3]) + c = relay.nn.conv2d(x, w, padding=(1, 1), out_dtype="float32") + w2 = relay.var("w2", shape=[8, 8, 3, 3]) + w2_int = relay.cast(w2, "int8") + c_int = relay.cast(c, "int8") + c2 = relay.nn.conv2d(c_int, w2_int, padding=(1, 1), out_dtype="int32") + c2_float = relay.cast(c2, "float32") + return relay.Function([x, w, w2], c2_float) + + pre = before() + post = recast(pre, "int8", "int32", skip_layers=[0]) + expected = expected() + assert tvm.ir.structural_equal(expected, post) + + +if __name__ == "__main__": + test_recast_simple() + test_recast_medium() + test_recast_skip() From 9d578ae219bf472b061d0430c8ee324d5e90b057 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 27 Oct 2020 09:49:08 -0400 Subject: [PATCH 065/258] [VERSION] Make script path invariant (#6766) --- 
version.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/version.py b/version.py index 2678c5ce42d4..461b81a5571b 100644 --- a/version.py +++ b/version.py @@ -44,6 +44,8 @@ __most_recent_tag__ = "v0.7.0" # --------------------------------------------------- +PROJ_ROOT = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) + def py_str(cstr): return cstr.decode("utf-8") @@ -76,7 +78,7 @@ def git_describe_version(): the git short hash tag of the current commit is 0d07a329e. """ cmd = ["git", "describe", "--tags"] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=PROJ_ROOT) (out, _) = proc.communicate() if proc.returncode != 0: @@ -145,11 +147,9 @@ def update(file_name, pattern, repl, dry_run=False): def sync_version(pub_ver, local_ver, dry_run): """Synchronize version.""" - proj_root = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - # python uses the PEP-440: local version update( - os.path.join(proj_root, "python", "tvm", "_ffi", "libinfo.py"), + os.path.join(PROJ_ROOT, "python", "tvm", "_ffi", "libinfo.py"), r"(?<=__version__ = \")[.0-9a-z\+]+", local_ver, dry_run, @@ -158,14 +158,14 @@ def sync_version(pub_ver, local_ver, dry_run): # Note that full git hash is already available in libtvm # C++ header update( - os.path.join(proj_root, "include", "tvm", "runtime", "c_runtime_api.h"), + os.path.join(PROJ_ROOT, "include", "tvm", "runtime", "c_runtime_api.h"), r'(?<=TVM_VERSION ")[.0-9a-z\+]+', pub_ver, dry_run, ) # conda update( - os.path.join(proj_root, "conda", "recipe", "meta.yaml"), + os.path.join(PROJ_ROOT, "conda", "recipe", "meta.yaml"), r"(?<=version = ')[.0-9a-z\+]+", pub_ver, dry_run, @@ -175,8 +175,8 @@ def sync_version(pub_ver, local_ver, dry_run): dev_pos = pub_ver.find(".dev") npm_ver = pub_ver if dev_pos == -1 else "%s.0-%s" % (pub_ver[:dev_pos], pub_ver[dev_pos + 1 :]) update( - os.path.join(proj_root, "web", "package.json"), - r'(?<="version": ")[.0-9a-z\+]+', + os.path.join(PROJ_ROOT, "web", "package.json"), + r'(?<="version": ")[.0-9a-z\-\+]+', npm_ver, dry_run, ) From 0b9907c0d62ab0ec9cf1743892a14ac447223923 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 27 Oct 2020 07:05:10 -0700 Subject: [PATCH 066/258] [AutoScheduler] Re-organize logs files for tutorials (#6768) * reorganize logs files * fix lint --- tests/scripts/task_python_docs.sh | 5 ++--- .../auto_scheduler/{ => ci_logs}/conv2d.json | 1 + .../auto_scheduler/{ => ci_logs}/matmul.json | 2 +- .../auto_scheduler/tune_conv2d_layer_cuda.py | 15 ++++++------- tutorials/auto_scheduler/tune_matmul_x86.py | 21 +++++++++---------- 5 files changed, 20 insertions(+), 24 deletions(-) rename tutorials/auto_scheduler/{ => ci_logs}/conv2d.json (93%) rename tutorials/auto_scheduler/{ => ci_logs}/matmul.json (83%) diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index cbaffa2b37e4..3d229651cb4f 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -39,9 +39,8 @@ rm -rf docs/gen_modules rm -rf docs/doxygen # prepare auto scheduler tutorials -rm -rf tutorials/auto_scheduler/*logs -mkdir tutorials/auto_scheduler/logs -cp -f tutorials/auto_scheduler/{matmul,conv2d}.json tutorials/auto_scheduler/logs +rm -rf tutorials/auto_scheduler/*.json +cp -f tutorials/auto_scheduler/ci_logs/{matmul,conv2d}.json tutorials/auto_scheduler # remove stale tutorials and always build from scratch. 
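Every update() call in the version patch above leans on the same regex
trick: a lookbehind keeps the key out of the match so only the version
literal is rewritten. A minimal self-contained illustration, with
hypothetical file contents:

import re

text = '__version__ = "0.7.0"'
# (?<=...) requires the fixed-width prefix to precede the match position
# without consuming it, so re.sub replaces just the version literal.
bumped = re.sub(r'(?<=__version__ = ")[.0-9a-z\+]+', "0.8.dev0", text)
assert bumped == '__version__ = "0.8.dev0"'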
rm -rf docs/tutorials diff --git a/tutorials/auto_scheduler/conv2d.json b/tutorials/auto_scheduler/ci_logs/conv2d.json similarity index 93% rename from tutorials/auto_scheduler/conv2d.json rename to tutorials/auto_scheduler/ci_logs/conv2d.json index 10f63d0d4c8a..c748920d14db 100644 --- a/tutorials/auto_scheduler/conv2d.json +++ b/tutorials/auto_scheduler/ci_logs/conv2d.json @@ -1 +1,2 @@ +# Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI. {"i": [["[\"conv2d_layer\", 1, 7, 7, 512, 512, 3, 3, [1, 1], [1, 1]]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32"], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 512, [1, 64, 2, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 7, [1, 1, 7, 1], 1], ["SP", 3, 20, 512, [4, 2], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 48, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 504, [4], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000429498], 0, 1.59126, 1603259147], "v": "v0.2"} diff --git a/tutorials/auto_scheduler/matmul.json b/tutorials/auto_scheduler/ci_logs/matmul.json similarity index 83% rename from tutorials/auto_scheduler/matmul.json rename to tutorials/auto_scheduler/ci_logs/matmul.json index 7f537641281a..827cfc9a6dbb 100644 --- a/tutorials/auto_scheduler/matmul.json +++ b/tutorials/auto_scheduler/ci_logs/matmul.json @@ -1,2 +1,2 @@ -# Keep a valid schedule for demonstraction +# Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI. {"i": [["[\"matmul_add\", 128, 128, 128, \"float32\"]", "llvm -keys=cpu"], [[], [["SP", 2, 0, 128, [4, 2, 4], 1], ["SP", 2, 4, 128, [1, 32, 2], 1], ["SP", 2, 8, 128, [2], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FSP", 4, 0, 0, 1], ["FSP", 4, 2, 1, 1], ["RE", 4, [0, 2, 1, 3]], ["CA", 2, 4, 1], ["FU", 4, [0, 1]], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$0"], ["AN", 2, 9, 2]]]], "r": [[5.80388e-05], 0, 0.299169, 1603402396], "v": "v0.2"} diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index 68fa5d597f66..10a2d1b44144 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -90,15 +90,12 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): # * see :any:`auto_scheduler.TuningOptions`, # :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. 
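For context on the ci_logs records being moved above: each log is
line-oriented JSON, one record per line, with "#" lines acting as comments.
A minimal reader sketch, assuming the file layout shown in this diff (the
tutorials themselves go through auto_scheduler.load_best instead):

import json

records = []
with open("conv2d.json") as f:
    for line in f:
        # Skip blanks and comment lines like the one this patch adds.
        if not line.strip() or line.startswith("#"):
            continue
        records.append(json.loads(line))

# "i" holds the serialized task and state; "r" is the measurement:
# [costs, error_no, all_cost, timestamp].
costs, error_no, all_cost, timestamp = records[0]["r"]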
-if not os.path.exists("./logs"): - os.mkdir("./logs") - -logfile = os.path.join("./logs", "conv2d.json") +log_file = "conv2d.json" measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) tune_option = auto_scheduler.TuningOptions( num_measure_trials=10, runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(logfile)], + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) ###################################################################### @@ -163,7 +160,7 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): # print the equivalent python schedule API, and build the binary again. # Load the measuremnt record for the best schedule -inp, res = auto_scheduler.load_best(logfile, task.workload_key) +inp, res = auto_scheduler.load_best(log_file, task.workload_key) # Print equivalent python schedule API. This can be used for debugging and # learning the behavior of the auto-scheduler. @@ -183,15 +180,15 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): cost_model = auto_scheduler.XGBModel() -cost_model.update_from_file(logfile) +cost_model.update_from_file(log_file) search_policy = auto_scheduler.SketchPolicy( - task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(logfile)] + task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] ) measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) tune_option = auto_scheduler.TuningOptions( num_measure_trials=5, runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(logfile)], + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option) diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index a2331fcc9835..81f2e71ff8f7 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -82,12 +82,9 @@ def matmul_add(N, L, M, dtype): # and do more analyses later. # * see :any:`auto_scheduler.TuningOptions` for more parameters -if not os.path.exists("./logs"): - os.mkdir("./logs") - -logfile = os.path.join("./logs", "matmul.json") +log_file = "matmul.json" tune_option = auto_scheduler.TuningOptions( - num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile(logfile)] + num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile(log_file)] ) ###################################################################### @@ -147,7 +144,7 @@ def matmul_add(N, L, M, dtype): # print the equivalent python schedule API, and build the binary again. # Load the measuremnt record for the best schedule -inp, res = auto_scheduler.load_best(logfile, task.workload_key) +inp, res = auto_scheduler.load_best(log_file, task.workload_key) # Print equivalent python schedule API. This can be used for debugging and # learning the behavior of the auto-scheduler. @@ -166,19 +163,21 @@ def matmul_add(N, L, M, dtype): # In the example below we resume the status and do more 5 trials. 
-def resume_search(task, logfile_name): +def resume_search(task, log_file_name): cost_model = auto_scheduler.XGBModel() - cost_model.update_from_file(logfile_name) + cost_model.update_from_file(log_file_name) search_policy = auto_scheduler.SketchPolicy( - task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(logfile_name)] + task, + cost_model, + init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file_name)], ) tune_option = auto_scheduler.TuningOptions( - num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(logfile_name)] + num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file_name)] ) sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option) -# resume_search(task, logfile) +# resume_search(task, log_file) ###################################################################### # .. note:: From da52ebb18de27c350374eb2df63f3d857cce9c89 Mon Sep 17 00:00:00 2001 From: shibuiwilliam Date: Wed, 28 Oct 2020 01:33:32 +0900 Subject: [PATCH 067/258] [CI] Update PyXIR version to 0.1.3 (#6769) Co-authored-by: Shibui Yusuke --- docker/install/ubuntu_install_vitis_ai_packages_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh index d4077bc67b44..c34ed3addce2 100644 --- a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh +++ b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh @@ -25,5 +25,5 @@ mkdir "$PYXIR_HOME" pip3 install progressbar -git clone --recursive --branch v0.1.2 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}" +git clone --recursive --branch v0.1.3 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}" cd "${PYXIR_HOME}" && python3 setup.py install From a6a1ff04a37ee213ae801a7bf12dddd5189feb27 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 27 Oct 2020 14:47:54 -0400 Subject: [PATCH 068/258] [CI] Update ci-wasm to latest (#6772) --- Jenkinsfile | 2 +- tests/scripts/task_web_wasm.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9d475045bb93..e2abb018d6e2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -47,7 +47,7 @@ ci_lint = "tlcpack/ci-lint:v0.62" ci_gpu = "tlcpack/ci-gpu:v0.70" ci_cpu = "tlcpack/ci-cpu:v0.70" -ci_wasm = "tlcpack/ci-wasm:v0.60" +ci_wasm = "tlcpack/ci-wasm:v0.70" ci_i386 = "tlcpack/ci-i386:v0.70" ci_qemu = "tlcpack/ci-qemu:v0.01" // <--- End of regex-scanned config. 
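The "End of regex-scanned config" marker above implies that external
tooling reads these image pins straight out of the Jenkinsfile. A
hypothetical scanner for that block might look like the following (this is
not the actual CI tooling, just an illustration of the convention):

import re

with open("Jenkinsfile") as f:
    contents = f.read()

# Matches pins of the form: ci_wasm = "tlcpack/ci-wasm:v0.70"
pins = dict(re.findall(r'^(ci_\w+) = "([^"]+)"', contents, re.MULTILINE))
print(pins["ci_wasm"])  # tlcpack/ci-wasm:v0.70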
diff --git a/tests/scripts/task_web_wasm.sh b/tests/scripts/task_web_wasm.sh index 717d3284fce1..c117bb08c6f4 100755 --- a/tests/scripts/task_web_wasm.sh +++ b/tests/scripts/task_web_wasm.sh @@ -21,6 +21,7 @@ set -u export PYTHONPATH=`pwd`/python +rm -rf .emscripten_cache cd web make clean npm install From 51475346a375d1c8c196c081d8ab8f7b4af5f64a Mon Sep 17 00:00:00 2001 From: masahi Date: Wed, 28 Oct 2020 11:02:22 +0900 Subject: [PATCH 069/258] [Relay, TOPI] Complete rewrite of where op to support broadcasting (#6759) * where type rel with broadcast * add tests for where with broadcast * clean up tests * uncomment other tests * add more tests * update doc * CHECK -> ICHECK * add where any test * fix format * remove useless detections for one * set manual seed * ported shape broadcast helper func to hybridscript * remove shape function helper from cpp Co-authored-by: masa --- include/tvm/topi/transform.h | 67 ++++++------------ python/tvm/relay/op/_transform.py | 34 ++++++++- python/tvm/relay/op/transform.py | 17 ++--- src/relay/op/tensor/transform.cc | 47 ++++--------- src/relay/op/type_relations.cc | 2 +- src/relay/op/type_relations.h | 9 +++ tests/python/frontend/pytorch/test_lstm.py | 2 + tests/python/relay/test_any.py | 44 ++++++++++++ tests/python/relay/test_op_level4.py | 81 ++++++++++++++++------ 9 files changed, 192 insertions(+), 111 deletions(-) diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index fa27faf18f15..b670755d97b7 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -39,6 +39,8 @@ #include #include +#include "detail/broadcast.h" + namespace tvm { namespace topi { @@ -887,53 +889,30 @@ inline Tensor take(const Tensor& a, const Tensor& indices, int axis, std::string */ inline Tensor where(const Tensor& condition, const Tensor& x, const Tensor& y, std::string name = "T_where", std::string tag = kBroadcast) { - ICHECK_EQ(x->shape.size(), y->shape.size()) - << "x and y must have the same shape.Got different number of dimension: " << x->shape.size() - << " vs " << y->shape.size(); ICHECK_EQ(x->dtype, y->dtype) << "x and y must have the same dtype: " << x->dtype << " vs " << y->dtype; + auto get_out_shape = [&]() { + auto bh1 = detail::BroadcastShape(x->shape, y->shape); + Array common_shape1(bh1.common_shape.begin(), bh1.common_shape.end()); + auto bh2 = detail::BroadcastShape(condition->shape, common_shape1); + Array common_shape2(bh2.common_shape.begin(), bh2.common_shape.end()); + return common_shape2; + }; - if (x->shape.size() == 0) { - return compute( - condition->shape, - [&](const Array& indices) { - PrimExpr cond; - if (condition->shape.size() == 0) { - cond = condition(); - } else { - Array condition_idx{indices[0]}; - cond = condition(condition_idx); - } - return tvm::tir::Select(cond != 0, x(), y()); - }, - name, tag); - } else if (condition->shape.size() != 1) { - ICHECK_EQ(condition->shape.size(), x->shape.size()) - << "condition array must be either have the same shape as x or to be a " - "1-D array.Got different number of dimension: " - << condition->shape.size() << " vs " << x->shape.size(); - return compute( - x->shape, - [&](const Array& indices) { - return tvm::tir::Select(condition(indices) != 0, x(indices), y(indices)); - }, - name, tag); - } else { - int64_t cond_first_dim = topi::GetConstInt(condition->shape[0]); - int64_t x_first_dim = topi::GetConstInt(x->shape[0]); - if (cond_first_dim > 0 && x_first_dim > 0) { - ICHECK_EQ(cond_first_dim, x_first_dim) - << "If condition is 1-D, the first 
dimension must be the same as x: " << cond_first_dim - << " vs " << x_first_dim; - } - return compute( - x->shape, - [&](const Array& indices) { - Array condition_idx{indices[0]}; - return tvm::tir::Select(condition(condition_idx) != 0, x(indices), y(indices)); - }, - name, tag); - } + auto oshape = get_out_shape(); + + auto c_bh = detail::BroadcastShape(condition->shape, oshape); + auto x_bh = detail::BroadcastShape(x->shape, oshape); + auto y_bh = detail::BroadcastShape(y->shape, oshape); + + auto select = [&](tvm::Array ovars) { + auto c = condition(InputIndexFromBroadcast(ovars, condition, c_bh.vars1, c_bh.all_vars)); + auto true_val = x(InputIndexFromBroadcast(ovars, x, x_bh.vars1, x_bh.all_vars)); + auto false_val = y(InputIndexFromBroadcast(ovars, y, y_bh.vars1, y_bh.all_vars)); + return tvm::tir::Select(c != 0, true_val, false_val); + }; + + return compute(oshape, select, name, tag); } /*! diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index b135901baac3..3bb488c80b58 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -821,6 +821,33 @@ def stack_shape_func(attrs, inputs, _): return [_stack_shape_func(inputs[0], convert(axis), convert(len(inputs)))] +@script +def _broadcast_shape_tensors(shape_tensor1, shape_tensor2): + rank1 = shape_tensor1.shape[0] + rank2 = shape_tensor2.shape[0] + out_rank = max(rank1, rank2) + bcast_shape_tensor = output_tensor((out_rank,), "int64") + + for index in const_range(out_rank): + dim1 = int64(1) + dim2 = int64(1) + + if rank1 == out_rank: + dim1 = shape_tensor1[index] + elif rank1 - (out_rank - index) >= 0: + dim1 = shape_tensor1[rank1 - (out_rank - index)] + + if rank2 == out_rank: + dim2 = shape_tensor2[index] + elif rank2 - (out_rank - index) >= 0: + dim2 = shape_tensor2[rank2 - (out_rank - index)] + + assert dim1 == dim2 or dim1 == 1 or dim2 == 1, "Invalid broadcast shapes" + bcast_shape_tensor[index] = max(dim1, dim2) + + return bcast_shape_tensor + + @_reg.register_shape_func("where", False) def where_shape_func(attrs, inputs, _): """ @@ -828,6 +855,9 @@ def where_shape_func(attrs, inputs, _): """ cond_shape = inputs[0] x_shape = inputs[1] - out_shape = x_shape if x_shape.shape else cond_shape + y_shape = inputs[2] + + bcast_shape = _broadcast_shape_tensors(x_shape, y_shape) + out_shape = _broadcast_shape_tensors(bcast_shape, cond_shape) - return [topi.math.identity(out_shape)] + return [out_shape] diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 01af60ebbd4b..17f4c02380b3 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -688,25 +688,26 @@ def where(condition, x, y): condition. .. note:: - The shape of condition, x, and y needs to be the same. + Shapes of condition, x, and y must be broadcastable to a common shape. + Semantics follow numpy where function + https://numpy.org/doc/stable/reference/generated/numpy.where.html Parameters ---------- condition : relay.Expr - The condition array. The n-th element in `y` is selected when the n-th - value in the `condition` array is zero. Otherwise, the corresponding - element from `x` will be picked. + Where True, yield x, otherwise yield y x : relay.Expr - The first array to be selected. + The first array or scalar to be selected. y : relay.Expr - The second array to be selected. + The second array or scalar to be selected. Returns ------- result : relay.Expr - The selected array. + The selected array. 
The output shape is the broadcasted shape from + condition, x, and y. Examples -------- @@ -717,7 +718,7 @@ def where(condition, x, y): condition = [[0, 1], [-1, 0]] relay.where(conditon, x, y) = [[5, 2], [3, 8]] - condition = [1, 0] + condition = [[1], [0]] relay.where(conditon, x, y) = [[1, 2], [7, 8]] """ return _make.where(condition, x, y) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 4a832ec8d962..02fd8930d332 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -45,6 +45,7 @@ #include "../../transforms/pattern_utils.h" #include "../make_op.h" #include "../op_common.h" +#include "../type_relations.h" namespace tvm { namespace relay { @@ -1737,30 +1738,17 @@ bool WhereRel(const Array& types, int num_inputs, const Attrs& attrs, return false; } - const auto& cond_shape = condition->shape; - const auto& x_shape = x->shape; - const auto& y_shape = y->shape; - ICHECK(x_shape.size() == y_shape.size()) << "x and y must have the same size"; + ICHECK_EQ(x->dtype, y->dtype) << "x and y must have the same dtype: " << x->dtype << " vs " + << y->dtype; - if (cond_shape.size() != x_shape.size()) { - ICHECK_EQ(cond_shape.size(), 1) << "Shape of condition " << condition->shape - << " must be either equal to x or has dimension of 1."; - } - for (size_t i = 0; i < x_shape.size(); i++) { - ICHECK(reporter->AssertEQ(x_shape[i], y_shape[i])) - << "x and y must have the same shape: " << x_shape << " vs " << y_shape; + auto tensor_ty_condition = GetRef(condition); + auto tensor_ty_x = GetRef(x); + auto tensor_ty_y = GetRef(y); - if (i < cond_shape.size()) { - ICHECK(reporter->AssertEQ(cond_shape[i], x_shape[i])) - << "condition and x must have the same shape: " << cond_shape << " vs " << x_shape; - } - } - if (x_shape.size() == 0) { - // if x and y are scalar, the condition shape becomes the output shape - reporter->Assign(types[3], TensorType(cond_shape, x->dtype)); - } else { - reporter->Assign(types[3], TensorType(x_shape, x->dtype)); - } + auto b_ty = ConcreteBroadcast(tensor_ty_x, tensor_ty_y, x->dtype); + auto ret_ty = ConcreteBroadcast(tensor_ty_condition, b_ty, b_ty->dtype); + + reporter->Assign(types[3], ret_ty); return true; } @@ -1783,17 +1771,10 @@ Return the elements, either from x or y, depending on the condition. Given three ndarrays, condition, x, and y, return an ndarray with the elements from x or y, depending on the elements from condition are true or false. -x and y must have the same shape. If condition has the same shape as x, -each element in the output array is from x if the corresponding element -in the condition is true, and from y if false. - -If condition does not have the same shape as x, it must be a 1D array whose -size is the same as x’s first dimension size. Each row of the output array -is from x’s row if the corresponding element from condition is true, and -from y’s row if false. -When x and y are scalars, condition must be an 1D array. The output shape -is the same as condition's shape. +Shapes of condition, x, and y must be broadcastable to a common shape, which +is the output shape of this op. Semantics follow numpy where function. +https://numpy.org/doc/stable/reference/generated/numpy.where.html Note that all non-zero values are interpreted as True in condition. 
@@ -1805,7 +1786,7 @@ Examples:: where(cond, x, y) = [[5, 2], [3, 8]] - cond = [1, 0] + cond = [[1], [0]] where(cond, x, y) = [[1, 2], [7, 8]] cond = [0, 1] diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc index 3dc33c5022e0..7a3bfcb21ce6 100644 --- a/src/relay/op/type_relations.cc +++ b/src/relay/op/type_relations.cc @@ -64,7 +64,7 @@ bool EqualConstInt(const IndexExpr& lhs, int64_t value) { return false; } -Type ConcreteBroadcast(const TensorType& t1, const TensorType& t2, DataType output_dtype) { +TensorType ConcreteBroadcast(const TensorType& t1, const TensorType& t2, DataType output_dtype) { std::vector oshape; size_t ndim1 = t1->shape.size(); size_t ndim2 = t2->shape.size(); diff --git a/src/relay/op/type_relations.h b/src/relay/op/type_relations.h index 5ab8b121ae9d..6d6d5f70c0c2 100644 --- a/src/relay/op/type_relations.h +++ b/src/relay/op/type_relations.h @@ -57,6 +57,15 @@ bool IdentityRel(const Array& types, int num_inputs, const Attrs& attrs, bool BroadcastRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter); +/*! + * \brief Determine the broadcasted shape from two input shapes + * \param t1 One of two Tensortype whose shapes are broadcasted + * \param t2 One of two Tensortype whose shapes are broadcasted + * \param output_dtype dtype of the output TensorType + * \return A TensorType whose shape is broadcasted from two input TensorType. + */ +TensorType ConcreteBroadcast(const TensorType& t1, const TensorType& t2, DataType output_dtype); + /*! * \brief The broadcast type relation, implements the broadcasting * rule over the two input types producing the broadcasted type. diff --git a/tests/python/frontend/pytorch/test_lstm.py b/tests/python/frontend/pytorch/test_lstm.py index 39d78c70c0fb..1197990f54ba 100644 --- a/tests/python/frontend/pytorch/test_lstm.py +++ b/tests/python/frontend/pytorch/test_lstm.py @@ -277,6 +277,8 @@ def test_custom_lstm(): num_layers = 3 state_tensor_shape = (batch, hidden_size) + torch.manual_seed(1) + inp = torch.randn(seq_len, batch, input_size) input_shapes = [ diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 872728514c3e..b1b068ebb32a 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -1236,5 +1236,49 @@ def test_any_stack(): verify_any_stack(any_dims(4), (2, 1, 1, 4), 2, 2) +def verify_any_where( + cond_shape, x_shape, y_shape, cond_np_shape, x_np_shape, y_np_shape, y_np_shape_invalid=None +): + dtype = "float32" + cond = relay.var("cond", shape=cond_shape, dtype="bool") + x = relay.var("x", shape=x_shape, dtype=dtype) + y = relay.var("y", shape=y_shape, dtype=dtype) + z = relay.where(cond, x, y) + mod = tvm.IRModule() + mod["main"] = relay.Function([cond, x, y], z) + + cond_np = np.random.randn(*cond_np_shape) > 0 + x_np = np.random.randn(*x_np_shape).astype(dtype) + y_np = np.random.randn(*y_np_shape).astype(dtype) + expected = np.where(cond_np, x_np, y_np) + + check_result([cond_np, x_np, y_np], mod, expected) + + # verify invalid broadcasting check + if y_np_shape_invalid: + y_np_bad = np.random.randn(*y_np_shape_invalid).astype(dtype) + try: + check_result([cond_np, x_np, y_np_bad], mod, expected) + except tvm.error.TVMError as e: + error_msg = str(e).split("\n")[-1] + assert "Invalid broadcast shapes" in error_msg + + +@tvm.testing.uses_gpu +def test_any_where(): + verify_any_where(any_dims(1), (5,), (5,), (5,), (5,), (5,)) + verify_any_where(any_dims(1), any_dims(1), (5,), (5,), (5,), (5,)) + 
verify_any_where(any_dims(1), any_dims(1), any_dims(1), (5,), (5,), (5,)) + verify_any_where((5,), any_dims(1), any_dims(1), (5,), (5,), (5,)) + + # where with broadcast + verify_any_where(any_dims(1), any_dims(1), any_dims(1), (5,), (1,), (5,)) + verify_any_where(any_dims(1), any_dims(2), any_dims(2), (5,), (5, 5), (5, 5)) + verify_any_where(any_dims(1), any_dims(1), any_dims(2), (5,), (5,), (5, 5)) + verify_any_where( + any_dims(2), any_dims(2), any_dims(2), (3, 4), (3, 1), (1, 4), y_np_shape_invalid=(2, 4) + ) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index eafc743634d8..ef363430a2eb 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -152,35 +152,70 @@ def run(func, inputs, ref_res): op_res = intrp.evaluate(func)(*inputs) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) - shape = (3, 4) - dtype = "float32" - cond = relay.var("cond", relay.TensorType(shape, dtype)) - x = relay.var("x", relay.TensorType(shape, dtype)) - y = relay.var("y", relay.TensorType(shape, dtype)) - z = relay.where(cond, x, y) - zz = run_infer_type(z) - assert zz.checked_type == relay.TensorType(shape, dtype) + def verify(x_np, y_np, cond_np): + ref_res = np.where(cond_np, x_np, y_np) + + args = [] + args_np = [] + vs = [] + + cond = relay.var("cond", relay.TensorType(cond_np.shape, "bool")) - func = relay.Function([cond, x, y], z) - condition = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) - x = np.random.uniform(size=shape).astype(dtype) - y = np.random.uniform(size=shape).astype(dtype) - ref_res = np.where(condition, x, y) + args.append(cond) + args_np.append(cond_np) - run(func, [condition, x, y], ref_res) + for v_name, v_np in [("x", x_np), ("y", y_np)]: + if len(v_np.shape) == 0: + v = relay.const(v_np.item()) + else: + v = relay.var(v_name, relay.TensorType(v_np.shape, dtype)) + args.append(v) + args_np.append(v_np) + vs.append(v) + + z = relay.where(cond, vs[0], vs[1]) + + func = relay.Function(args, z) + + run(func, args_np, ref_res) - x = relay.const(1) - y = relay.const(-1) - shape = (3,) dtype = "float32" - cond = relay.var("cond", relay.TensorType(shape, "bool")) - z = relay.where(cond, x, y) - func = relay.Function([cond], z) - condition = np.array([1, 0, 1], dtype=np.bool) - ref_res = np.where(condition, 1, -1) + x_np = np.random.uniform(size=(3, 4)).astype(dtype) + y_np = np.random.uniform(size=(3, 4)).astype(dtype) + cond_np = np.random.uniform(low=-1, high=1, size=(3, 4)) > 0 + + verify(x_np, y_np, cond_np) + + x_np = np.array(1.0, dtype) + y_np = np.array(-1.0, dtype) + cond_np = np.array([1, 0, 1], dtype=np.bool) + + verify(x_np, y_np, cond_np) + + x_np = np.arange(10).astype(dtype) + y_np = 10 * x_np + cond_np = x_np < 5 + + verify(x_np, y_np, cond_np) + + x_np = np.array([[1, 2], [3, 4]], dtype) + y_np = np.array([[5, 6], [7, 8]], dtype) + cond_np = np.array([[1], [0]], dtype=np.bool) + + verify(x_np, y_np, cond_np) + verify(x_np, y_np, cond_np.T) + + x_np = np.random.randn(1, 12, 8, 8).astype(dtype) + y_np = np.array(-1.0, dtype) + cond_np = np.random.randn(1, 1, 8, 8) > 0 + + verify(x_np, y_np, cond_np) + + x_np, y_np = np.ogrid[:3, :4] + cond_np = np.where(x_np < y_np, x_np, 10 + y_np).astype(np.bool) - run(func, [condition], ref_res) + verify(x_np.astype(dtype), y_np.astype(dtype), cond_np) def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32"): From fe601339a8b096c6d6fd17cd69406fe2325ebc8a 
Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Wed, 28 Oct 2020 10:58:52 +0800 Subject: [PATCH 070/258] [Relay] Fix dynamic case for Squeeze and Split (#6739) * [Relay] Fix dynamic case for Squeeze and Split Squeeze: Allow removed dimension to be dynamic and check it in shape function Split: Fix negative axis * Fix comments --- include/tvm/topi/transform.h | 4 +++- python/tvm/relay/op/_transform.py | 13 +++++++++++-- tests/python/relay/test_any.py | 2 ++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index b670755d97b7..e3cc2f61f57b 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -334,7 +334,9 @@ inline Tensor squeeze(const Tensor& x, Array axis, bool atleast1d = fal if (val < 0) { val += static_cast(x->shape.size()); } - ICHECK_EQ(GetConstInt(x->shape[val]), 1) << "Dimension " << val << " must have size 1"; + if (IsConstInt(x->shape[val])) { + ICHECK_EQ(GetConstInt(x->shape[val]), 1) << "Dimension " << val << " must have size 1"; + } axis_val.push_back(val); } } diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 3bb488c80b58..56a3f2640e5d 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -586,11 +586,14 @@ def transpose_shape_func(attrs, inputs, _): @script -def _squeeze_shape_func(data_shape, keep_axes): +def _squeeze_shape_func(data_shape, keep_axes, remove_axes): out = output_tensor((len(keep_axes),), "int64") for i in const_range(len(keep_axes)): out[i] = data_shape[keep_axes[i]] + for i in const_range(len(remove_axes)): + assert data_shape[remove_axes[i]] == 1, "Removed dimension must have size 1" + return out @@ -601,10 +604,13 @@ def squeeze_shape_func(attrs, inputs, _): """ axis = attrs.axis if attrs.axis is None else get_const_tuple(attrs.axis) keep_axes = [] + remove_axes = [] if axis is not None: for i in range(inputs[0].shape[0].value): if i not in axis: keep_axes.append(i) + else: + remove_axes.append(i) # Due to current relay type system, it is possible even # a static kernel function needs shape function. To handle @@ -612,7 +618,7 @@ def squeeze_shape_func(attrs, inputs, _): # for now. # TODO(kevinthesun): Enhance relay type system to avoid this. 
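    # With remove_axes threaded through below, a squeezed dimension may be
    # dynamic (relay.Any()): _squeeze_shape_func asserts that it equals 1 at
    # runtime instead of requiring a static size, e.g. squeezing axis 0 of a
    # (Any, Any, Any) tensor accepts a (1, 9, 8) input.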
if keep_axes: - out = _squeeze_shape_func(inputs[0], convert(keep_axes)) + out = _squeeze_shape_func(inputs[0], convert(keep_axes), convert(remove_axes)) else: out = te.compute((), lambda *indices: 0) return [out] @@ -716,6 +722,9 @@ def split_shape_func(attrs, inputs, _): axis = get_const_int(attrs.axis) + if axis < 0: + axis += get_const_int(inputs[0].shape[0]) + num_out = ( indices_or_sections if isinstance(indices_or_sections, int) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index b1b068ebb32a..d39e04ecd7c1 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -405,6 +405,7 @@ def verify_any_squeeze(data_shape, axis, static_data_shape): @tvm.testing.uses_gpu def test_any_squeeze(): + verify_any_squeeze((relay.Any(), relay.Any(), relay.Any()), (0,), (1, 9, 8)) verify_any_squeeze((1, relay.Any(), relay.Any()), (0,), (1, 9, 8)) verify_any_squeeze( (1, relay.Any(), relay.Any(), 1, relay.Any(), relay.Any()), (0, 3), (1, 12, 2, 1, 9, 17) @@ -684,6 +685,7 @@ def verify_any_split(data_shape, indices_or_sections, axis, static_data_shape, r @tvm.testing.uses_gpu def test_any_split(): + verify_any_split((relay.Any(), 4), 2, -1, (9, 4), [(9, 2), (9, 2)]) verify_any_split((relay.Any(), 4), 2, 1, (9, 4), [(9, 2), (9, 2)]) verify_any_split((relay.Any(), relay.Any()), 2, 1, (9, 4), [(9, 2), (9, 2)]) verify_any_split((relay.Any(), 12), (1, 4, 8), 1, (7, 12), [(7, 1), (7, 3), (7, 4)]) From d75a7ca828b1944e94291a22fc35c9e9ccf11f6b Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Tue, 27 Oct 2020 22:40:02 -0600 Subject: [PATCH 071/258] Scatter on Cuda (#6533) * working cuda scatter fix lint fix pylint again * cuda scatter with threading * add dynamic shape tests * remove unused variable --- python/tvm/relay/op/_transform.py | 2 +- python/tvm/relay/op/strategy/cuda.py | 13 + python/tvm/relay/op/strategy/generic.py | 23 +- python/tvm/topi/cuda/__init__.py | 1 + python/tvm/topi/cuda/scatter.py | 443 ++++++++++++++++++++++++ tests/python/relay/test_op_level3.py | 41 ++- 6 files changed, 515 insertions(+), 8 deletions(-) create mode 100644 python/tvm/topi/cuda/scatter.py diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 56a3f2640e5d..4ee6f2ebb5c1 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -104,7 +104,7 @@ def compute_scatter(attrs, inputs, output_type): return [topi.scatter(inputs[0], inputs[1], inputs[2], attrs.axis)] -_reg.register_schedule("scatter", strategy.schedule_scatter) +_reg.register_strategy("scatter", strategy.scatter_strategy) # scatter_add @_reg.register_compute("scatter_add") diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index ca44e49ce1dd..d77361d906fb 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -651,6 +651,19 @@ def sparse_dense_padded_strategy_cuda(attrs, inputs, out_type, target): return strategy +@scatter_strategy.register(["cuda", "gpu"]) +def scatter_cuda(attrs, inputs, out_type, target): + """sparse dense cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scatter(topi.cuda.scatter), + wrap_topi_schedule(topi.generic.schedule_extern), + name="scatter.cuda", + plevel=10, + ) + return strategy + + @argsort_strategy.register(["cuda", "gpu"]) def argsort_strategy_cuda(attrs, inputs, out_type, target): """argsort cuda strategy""" diff --git a/python/tvm/relay/op/strategy/generic.py 
b/python/tvm/relay/op/strategy/generic.py index f6030d471594..34d1999707e9 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1032,11 +1032,24 @@ def schedule_argwhere(attrs, outs, target): # scatter -@generic_func -def schedule_scatter(attrs, outs, target): - """schedule scatter""" - with target: - return topi.generic.schedule_scatter(outs) +@override_native_generic_func("scatter_strategy") +def scatter_strategy(attrs, outs, out_type, target): + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scatter(topi.scatter), + wrap_topi_schedule(topi.generic.schedule_scatter), + name="scatter.generic", + ) + return strategy + + +def wrap_compute_scatter(topi_compute): + """Wrap scatter topi compute""" + + def _compute_scatter(attrs, inputs, _): + return [topi_compute(inputs[0], inputs[1], inputs[2], axis=attrs.axis)] + + return _compute_scatter # scatter_add diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py index ed8037024635..3ff544f4bb3e 100644 --- a/python/tvm/topi/cuda/__init__.py +++ b/python/tvm/topi/cuda/__init__.py @@ -46,6 +46,7 @@ from .ssd import * from .nms import get_valid_counts, non_max_suppression from .rcnn import * +from .scatter import * from .sort import * from .conv2d_nhwc_tensorcore import * from .conv3d_ndhwc_tensorcore import * diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py new file mode 100644 index 000000000000..6522d74d8bef --- /dev/null +++ b/python/tvm/topi/cuda/scatter.py @@ -0,0 +1,443 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument +"""Scatter operator """ +import tvm +from tvm import te + + +def ceil_div(a, b): + return (a + b - 1) // b + + +def gen_ir_1d(data, indices, updates, axis, out): + """Generate scatter ir for 1d inputs + + Parameters + ---------- + data : tir.Tensor + The input data to the operator. + + indices : tir.Tensor + The index locations to update. + + updates : tir.Tensor + The values to update. + + axis : int + The axis to scatter on + + out : tir.Tensor + The output tensor. + + Returns + ------- + ret : tir + The computational ir. 
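+
+    Note
+    ----
+    The copy of `data` into `out` is parallelized one element per CUDA
+    block, while the scatter itself runs in a single block and walks the
+    indices serially (wrapping negative indices by `n`), so duplicate
+    indices deterministically take the last update.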
+ """ + assert axis == 0 + n = data.shape[0] + + ib = tvm.tir.ir_builder.create() + + out_ptr = ib.buffer_ptr(out) + data_ptr = ib.buffer_ptr(data) + + with ib.new_scope(): + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", n) + out_ptr[bx] = data_ptr[bx] + + indices_ptr = ib.buffer_ptr(indices) + updates_ptr = ib.buffer_ptr(updates) + + ni = indices.shape[0] + + with ib.new_scope(): + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", 1) + with ib.for_range(0, ni, name="i") as i: + index = indices_ptr[i] + with ib.if_scope(index < 0): + out_ptr[index + n] = updates_ptr[i] + with ib.else_scope(): + out_ptr[index] = updates_ptr[i] + + return ib.get() + + +def gen_ir_2d(data, indices, updates, axis, out): + """Generate scatter ir for 2d inputs + + Parameters + ---------- + data : tir.Tensor + The input data to the operator. + + indices : tir.Tensor + The index locations to update. + + updates : tir.Tensor + The values to update. + + axis : int + The axis to scatter on + + out : tir.Tensor + The output tensor. + + Returns + ------- + ret : tir + The computational ir. + """ + warp_size = tvm.target.Target.current(False).thread_warp_size + + n = data.shape[0] + c = data.shape[1] + + ib = tvm.tir.ir_builder.create() + + out_ptr = ib.buffer_ptr(out) + data_ptr = ib.buffer_ptr(data) + + with ib.new_scope(): + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", n) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", warp_size) + with ib.for_range(0, ceil_div(c, warp_size), name="j") as j_: + j = j_ * warp_size + tx + with ib.if_scope(j < c): + idx = bx * c + j + out_ptr[idx] = data_ptr[idx] + + indices_ptr = ib.buffer_ptr(indices) + updates_ptr = ib.buffer_ptr(updates) + + ni = indices.shape[0] + ci = indices.shape[1] + + if axis == 0: + with ib.new_scope(): + j = te.thread_axis("blockIdx.x") + ib.scope_attr(j, "thread_extent", ci) + with ib.for_range(0, ni, name="i") as i: + idx = i * ci + j + index = indices_ptr[idx] + with ib.if_scope(index < 0): + out_ptr[(index + n) * c + j] = updates_ptr[idx] + with ib.else_scope(): + out_ptr[index * c + j] = updates_ptr[idx] + else: + with ib.new_scope(): + i = te.thread_axis("blockIdx.x") + ib.scope_attr(i, "thread_extent", ni) + with ib.for_range(0, ci, name="j") as j: + idx = i * ci + j + index = indices_ptr[idx] + with ib.if_scope(index < 0): + out_ptr[i * c + (index + c)] = updates_ptr[idx] + with ib.else_scope(): + out_ptr[i * c + index] = updates_ptr[idx] + return ib.get() + + +def gen_ir_3d(data, indices, updates, axis, out): + """Generate scatter ir for 3d inputs + + Parameters + ---------- + data : tir.Tensor + The input data to the operator. + + indices : tir.Tensor + The index locations to update. + + updates : tir.Tensor + The values to update. + + axis : int + The axis to scatter on + + out : tir.Tensor + The output tensor. + + Returns + ------- + ret : tir + The computational ir. 
+ """ + warp_size = tvm.target.Target.current(False).thread_warp_size + + n = data.shape[0] + c = data.shape[1] + h = data.shape[2] + + ib = tvm.tir.ir_builder.create() + + out_ptr = ib.buffer_ptr(out) + data_ptr = ib.buffer_ptr(data) + + with ib.new_scope(): + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", n) + by = te.thread_axis("blockIdx.y") + ib.scope_attr(by, "thread_extent", c) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", warp_size) + with ib.for_range(0, ceil_div(h, warp_size), name="k") as k_: + k = k_ * warp_size + tx + with ib.if_scope(k < h): + idx = (bx * c + by) * h + k + out_ptr[idx] = data_ptr[idx] + + indices_ptr = ib.buffer_ptr(indices) + updates_ptr = ib.buffer_ptr(updates) + ni = indices.shape[0] + ci = indices.shape[1] + hi = indices.shape[2] + + if axis == 0: + with ib.new_scope(): + j = te.thread_axis("blockIdx.x") + ib.scope_attr(j, "thread_extent", ci) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", warp_size) + with ib.for_range(0, ni, name="i") as i: + with ib.for_range(0, ceil_div(hi, warp_size), name="k") as k_: + k = k_ * warp_size + tx + with ib.if_scope(k < hi): + idx = (i * ci + j) * hi + k + index = indices_ptr[idx] + with ib.if_scope(index < 0): + out_ptr[((index + n) * c + j) * h + k] = updates_ptr[idx] + with ib.else_scope(): + out_ptr[(index * c + j) * h + k] = updates_ptr[idx] + elif axis == 1: + with ib.new_scope(): + i = te.thread_axis("blockIdx.x") + ib.scope_attr(i, "thread_extent", ni) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", warp_size) + with ib.for_range(0, ci, name="j") as j: + with ib.for_range(0, ceil_div(hi, warp_size), name="k") as k_: + k = k_ * warp_size + tx + with ib.if_scope(k < hi): + idx = (i * ci + j) * hi + k + index = indices_ptr[idx] + with ib.if_scope(index < 0): + out_ptr[(i * c + (index + c)) * h + k] = updates_ptr[idx] + with ib.else_scope(): + out_ptr[(i * c + index) * h + k] = updates_ptr[idx] + else: + with ib.new_scope(): + i = te.thread_axis("blockIdx.x") + ib.scope_attr(i, "thread_extent", ni) + j = te.thread_axis("blockIdx.y") + ib.scope_attr(j, "thread_extent", ci) + with ib.for_range(0, hi, name="k") as k: + idx = (i * ci + j) * hi + k + index = indices_ptr[idx] + with ib.if_scope(index < 0): + out_ptr[(i * c + j) * h + (index + h)] = updates_ptr[idx] + with ib.else_scope(): + out_ptr[(i * c + j) * h + index] = updates_ptr[idx] + return ib.get() + + +def gen_ir_4d(data, indices, updates, axis, out): + """Generate scatter ir for 4d inputs + + Parameters + ---------- + data : tir.Tensor + The input data to the operator. + + indices : tir.Tensor + The index locations to update. + + updates : tir.Tensor + The values to update. + + axis : int + The axis to scatter on + + out : tir.Tensor + The output tensor. + + Returns + ------- + ret : tir + The computational ir. 
+ """ + warp_size = tvm.target.Target.current(False).thread_warp_size + + n = data.shape[0] + c = data.shape[1] + h = data.shape[2] + w = data.shape[3] + + ib = tvm.tir.ir_builder.create() + + out_ptr = ib.buffer_ptr(out) + data_ptr = ib.buffer_ptr(data) + with ib.new_scope(): + i = te.thread_axis("blockIdx.x") + ib.scope_attr(i, "thread_extent", n) + j = te.thread_axis("blockIdx.y") + ib.scope_attr(j, "thread_extent", c) + k = te.thread_axis("blockIdx.z") + ib.scope_attr(k, "thread_extent", h) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", warp_size) + with ib.for_range(0, ceil_div(w, warp_size), name="l") as l_: + l = l_ * warp_size + tx + with ib.if_scope(l < w): + idx = ((i * c + j) * h + k) * w + l + out_ptr[idx] = data_ptr[idx] + + indices_ptr = ib.buffer_ptr(indices) + updates_ptr = ib.buffer_ptr(updates) + ni = indices.shape[0] + ci = indices.shape[1] + hi = indices.shape[2] + wi = indices.shape[3] + + if axis == 0: + with ib.new_scope(): + j = te.thread_axis("blockIdx.y") + ib.scope_attr(j, "thread_extent", ci) + k = te.thread_axis("blockIdx.z") + ib.scope_attr(k, "thread_extent", hi) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", warp_size) + with ib.for_range(0, ni, name="i") as i: + with ib.for_range(0, ceil_div(wi, warp_size), name="l") as l_: + l = l_ * warp_size + tx + with ib.if_scope(l < wi): + idx = ((i * ci + j) * hi + k) * wi + l + index = indices_ptr[idx] + with ib.if_scope(index < 0): + out_ptr[(((index + n) * c + j) * h + k) * w + l] = updates_ptr[idx] + with ib.else_scope(): + out_ptr[((index * c + j) * h + k) * w + l] = updates_ptr[idx] + elif axis == 1: + with ib.new_scope(): + i = te.thread_axis("blockIdx.x") + ib.scope_attr(i, "thread_extent", ni) + k = te.thread_axis("blockIdx.z") + ib.scope_attr(k, "thread_extent", hi) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", warp_size) + with ib.for_range(0, ci, name="j") as j: + with ib.for_range(0, ceil_div(wi, warp_size), name="l") as l_: + l = l_ * warp_size + tx + with ib.if_scope(l < wi): + idx = ((i * ci + j) * hi + k) * wi + l + index = indices_ptr[idx] + with ib.if_scope(index < 0): + out_ptr[((i * c + (index + c)) * h + k) * w + l] = updates_ptr[idx] + with ib.else_scope(): + out_ptr[((i * c + index) * h + k) * w + l] = updates_ptr[idx] + elif axis == 2: + with ib.new_scope(): + i = te.thread_axis("blockIdx.x") + ib.scope_attr(i, "thread_extent", ni) + j = te.thread_axis("blockIdx.y") + ib.scope_attr(j, "thread_extent", ci) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", warp_size) + with ib.for_range(0, hi, name="k") as k: + with ib.for_range(0, ceil_div(wi, warp_size), name="l") as l_: + l = l_ * warp_size + tx + with ib.if_scope(l < wi): + idx = ((i * ci + j) * hi + k) * wi + l + index = indices_ptr[idx] + with ib.if_scope(index < 0): + out_ptr[((i * c + j) * h + (index + h)) * w + l] = updates_ptr[idx] + with ib.else_scope(): + out_ptr[((i * c + j) * h + index) * w + l] = updates_ptr[idx] + else: + with ib.new_scope(): + i = te.thread_axis("blockIdx.x") + ib.scope_attr(i, "thread_extent", ni) + j = te.thread_axis("blockIdx.y") + ib.scope_attr(j, "thread_extent", ci) + k = te.thread_axis("blockIdx.z") + ib.scope_attr(k, "thread_extent", hi) + with ib.for_range(0, wi, name="l") as l: + idx = ((i * ci + j) * hi + k) * wi + l + index = indices_ptr[idx] + with ib.if_scope(index < 0): + out_ptr[((i * c + j) * h + k) * w + (index + w)] = updates_ptr[idx] + with ib.else_scope(): + out_ptr[((i * c + j) * h + 
k) * w + index] = updates_ptr[idx] + + return ib.get() + + +def scatter(data, indices, updates, axis=0): + """Update data at positions defined by indices with values in updates + + Parameters + ---------- + data : tir.Tensor + The input data to the operator. + + indices : tir.Tensor + The index locations to update. + + updates : tir.Tensor + The values to update. + + axis : int + The axis to scatter on + + Returns + ------- + ret : tir.Tensor + The computed result. + """ + if axis < 0: + axis += len(data.shape) + assert axis >= 0 + assert axis < len(data.shape) + + rank = len(data.shape) + assert 1 <= rank <= 4, "scatter only supports 1-4 dimensions" + + ir_funcs = { + 1: gen_ir_1d, + 2: gen_ir_2d, + 3: gen_ir_3d, + 4: gen_ir_4d, + } + + out_shape = data.shape + out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") + out = te.extern( + [out_shape], + [data, indices, updates], + lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0]), + dtype=data.dtype, + out_buffers=[out_buf], + name="scatter_gpu", + tag="scatter_gpu", + ) + + return out diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index d18a12f20fa5..e636fe3f0037 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -910,6 +910,7 @@ def verify_reverse_sequence(x_data, seq_lengths, batch_axis, seq_axis, ref_res): ) +@tvm.testing.uses_gpu def test_scatter(): def ref_scatter(data, indices, updates, axis=0): idx = np.indices(indices.shape).reshape(indices.ndim, -1) @@ -935,13 +936,34 @@ def verify_scatter(dshape, ishape, axis=0): indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") ref_res = ref_scatter(data_np, indices_np, updates_np, axis) - # TODO(mbrookhart): expand testing when adding more backend schedules - for target, ctx in [("llvm", tvm.cpu())]: + + for target, ctx in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(data_np, indices_np, updates_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + def verify_dynamic_scatter(dshape, ishape, axis=0): + d = relay.var("d", relay.TensorType([relay.Any() for i in range(len(dshape))], "float32")) + i = relay.var("i", relay.TensorType([relay.Any() for i in range(len(ishape))], "int64")) + u = relay.var("u", relay.TensorType([relay.Any() for i in range(len(ishape))], "float32")) + z = relay.op.scatter(d, i, u, axis) + + func = relay.Function([d, i, u], z) + + data_np = np.random.uniform(size=dshape).astype("float32") + updates_np = np.random.uniform(size=ishape).astype("float32") + indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") + + ref_res = ref_scatter(data_np, indices_np, updates_np, axis) + + for target, ctx in tvm.testing.enabled_targets(): + for kind in ["vm", "debug"]: + mod = tvm.ir.IRModule.from_expr(func) + intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + op_res = intrp.evaluate()(data_np, indices_np, updates_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + verify_scatter((10,), (10,), 0) verify_scatter((10, 5), (10, 5), -2) verify_scatter((10, 5), (10, 5), -1) @@ -950,11 +972,26 @@ def verify_scatter(dshape, ishape, axis=0): verify_scatter((2, 3, 4), (1, 3, 4), 0) verify_scatter((2, 3, 4), (2, 1, 4), 1) verify_scatter((2, 3, 4), (2, 3, 1), 2) + verify_scatter((4, 2, 1), (1, 1, 1), 0) verify_scatter((2, 3, 4, 5), (1, 3, 4, 5), 0)
verify_scatter((6, 3, 4, 5), (2, 3, 4, 5), 1) verify_scatter((2, 3, 8, 5), (2, 3, 1, 1), 2) verify_scatter((16, 16, 4, 5), (16, 16, 4, 5), 3) + verify_dynamic_scatter((10,), (10,), 0) + verify_dynamic_scatter((10, 5), (10, 5), -2) + verify_dynamic_scatter((10, 5), (10, 5), -1) + verify_dynamic_scatter((10, 5), (3, 5), 0) + verify_dynamic_scatter((12, 4), (7, 2), 1) + verify_dynamic_scatter((2, 3, 4), (1, 3, 4), 0) + verify_dynamic_scatter((2, 3, 4), (2, 1, 4), 1) + verify_dynamic_scatter((2, 3, 4), (2, 3, 1), 2) + verify_dynamic_scatter((4, 2, 1), (1, 1, 1), 0) + verify_dynamic_scatter((2, 3, 4, 5), (1, 3, 4, 5), 0) + verify_dynamic_scatter((6, 3, 4, 5), (2, 3, 4, 5), 1) + verify_dynamic_scatter((2, 3, 8, 5), (2, 3, 1, 1), 2) + verify_dynamic_scatter((16, 16, 4, 5), (16, 16, 4, 5), 3) + def test_scatter_add(): def ref_scatter_add(data, indices, updates, axis=0): From 0236c9048d9f6d40cfe5e17ccb30119f43e063d9 Mon Sep 17 00:00:00 2001 From: Haozheng Fan Date: Wed, 28 Oct 2020 21:05:10 +0800 Subject: [PATCH 072/258] [ARITH] Tight bound for floormod (#6771) --- src/arith/const_int_bound.cc | 25 ++++++++++++++++--- .../unittest/test_arith_const_int_bound.py | 12 +++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc index f39ce4b05643..2c01b9143155 100644 --- a/src/arith/const_int_bound.cc +++ b/src/arith/const_int_bound.cc @@ -245,6 +245,23 @@ class ConstIntBoundAnalyzer::Impl } Entry VisitExpr_(const FloorModNode* op) final { + /* let a / b = x + y, where x is integer, y \in [0, 1) + * floormod(a, b) = a - floordiv(a, b) * b + * floordiv(a, b) = x + * floormod(a, b) = a - floordiv(a, b) * b + * = a - x * b + * = a - (a / b - y) * b + * = a - a + y * b + * = y * b + * note that 0 <= y < 1 + * when b > 0, 0 <= b * y < b + * 0 <= b * y <= b - 1 + * when b < 0, b < b * y <= 0 + * b + 1 <= b * y <= 0 + * In all cases, min(0, b + 1) <= b * y <= max(0, b - 1) + * min(0, b_min + 1) <= b * y <= max(0, b_max - 1) + * That is, min(0, b_min + 1) <= floormod(a, b) <= max(0, b_max - 1) + */ Entry a = VisitExpr(op->a); Entry b = VisitExpr(op->b); if (b.min_value > 0) { @@ -259,9 +276,11 @@ class ConstIntBoundAnalyzer::Impl } } else { ICHECK(!b.is_const(0)) << "floormod by zero"; - // mod by negative value is rare, - // and we just use the simpliest rule. 
- return Everything(op->dtype); + int64_t b_min_cap = InfAwareAdd(b.min_value, 1); + int64_t b_max_cap = InfAwareAdd(b.max_value, -1); + return Intersect(MakeBound(std::min(static_cast<int64_t>(0), b_min_cap), + std::max(static_cast<int64_t>(0), b_max_cap)), + Everything(op->dtype)); } } diff --git a/tests/python/unittest/test_arith_const_int_bound.py b/tests/python/unittest/test_arith_const_int_bound.py index badbcbcf1bb3..84fc7fd64614 100644 --- a/tests/python/unittest/test_arith_const_int_bound.py +++ b/tests/python/unittest/test_arith_const_int_bound.py @@ -303,6 +303,17 @@ def test_let_bound(): assert bd.max_value == 2 +def test_floormod_negative_divisor(): + analyzer = tvm.arith.Analyzer() + flm, fld = tvm.te.floormod, tvm.te.floordiv + a, b = te.var("a"), te.var("b") + analyzer.update(a, tvm.arith.ConstIntBound(0, 6)) + analyzer.update(b, tvm.arith.ConstIntBound(-5, 7)) + bd = analyzer.const_int_bound(flm(a, b)) + assert bd.min_value == -4 + assert bd.max_value == 6 + + if __name__ == "__main__": test_let_bound() test_dtype_bound() @@ -318,3 +329,4 @@ def test_let_bound(): test_shift_and_bound() test_mix_index_bound() test_size_var_bound() + test_floormod_negative_divisor() From 1cb1c37fc74847425b4306089dd9d3dcca752e98 Mon Sep 17 00:00:00 2001 From: Ritwik Das Date: Wed, 28 Oct 2020 06:05:47 -0700 Subject: [PATCH 073/258] Fix version check bug (#6784) * Fix version check bug * Update pytorch_utils.py * Update pytorch_utils.py * Update pytorch_utils.py * Update pytorch_utils.py --- python/tvm/relay/frontend/pytorch_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch_utils.py b/python/tvm/relay/frontend/pytorch_utils.py index e0c8f8da7d62..d0f0b9b4b019 100644 --- a/python/tvm/relay/frontend/pytorch_utils.py +++ b/python/tvm/relay/frontend/pytorch_utils.py @@ -20,6 +20,8 @@ def is_version_greater_than(ver): import torch - from packaging import version + import re - return version.parse(torch.__version__) > version.parse(ver) + return "".join(re.findall(r"(\d+\.)(\d+\.)(\d)", torch.__version__)[0]) > "".join( + re.findall(r"(\d+\.)(\d+\.)(\d)", ver)[0] + ) From 9bd74dc351b338e3d5cc5c4cf888497b25b4ddfa Mon Sep 17 00:00:00 2001 From: mbaret <55580676+mbaret@users.noreply.github.com> Date: Wed, 28 Oct 2020 18:51:28 +0000 Subject: [PATCH 074/258] [API] Added remove_global_func to the Python API (#6787) This is useful for unregistering functions after a test. Change-Id: Ic39499aa8f36bfe5470bc1f058ad3b96cf52b49c --- include/tvm/runtime/c_runtime_api.h | 6 ++++++ python/tvm/_ffi/registry.py | 11 +++++++++++ src/runtime/registry.cc | 6 ++++++ 3 files changed, 23 insertions(+) diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index e25394a88b5a..aac49c198c72 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -366,6 +366,12 @@ TVM_DLL int TVMFuncGetGlobal(const char* name, TVMFunctionHandle* out); */ TVM_DLL int TVMFuncListGlobalNames(int* out_size, const char*** out_array); +/*! + * \brief Remove a global function. + * \param name The name of the function. + */ +TVM_DLL int TVMFuncRemoveGlobal(const char* name); + // Array related apis for quick proptyping /*!
* \brief Allocate a nd-array's memory, diff --git a/python/tvm/_ffi/registry.py b/python/tvm/_ffi/registry.py index 6637cd174391..677ca5d8de8d 100644 --- a/python/tvm/_ffi/registry.py +++ b/python/tvm/_ffi/registry.py @@ -262,6 +262,17 @@ def _list(name, func): return fdict +def remove_global_func(name): + """Remove a global function by name + + Parameters + ---------- + name : str + The name of the global function + """ + check_call(_LIB.TVMFuncRemoveGlobal(c_str(name))) + + def _get_api(f): flocal = f flocal.is_global = True diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc index 6e74dc354259..a65235090bfd 100644 --- a/src/runtime/registry.cc +++ b/src/runtime/registry.cc @@ -146,3 +146,9 @@ int TVMFuncListGlobalNames(int* out_size, const char*** out_array) { *out_size = static_cast<int>(ret->ret_vec_str.size()); API_END(); } + +int TVMFuncRemoveGlobal(const char* name) { + API_BEGIN(); + tvm::runtime::Registry::Remove(name); + API_END(); +} From afe10fa78e49d575a702df61e945a640f2c87391 Mon Sep 17 00:00:00 2001 From: Rohan Mukherjee Date: Wed, 28 Oct 2020 15:32:35 -0500 Subject: [PATCH 075/258] [ManifestAlloc] Handle TupleType inputs in CheckReshapeOnly (#6776) * Changes in CheckReshapeOnly to support TupleTypes as input This arises inside the ManifestAlloc pass in relay.vm.compile * [ManifestAlloc] Handle TupleType inputs in CheckReshapeOnly --- python/tvm/relay/transform/memory_alloc.py | 5 +++++ tests/python/relay/test_vm.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/python/tvm/relay/transform/memory_alloc.py b/python/tvm/relay/transform/memory_alloc.py index f611c1cc14c1..66528c861788 100644 --- a/python/tvm/relay/transform/memory_alloc.py +++ b/python/tvm/relay/transform/memory_alloc.py @@ -84,6 +84,11 @@ def visit_call(self, call): for arg in call.args: self.visit(arg) + def visit_var(self, var): + var_type = var.checked_type + if not isinstance(var_type, ty.TensorType): + self.reshape_only = False + def is_reshape_only(func): """Check if the primitive function contains only reshape ops.""" diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 038b5c5ed9e1..92d6e8e55db4 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -754,5 +754,21 @@ def test_vm_reshape_tensor(): check_result([x_np, y_np], x_np.reshape([8, 2, 8]), mod) +def test_vm_reshape_tuple(x_shape=(1, 4, 2), y_shape=(1, 2, 10)): + tup = relay.var( + "tup", + type_annotation=relay.TupleType([relay.TensorType(x_shape), relay.TensorType(y_shape)]), + ) + out = relay.reshape(relay.TupleGetItem(tup, 0), (1, -1)) + f = relay.Function([tup], out) + + x_data = np.random.uniform(size=x_shape).astype("float32") + y_data = np.random.uniform(size=y_shape).astype("float32") + + for tgt, ctx in tvm.testing.enabled_targets(): + res = veval(f, (x_data, y_data), ctx=ctx, target=tgt) + tvm.testing.assert_allclose(res.asnumpy(), np.reshape(x_data, (1, -1))) + + if __name__ == "__main__": pytest.main([__file__]) From 19d3d88f5071d459b9b8a94c8fc3105f0c8f2cdc Mon Sep 17 00:00:00 2001 From: Altan Haan Date: Wed, 28 Oct 2020 14:26:33 -0700 Subject: [PATCH 076/258] [Relay][Training] Add more missing gradients (#6767) --- python/tvm/relay/op/_tensor.py | 1 + python/tvm/relay/op/_tensor_grad.py | 141 ++++++++++++++++++++- python/tvm/relay/testing/__init__.py | 6 +- src/relay/analysis/type_solver.cc | 2 +- tests/python/relay/test_op_grad_level1.py | 6 + tests/python/relay/test_op_grad_level10.py | 5 + tests/python/relay/test_op_grad_level3.py | 44 ++++++- 7
files changed, 201 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 776060d7733e..5675b28c713a 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -230,6 +230,7 @@ def elemwise_shape_func(attrs, inputs, _): register_shape_func("cast", False, elemwise_shape_func) +register_shape_func("cast_like", False, elemwise_shape_func) register_shape_func("zeros", False, no_data_full_shape_func) register_shape_func("zeros_like", False, elemwise_shape_func) register_shape_func("ones", False, no_data_full_shape_func) diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index 85168a561399..9f4f20c9000c 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -20,8 +20,11 @@ from tvm.topi.nn.util import get_pad_tuple from tvm.topi.util import get_const_tuple +from tvm.error import OpError -from ..expr import Tuple, TupleGetItem, const +from ..expr import Tuple, TupleGetItem, const, Var +from ..ty import TensorType +from ..loops import while_loop from . import nn as _nn from .op import register_gradient from .reduce import sum as _sum @@ -40,6 +43,7 @@ equal, shape_of, log, + concatenate, ) from .transform import ( broadcast_to_like, @@ -55,6 +59,10 @@ repeat, expand_dims, full_like, + split, + squeeze, + strided_set, + arange, ) @@ -665,3 +673,134 @@ def cross_entropy_with_logits_grad(orig, grad): batch_size = take(shape, const(0, dtype="int32"), axis=0) grad = grad / batch_size.astype(x.checked_type.dtype) return [-grad * y, -grad * x] + + +@register_gradient("take") +def take_grad(orig, grad): + """ + Returns the gradient of take. + """ + + def make_scalar_tensor(v): + if isinstance(v, int): + v = const(v, dtype="int32") + return reshape(v, (1,)) + + # TODO(@altanh): we currently assume indices are in range + data, indices = orig.args + axis = orig.attrs.axis + zero, one = map(make_scalar_tensor, [0, 1]) + data_grad = zeros_like(data) + try: + data_shape = data.checked_type.concrete_shape + except TypeError as ty_err: + raise OpError("currently take_grad only supports data with concrete shape") from ty_err + if axis is None: + axis = 0 + data_grad = reshape(data_grad, (-1,)) + data_shape = 1 + for dim in data.checked_type.concrete_shape: + data_shape *= dim + data_shape = (data_shape,) + else: + axis = int(axis) + strides = [1] * len(data_shape) + + if len(indices.checked_type.shape) == 0: + # axis on grad has been squeezed in this case + num_indices = one + indices = reshape(indices, (1,)) + grad = expand_dims(grad, int(axis)) + elif len(indices.checked_type.shape) == 1: + num_indices = take(shape_of(indices), zero, axis=0) + else: + raise OpError("take_grad only supports scalar or 1D indices") + + def loop_cond(data_grad, i): + return squeeze(less(i, num_indices)) + + def loop_body(data_grad, i): + index = take(indices, i, axis=0) + grad_slice = take(grad, i, axis=axis) + begin, end = [], [] + for ax, size in enumerate(data_shape): + size = make_scalar_tensor(size) + begin.append(zero if ax != axis else index) + end.append(size if ax != axis else index + one) + begin, end = concatenate(begin, axis=0), concatenate(end, axis=0) + # data_grad[:,...,index at axis,...,:] += grad_slice + update = strided_slice(data_grad, begin, end, strides=strides) + update = update + grad_slice # no need to expand grad_slice since i has shape (1,) + next_data_grad = strided_set(data_grad, update, begin, end, strides=strides) + return (next_data_grad, i + one) 
+ + loop_vars = [ + Var("data_grad", type_annotation=TensorType(data_shape, data.checked_type.dtype)), + Var("i", type_annotation=TensorType((1,), "int32")), + ] + + loop = while_loop(loop_cond, loop_vars, loop_body) + result = loop(data_grad, zero) + data_grad = TupleGetItem(result, 0) + + if orig.attrs.axis is None: + data_grad = reshape_like(data_grad, data) + + return [data_grad, zeros_like(orig.args[1])] + + +@register_gradient("contrib_reverse_reshape") +def reverse_reshape_grad(orig, grad): + """ + Returns the gradient of reverse_reshape (same as reshape). + """ + return [reshape_like(grad, orig.args[0])] + + +@register_gradient("stack") +def stack_grad(orig, grad): + """ + Returns grad split across stacked inputs. + """ + stack_axis = int(orig.attrs.axis) + sections = len(orig.args[0].checked_type.fields) + splits = split(grad, sections, stack_axis) + splits = Tuple([squeeze(x, axis=[stack_axis]) for x in splits]) + return [splits] + + +@register_gradient("squeeze") +def squeeze_grad(orig, grad): + """ + Returns grad expanded to input size. + """ + # this should work, can't use expand_dims since we lose + # squeeze information when axis=None + return [reshape_like(grad, orig.args[0])] + + +@register_gradient("expand_dims") +def expand_dims_grad(orig, grad): + """ + Returns grad squeezed on expanded dims. + """ + axis = int(orig.attrs.axis) + for _ in range(orig.attrs.num_newaxis): + grad = squeeze(grad, axis=[axis]) + return [grad] + + +@register_gradient("arange") +def arange_grad(orig, grad): + """ + Returns the gradient of arange. + """ + start, stop, step = orig.args + length = take(shape_of(orig), const(0, dtype="int32"), axis=0) + + grad_start = cast_like(_sum(grad), start) + grad_stop = zeros_like(stop) + grad_step = cast_like(arange(length, dtype="int32"), grad) * grad + grad_step = cast_like(_sum(grad_step), step) + + return [grad_start, grad_stop, grad_step] diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index 6eb71b581ab2..9c87f2795e5c 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -64,7 +64,11 @@ def run_infer_type(expr): def _np_randn_from_type(t, scale=1, mean=0): - return (mean + (scale * np.random.randn(*(int(d) for d in t.shape)))).astype(t.dtype) + res = mean + (scale * np.random.randn(*(int(d) for d in t.shape))) + # if t.shape == (), then randn returns a scalar so we need to wrap for dtype conversion + if np.isscalar(res): + res = np.array(res) + return res.astype(t.dtype) def check_grad( diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc index 55f736895018..8f14b557dc54 100644 --- a/src/relay/analysis/type_solver.cc +++ b/src/relay/analysis/type_solver.cc @@ -615,7 +615,7 @@ bool TypeSolver::Solve() { rnode->resolved = resolved; } catch (const Error& err) { - this->diag_ctx_.Emit(Diagnostic::Error(rnode->span) << "err"); + this->diag_ctx_.Emit(Diagnostic::Error(rnode->span) << err.what()); rnode->resolved = false; } catch (const dmlc::Error& e) { ICHECK(false) << e.what(); diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py index c0270eae80d2..cac07c437a42 100644 --- a/tests/python/relay/test_op_grad_level1.py +++ b/tests/python/relay/test_op_grad_level1.py @@ -144,5 +144,11 @@ def test_bias_add_grad(): verify_bias_add((4, 8), (8,)) +def test_expand_dims_grad(): + data = relay.var("data", shape=(2, 3), dtype="float64") + fwd_func = relay.Function([data], relay.expand_dims(data, axis=1, 
num_newaxis=2)) + check_grad(fwd_func) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_op_grad_level10.py b/tests/python/relay/test_op_grad_level10.py index 462a75255f90..4a6ffb933881 100644 --- a/tests/python/relay/test_op_grad_level10.py +++ b/tests/python/relay/test_op_grad_level10.py @@ -67,5 +67,10 @@ def test_batch_matmul_grad(): check_grad(relay.Function([x, y], relay.op.nn.batch_matmul(x, y))) +def test_reverse_reshape_grad(): + x = relay.var("x", shape=(3, 4, 5), dtype="float64") + check_grad(relay.Function([x], relay.op.reverse_reshape(x, (-1, 0)))) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py index 0b4f8920aa5c..9c27afd87205 100644 --- a/tests/python/relay/test_op_grad_level3.py +++ b/tests/python/relay/test_op_grad_level3.py @@ -20,7 +20,7 @@ import tvm from tvm import te from tvm import relay -from tvm.relay.testing import check_grad, run_infer_type +from tvm.relay.testing import check_grad, run_infer_type, _np_randn_from_type from tvm.relay.transform import gradient import tvm.testing @@ -75,5 +75,47 @@ def test_copy_grad(): check_grad(fwd_func) +def test_take_grad(): + data_dtype = relay.TensorType((3, 4, 5), "float64") + data = relay.var("data", data_dtype) + indices = relay.var("indices", relay.TensorType((relay.Any(),), "int32")) + inputs = [_np_randn_from_type(data_dtype, scale=1e-5), np.array([1, 2], dtype="int32")] + test_inputs = [inputs[0]] + + # take on axis + fwd_func = relay.Function([data, indices], relay.take(data, indices, axis=1)) + check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs) + + # take on flattened + fwd_func = relay.Function([data, indices], relay.take(data, indices, axis=None)) + check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs) + + +def test_stack_grad(): + args = [relay.var(c, shape=(2, 3, 4), dtype="float64") for c in "xyz"] + fwd_func = relay.Function(args, relay.stack(args, axis=0)) + check_grad(fwd_func) + + +def test_squeeze_grad(): + data = relay.var("data", shape=(2, 1, 1, 3, 4, 1), dtype="float64") + fwd_func = relay.Function([data], relay.squeeze(data)) + fwd_func_subset = relay.Function([data], relay.squeeze(data, axis=[1, -1])) + check_grad(fwd_func) + check_grad(fwd_func_subset) + + +def test_arange_grad(): + # TODO: testing arange numerically is strange because two-sided approx can + # produce different output shapes + dtype = "float64" + start = relay.var("start", relay.TensorType((), dtype)) + stop = relay.var("stop", relay.TensorType((), dtype)) + step = relay.var("step", relay.TensorType((), dtype)) + values = [np.array(v, dtype=dtype) for v in [2.5, 9.5, 1.8]] + fwd_func = relay.Function([start, stop, step], relay.arange(start, stop, step, dtype)) + check_grad(fwd_func, inputs=values) + + if __name__ == "__main__": pytest.main() From 220248be8ffb7ec1f7d8f3c14b39233e8a6c3a51 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Wed, 28 Oct 2020 18:12:21 -0700 Subject: [PATCH 077/258] [FIX][AUTOTVM] Make autotvm work with spawn (#6790) Like #6671 this PR fixes autotvm when using the spawn start method for multiprocessing. I've added some tests to make sure that things work with spawn in the CI. 
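As an illustration of the failure mode this addresses, here is a minimal, self-contained sketch (the helper names are made up for the example, not taken from the codebase): under the "spawn" start method, worker processes re-import modules and rely on the standard pickler, which cannot serialize locally defined task functions by reference, whereas cloudpickle serializes them by value:

    import multiprocessing
    import pickle

    import cloudpickle


    def make_task_func():
        # A locally defined callable, similar to the template functions
        # that autotvm registers at runtime.
        def task_func(x):
            return x * x

        return task_func


    if __name__ == "__main__":
        multiprocessing.set_start_method("spawn", force=True)
        fn = make_task_func()
        try:
            pickle.dumps(fn)  # pickles by reference; fails for local objects
        except (pickle.PicklingError, AttributeError) as err:
            print("stdlib pickle failed:", err)
        # cloudpickle captures the function body itself, so a spawned
        # worker can rebuild it without importing it by name.
        restored = pickle.loads(cloudpickle.dumps(fn))
        assert restored(3) == 9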
--- python/tvm/autotvm/task/task.py | 14 ++++-- .../tvm/autotvm/tuner/xgboost_cost_model.py | 50 +++++++++++++------ .../unittest/test_autotvm_index_tuner.py | 14 ++++++ tests/python/unittest/test_autotvm_measure.py | 15 ++++++ .../unittest/test_autotvm_xgboost_model.py | 15 ++++++ .../auto_scheduler/tune_conv2d_layer_cuda.py | 4 ++ tutorials/auto_scheduler/tune_matmul_x86.py | 4 ++ tutorials/autotvm/tune_conv2d_cuda.py | 4 ++ tutorials/autotvm/tune_relay_arm.py | 4 ++ tutorials/autotvm/tune_relay_cuda.py | 4 ++ tutorials/autotvm/tune_relay_mobile_gpu.py | 4 ++ tutorials/autotvm/tune_relay_x86.py | 4 ++ tutorials/autotvm/tune_simple_template.py | 4 ++ 13 files changed, 121 insertions(+), 19 deletions(-) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index a7cb9a095765..8822ba971e4c 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -23,15 +23,14 @@ """ import numpy as np -from tvm.target import Target from tvm import runtime from tvm.ir import container +from tvm.target import Target +from tvm.te import placeholder, tensor from tvm.tir import expr -from tvm.te import tensor, placeholder - from ..util import get_const_int, get_const_tuple -from .dispatcher import DispatchContext, ApplyConfig +from .dispatcher import ApplyConfig, DispatchContext from .space import ConfigSpace @@ -173,6 +172,8 @@ def __getstate__(self): # some unpickable local task functions. # So we only pickle the name of the function # and restore the function by name when unpickling it. + import cloudpickle # pylint: disable=import-outside-toplevel + return { "name": self.name, "args": self.args, @@ -181,14 +182,17 @@ def __getstate__(self): "flop": self.flop, "target": self.target, "target_host": self.target_host, + "func": cloudpickle.dumps(self.func), } def __setstate__(self, state): + import cloudpickle # pylint: disable=import-outside-toplevel + self.name = state["name"] self.args = state["args"] self.kwargs = state["kwargs"] self.config_space = state["config_space"] - self.func = _lookup_task(state["name"]) + self.func = cloudpickle.loads(state["func"]) self.flop = state["flop"] self.target = state["target"] self.target_host = state["target_host"] diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index 7b9df1c99373..f66764c42520 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -153,7 +153,10 @@ def _reset_pool(self, space, target, task): self._close_pool() - # use global variable to pass common arguments + # Use global variable to pass common arguments. This is only used when + # new processes are started with fork. We have to set the globals + # before we create the pool, so that processes in the pool get the + # correct globals. 
global _extract_space, _extract_target, _extract_task _extract_space = space _extract_target = target @@ -324,7 +327,12 @@ def _get_feature(self, indexes): if need_extract: pool = self._get_pool() - feas = pool.map(self.feature_extract_func, need_extract) + # If we are forking, we can pass arguments in globals for better performance + if multiprocessing.get_start_method(False) == "fork": + feas = pool.map(self.feature_extract_func, need_extract) + else: + args = [(self.space.get(x), self.target, self.task) for x in need_extract] + feas = pool.map(self.feature_extract_func, args) for i, fea in zip(need_extract, feas): fea_cache[i] = fea @@ -344,18 +352,24 @@ def __del__(self): self._close_pool() +# Global variables for passing arguments to extract functions. _extract_space = None _extract_target = None _extract_task = None -def _extract_itervar_feature_index(index): +def _extract_itervar_feature_index(args): """extract iteration var feature for an index in extract_space""" try: - config = _extract_space.get(index) - with _extract_target: - sch, args = _extract_task.instantiate(config) - fea = feature.get_itervar_feature_flatten(sch, args, take_log=True) + if multiprocessing.get_start_method(False) == "fork": + config = _extract_space.get(args) + with _extract_target: + sch, fargs = _extract_task.instantiate(config) + else: + config, target, task = args + with target: + sch, fargs = task.instantiate(config) + fea = feature.get_itervar_feature_flatten(sch, fargs, take_log=True) fea = np.concatenate((fea, list(config.get_other_option().values()))) return fea except Exception: # pylint: disable=broad-except @@ -381,10 +395,13 @@ def _extract_itervar_feature_log(arg): return None -def _extract_knob_feature_index(index): +def _extract_knob_feature_index(args): """extract knob feature for an index in extract_space""" try: - config = _extract_space.get(index) + if multiprocessing.get_start_method(False) == "fork": + config = _extract_space.get(args) + else: + config = args[0] return config.get_flatten_feature() except Exception: # pylint: disable=broad-except return None @@ -408,13 +425,18 @@ def _extract_knob_feature_log(arg): return None -def _extract_curve_feature_index(index): +def _extract_curve_feature_index(args): """extract sampled curve feature for an index in extract_space""" try: - config = _extract_space.get(index) - with _extract_target: - sch, args = _extract_task.instantiate(config) - fea = feature.get_buffer_curve_sample_flatten(sch, args, sample_n=20) + if multiprocessing.get_start_method(False) == "fork": + config = _extract_space.get(args) + with _extract_target: + sch, fargs = _extract_task.instantiate(config) + else: + config, target, task = args + with target: + sch, fargs = task.instantiate(config) + fea = feature.get_buffer_curve_sample_flatten(sch, fargs, sample_n=20) fea = np.concatenate((fea, list(config.get_other_option().values()))) return np.array(fea) except Exception: # pylint: disable=broad-except diff --git a/tests/python/unittest/test_autotvm_index_tuner.py b/tests/python/unittest/test_autotvm_index_tuner.py index 05f12118e6af..c433d8fb7297 100644 --- a/tests/python/unittest/test_autotvm_index_tuner.py +++ b/tests/python/unittest/test_autotvm_index_tuner.py @@ -16,6 +16,7 @@ # under the License. 
"""Test index based tuners""" +import multiprocessing from test_autotvm_common import DummyRunner, get_sample_task from tvm import autotvm from tvm.autotvm.tuner import GridSearchTuner, RandomTuner @@ -43,6 +44,18 @@ def test_gridsearch_tuner(): assert not tuner.has_next() +def grid_search_spawn(): + assert multiprocessing.get_spawn_method(False) == "spawn" + test_gridsearch_tuner() + + +def test_grid_search_tuner_spawn(): + ctx = multiprocessing.get_context("spawn") + p = ctx.Process(target=test_gridsearch_tuner) + p.start() + p.join() + + def test_random_tuner(): """Test RandomTuner""" @@ -65,4 +78,5 @@ def test_random_tuner(): if __name__ == "__main__": test_gridsearch_tuner() + test_gridsearch_tuner_spawn() test_random_tuner() diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index c8760d2be1b4..1a18d6122bf0 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -16,6 +16,7 @@ # under the License. """Test builder and runner""" import logging +import multiprocessing import time import numpy as np @@ -46,6 +47,19 @@ def test_task_tuner_without_measurement(): assert tuner.best_flops > 1 +def task_tuner_spawn(): + assert multiprocessing.get_start_method(False) == "spawn" + test_task_tuner_without_measurement() + + +def test_task_tuner_without_measurement_spawn(): + # Subprocesses inherit the spawn method of their parents + ctx = multiprocessing.get_context("spawn") + p = ctx.Process(target=task_tuner_spawn) + p.start() + p.join() + + def test_check_correctness(): task, target = get_sample_task() @@ -77,4 +91,5 @@ def _callback_wrong(tuner, measure_inputs, measure_results): logging.basicConfig(level=logging.INFO) test_task_tuner_without_measurement() + test_task_tuner_without_measurement_spawn() test_check_correctness() diff --git a/tests/python/unittest/test_autotvm_xgboost_model.py b/tests/python/unittest/test_autotvm_xgboost_model.py index 5789a9fad4d5..58b2a4d66344 100644 --- a/tests/python/unittest/test_autotvm_xgboost_model.py +++ b/tests/python/unittest/test_autotvm_xgboost_model.py @@ -16,6 +16,7 @@ # under the License. import time +import multiprocessing import numpy as np import tvm @@ -43,6 +44,19 @@ def test_fit(): upper_model.fit(xs, ys, plan_size=32) +def fit_spawn(): + assert multiprocessing.get_start_method(False) == "spawn" + test_fit() + + +def test_fit_spawn(): + # Subprocesses inherit the spawn method of their parents + ctx = multiprocessing.get_context("spawn") + p = ctx.Process(target=test_fit) + p.start() + p.join() + + def test_tuner(): task, target = get_sample_task() records = get_sample_records(n=100) @@ -53,4 +67,5 @@ def test_tuner(): if __name__ == "__main__": test_fit() + test_fit_spawn() test_tuner() diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index 10a2d1b44144..42273bf72891 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -30,6 +30,10 @@ find a good schedule in the space. We use a convolution layer as an example in this tutorial. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. 
""" import os diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 81f2e71ff8f7..0f2ebe0e09a4 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -27,6 +27,10 @@ find a good schedule in the space. We use matrix multiplication as an example in this tutorial. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ import os diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index ce9c19860ff4..b307077905d3 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -22,6 +22,10 @@ This is an advanced tutorial for writing high performance tunable template for NVIDIA GPU. By running auto-tuner on this template, we can outperform the vendor provided library CuDNN in many cases. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ ###################################################################### diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index f024ba4f201a..31fda54a9a7e 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -35,6 +35,10 @@ We also released pre-tuned parameters for some arm devices. You can go to `ARM CPU Benchmark `_ to see the results. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ ###################################################################### diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 4636103a22e2..e86430767b31 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -33,6 +33,10 @@ We also released pre-tuned parameters for some NVIDIA GPUs. You can go to `NVIDIA GPU Benchmark `_ to see the results. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ ###################################################################### diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 61254662c463..9a112e134f4f 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -33,6 +33,10 @@ We also released pre-tuned parameters for some arm devices. You can go to `Mobile GPU Benchmark `_ to see the results. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ ###################################################################### diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 1dd947fefd25..b1b7ca29e46a 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -23,6 +23,10 @@ This is a tutorial about how to tune convolution neural network for x86 CPU. 
+ +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ import os import numpy as np diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index 357abf19a09c..b5167b3c72ab 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -26,6 +26,10 @@ The second step is running a search algorithm to explore through this space. In this tutorial, you can learn how to perform these two steps in TVM. The whole workflow is illustrated by a matrix multiplication example. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. """ ###################################################################### From 38a8be69534ee410de573c5722d2b34e253b43f8 Mon Sep 17 00:00:00 2001 From: masahi Date: Thu, 29 Oct 2020 13:52:59 +0900 Subject: [PATCH 078/258] [Torch, QNN] Support dynamic quantization flow to enable importing quantized transformer models (#6782) * add stub and test * per channel quantize * calculate qparam correctly * import qbert working * support batched qdense * test batched input * fix mkl offloading of batch matmul * reduce range become True in torch 1.6 * fix for 1.6 * Revert "fix mkl offloading of batch matmul" This reverts commit cd90aa783688c68e1b12633eea4d2690d9e3a5a5. * fix merge * fix * lint fix * fix black * more black fix * fix version check for 1.5.1 * disable assert on v1.4 (strange pytorch issue) * minor fix * use dequantize Co-authored-by: masa --- python/tvm/relay/frontend/pytorch.py | 5 +- python/tvm/relay/frontend/qnn_torch.py | 81 ++++++++++++++++++++--- src/relay/qnn/op/dense.cc | 35 +++++++--- src/relay/qnn/op/quantize.cc | 23 +++---- src/relay/qnn/utils.h | 12 ++++ tests/python/frontend/pytorch/qnn_test.py | 55 ++++++++++++--- 6 files changed, 172 insertions(+), 39 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 52761647d15b..8d164314ecc8 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -2741,7 +2741,7 @@ def _run_jit_passes(graph): # pylint: disable=c-extension-no-member import torch - if is_version_greater_than("1.5.0"): + if is_version_greater_than("1.5.1"): # This is required for torchvision detection models from 1.6 above # It is the same as _jit_pass_inline, except that it has some special # case behaviors for some ops such as aten::__interpolate() @@ -3383,7 +3383,8 @@ def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dt ret_name = _get_input_names(graph.return_node()) # For quantized models - if "aten::quantize_per_tensor" in op_names: + quantized_ops = set(["aten::quantize_per_tensor", "quantized::linear_dynamic"]) + if len(quantized_ops.intersection(set(op_names))) > 0: weight_quant_params = qnn_torch.get_weight_quant_params(script_module) qnn_torch.add_input_quant_params_to_op_inputs(graph) qnn_torch.add_quant_params_to_outputs(outputs, packed_param_map, weight_quant_params) diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py index 3f8d495511dd..e3431043bc86 100644 --- a/python/tvm/relay/frontend/qnn_torch.py +++ b/python/tvm/relay/frontend/qnn_torch.py @@ -173,7 +173,7 @@ def _get_quant_param_for_input(input_value): # 6th 
and 7th arg are output scale and zp respectively. # PyTorch 1.6 changed qconv API - if is_version_greater_than("1.5.0"): + if is_version_greater_than("1.5.1"): qconv_indices = (2, 3) else: qconv_indices = (6, 7) @@ -575,13 +575,7 @@ def _impl(inputs, _): ) return _do_bias_and_requantize( - conv_out, - bias, - input_scale, - weight_scale, - output_scale, - output_zero_point, - with_relu, + conv_out, bias, input_scale, weight_scale, output_scale, output_zero_point, with_relu ) return _impl @@ -826,6 +820,76 @@ def _impl(inputs, _): return _impl +def _linear_dynamic(): + def _calculate_qparam(inp): + # reference ATen/native/quantized/cpu/qlinear_dynamic.cpp + # ChooseQuantizationParams function + mn = _op.min(inp) + mx = _op.max(inp) + + # Ensure that the interval contains 0 + mn = _op.minimum(mn, _op.const(0.0, dtype="float32")) + mx = _op.maximum(mx, _op.const(0.0, dtype="float32")) + + qmax = 255 + + # reduce_range became True in v1.6 + if is_version_greater_than("1.5.1"): + qmax = 127 + + scale = (mx - mn) / _expr.const(qmax, dtype="float32") + + zero_point_from_min = -(mn / scale) + zero_point = _op.cast(_op.round(_op.clip(zero_point_from_min, 0.0, qmax)), "int32") + + return scale, zero_point + + def _impl(inputs, _): + weight = inputs[1][0] + weight_scale = inputs[1][1] + weight_zero_point = inputs[1][2] + + inp = inputs[0] + + input_scale, input_zero_point = _calculate_qparam(inp) + qinp = relay.qnn.op.quantize(inp, input_scale, input_zero_point, out_dtype="uint8") + + data_shape = infer_shape(inp) + + if len(data_shape) > 2: + qinp = _op.reverse_reshape(qinp, [-1, 0]) + + weight_shape = infer_shape(weight) + units = weight_shape[0] + dense = relay.qnn.op.dense( + qinp, + weight, + input_zero_point, + weight_zero_point, + input_scale, + weight_scale, + units=units, + ) + bias_var = inputs[1][3] + + dequant_scale = input_scale * weight_scale + dense_out = relay.qnn.op.dequantize( + dense, dequant_scale, input_zero_point=relay.const(0, "int32"), axis=1 + ) + + if len(data_shape) > 2: + new_shape = list(data_shape[:-1]) + new_shape.append(units) + dense_out = _op.reshape(dense_out, new_shape) + + if bias_var is not None: + return dense_out + bias_var + + return dense_out + + return _impl + + convert_map = { "aten::quantize_per_tensor": _quantize_per_tensor(), "quantized::conv2d_relu": _quantized_conv2d(with_relu=True), @@ -841,4 +905,5 @@ def _impl(inputs, _): "quantized::add_scalar": _add_scalar(), "quantized::mul_scalar": _mul_scalar(), "quantized::relu6": _relu6(), + "quantized::linear_dynamic": _linear_dynamic(), } diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index 62988c8cc52f..3602995b8f16 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -99,6 +99,18 @@ Expr DenseFourthTerm(int input_zero_point_int, int kernel_zero_point_int, int re return MakeConstantScalar(DataType::Int(32), scalar_term); } +Expr DenseFourthTerm(const Expr& input_zero_point, const Expr& kernel_zero_point, + int reduction_dim_size) { + auto reduction_dim = MakeConstantScalar(DataType::Int(32), reduction_dim_size); + return Multiply(Multiply(input_zero_point, kernel_zero_point), reduction_dim); +} + +Expr DenseCombineTerms(const Expr& term1, const Expr& term2, const Expr& term3, const Expr& term4) { + auto data_term = Subtract(term1, term2); + // Putting constant terms together, so that constant folding can fold it. + auto const_term = Subtract(term4, term3); + return Add(data_term, const_term); +} /* * \brief Forward rewrite the qnn dense op. 
* \param attrs The QNN dense attrs. @@ -144,14 +156,24 @@ Expr QnnDenseCanonicalize(const Attrs& attrs, const Array<Expr>& new_args, const auto* qnn_dense_attrs = attrs.as<DenseAttrs>(); + auto term1 = DenseFirstTerm(quantized_data, quantized_kernel, qnn_dense_attrs); + auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point); + auto term3 = DenseThirdTerm(quantized_kernel, input_zero_point); + // Extract the integer zero points. - auto input_zero_point_int = GetScalarFromConstant<int>(input_zero_point); auto kernel_zero_point_int = GetScalarFromConstant<int>(kernel_zero_point); + if (!IsConstScalar(input_zero_point)) { + if (kernel_zero_point_int == 0) { + return Subtract(term1, term3); + } + auto term4 = DenseFourthTerm(input_zero_point, kernel_zero_point, reduction_dim_size); + return DenseCombineTerms(term1, term2, term3, term4); + } + + auto input_zero_point_int = GetScalarFromConstant<int>(input_zero_point); + // Get all the terms as described in the comments. - auto term1 = DenseFirstTerm(quantized_data, quantized_kernel, qnn_dense_attrs); - auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point); - auto term3 = DenseThirdTerm(quantized_kernel, input_zero_point); auto term4 = DenseFourthTerm(input_zero_point_int, kernel_zero_point_int, reduction_dim_size); // Combine those 4 terms depending on the zero points to get the best lowering. @@ -165,10 +187,7 @@ Expr QnnDenseCanonicalize(const Attrs& attrs, const Array<Expr>& new_args, // term 2 and term 4 become zero. return Subtract(term1, term3); } else { - auto data_term = Subtract(term1, term2); - // Putting constant terms together, so that constant folding can fold it. - auto const_term = Subtract(term4, term3); - return Add(data_term, const_term); + return DenseCombineTerms(term1, term2, term3, term4); } } diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc index 0622c96f04a6..9829834f43a3 100644 --- a/src/relay/qnn/op/quantize.cc +++ b/src/relay/qnn/op/quantize.cc @@ -83,20 +83,27 @@ Expr MakeQuantize(Expr data, Expr output_scale, Expr output_zero_point, int axis } Expr QuantizeLower(const Expr& input_tensor, const Expr& output_scale, - const Expr& output_zero_point, const Array<IndexExpr>& input_shape, + const Expr& output_zero_point, const Array<tvm::relay::Type>& types, const QuantizeAttrs* attrs) { + ICHECK_EQ(types.size(), 4); + auto in_type = types[0]; + auto in_tensor_type = in_type.as<TensorTypeNode>(); + ICHECK(in_tensor_type != nullptr) << "Type information missing." + << " Please run infer_type pass."; + Array<IndexExpr> input_shape = in_tensor_type->shape; + const auto out_dtype = attrs->out_dtype; const auto axis = attrs->axis; size_t n_dim = input_shape.size(); auto expanded_output_scale = output_scale; - if (!IsConstScalar(output_scale)) { + if (!IsConstScalar(output_scale) && !IsScalarType(types[1])) { expanded_output_scale = ExpandBiasToMatchAxis(output_scale, n_dim, {axis}); } auto expanded_output_zero_point = output_zero_point; - if (!IsConstScalar(output_zero_point)) { + if (!IsConstScalar(output_zero_point) && !IsScalarType(types[2])) { expanded_output_zero_point = ExpandBiasToMatchAxis(output_zero_point, n_dim, {axis}); } @@ -120,15 +127,7 @@ Expr QuantizeQnnCanonicalize(const Attrs& attrs, const Array<Expr>& new_args, const auto* quantize_attrs = attrs.as<QuantizeAttrs>(); ICHECK(quantize_attrs != nullptr); - // Find input shape. - ICHECK_EQ(types.size(), 4); - auto in_type = types[0]; - auto in_tensor_type = in_type.as<TensorTypeNode>(); - ICHECK(in_tensor_type != nullptr) << "Type information missing."
- << " Please run infer_type pass."; - Array input_shape = in_tensor_type->shape; - - return QuantizeLower(data, output_scale, output_zero_point, input_shape, quantize_attrs); + return QuantizeLower(data, output_scale, output_zero_point, types, quantize_attrs); } RELAY_REGISTER_OP("qnn.quantize") diff --git a/src/relay/qnn/utils.h b/src/relay/qnn/utils.h index ab5c9a4fbbe2..23759a52ec41 100644 --- a/src/relay/qnn/utils.h +++ b/src/relay/qnn/utils.h @@ -179,6 +179,18 @@ static inline bool IsScalarType(const Type& expr_type, const DataType& dtype) { return true; } +/* + * \brief Checks whether an expr type is scalar. + * \param expr_type The type of expr to be checked. + * \return True if the type is a scalar + */ +static inline bool IsScalarType(const Type& expr_type) { + const auto* tensor_type = expr_type.as(); + CHECK(tensor_type) << "Only tensor type can be checked for scalar values. But got" + << AsText(expr_type, false); + return tensor_type->shape.size() == 0; +} + /* * \brief Checks and assigns types to scale and zero points. * \param expr_type The type of expr to be checked. diff --git a/tests/python/frontend/pytorch/qnn_test.py b/tests/python/frontend/pytorch/qnn_test.py index 706f15b9d9d9..1851e31e817f 100644 --- a/tests/python/frontend/pytorch/qnn_test.py +++ b/tests/python/frontend/pytorch/qnn_test.py @@ -27,7 +27,9 @@ from torch.quantization import fuse_modules, QuantWrapper import tvm +import tvm.testing from tvm import relay +from tvm.relay.frontend.pytorch_utils import is_version_greater_than from tvm.contrib.download import download_testdata @@ -197,9 +199,7 @@ def fuse_model(self): # test on quantized::mul_scalar with negative scale class MulScalarNegative(nn.Module): - def __init__( - self, - ): + def __init__(self): super().__init__() self.float_op = nn.quantized.FloatFunctional() self.quant = QuantStub() @@ -337,12 +337,7 @@ def get_transform(): normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) return transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ] + [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize] ) def get_real_image(im_height, im_width): @@ -508,3 +503,45 @@ def test_serialized_modules(): num_identical = np.sum(np.abs(tvm_result - pt_result) < 1e-2) match_ratio = num_identical / float(np.prod(tvm_result.shape)) assert match_ratio > 0.90 + + +def test_quantize_dynamic(): + # A wrapper is required for quantize_dynamic to work correctly + class LinearWrapper(nn.Module): + def __init__(self, in_dim, hidden_dim): + super().__init__() + self.linear = nn.Linear(in_dim, hidden_dim) + + def forward(self, inp): + return self.linear(inp) + + torch.manual_seed(0) + mod = LinearWrapper(16, 32) + + for qconfig in [ + torch.quantization.per_channel_dynamic_qconfig, + torch.quantization.default_dynamic_qconfig, + ]: + for ishape in [(16, 16), (10, 16, 16)]: + qspec = {nn.Linear: qconfig} + qmod = torch.quantization.quantize_dynamic(mod, qconfig_spec=qspec, dtype=torch.qint8) + + inp = torch.randn(*ishape) + script_module = torch.jit.trace(qmod, inp).eval() + + with torch.no_grad(): + pt_result = script_module(inp.clone()).numpy() + + input_name = "input" + runtime = get_tvm_runtime(script_module, "input", inp.shape) + runtime.set_input(input_name, inp.numpy().copy()) + runtime.run() + tvm_result = runtime.get_output(0).asnumpy() + + # Only compare with the PyTorch result for version v1.6 or newer + # Have seen a strange accuracy 
diff --git a/tests/python/frontend/pytorch/qnn_test.py b/tests/python/frontend/pytorch/qnn_test.py
index 706f15b9d9d9..1851e31e817f 100644
--- a/tests/python/frontend/pytorch/qnn_test.py
+++ b/tests/python/frontend/pytorch/qnn_test.py
@@ -27,7 +27,9 @@ from torch.quantization import fuse_modules, QuantWrapper
 
 import tvm
+import tvm.testing
 from tvm import relay
+from tvm.relay.frontend.pytorch_utils import is_version_greater_than
 from tvm.contrib.download import download_testdata
 
@@ -197,9 +199,7 @@ def fuse_model(self):
 
 # test on quantized::mul_scalar with negative scale
 class MulScalarNegative(nn.Module):
-    def __init__(
-        self,
-    ):
+    def __init__(self):
         super().__init__()
         self.float_op = nn.quantized.FloatFunctional()
         self.quant = QuantStub()
@@ -337,12 +337,7 @@ def get_transform():
     normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
     return transforms.Compose(
-        [
-            transforms.Resize(256),
-            transforms.CenterCrop(224),
-            transforms.ToTensor(),
-            normalize,
-        ]
+        [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize]
     )
 
 def get_real_image(im_height, im_width):
@@ -508,3 +503,45 @@ def test_serialized_modules():
     num_identical = np.sum(np.abs(tvm_result - pt_result) < 1e-2)
     match_ratio = num_identical / float(np.prod(tvm_result.shape))
     assert match_ratio > 0.90
+
+
+def test_quantize_dynamic():
+    # A wrapper is required for quantize_dynamic to work correctly
+    class LinearWrapper(nn.Module):
+        def __init__(self, in_dim, hidden_dim):
+            super().__init__()
+            self.linear = nn.Linear(in_dim, hidden_dim)
+
+        def forward(self, inp):
+            return self.linear(inp)
+
+    torch.manual_seed(0)
+    mod = LinearWrapper(16, 32)
+
+    for qconfig in [
+        torch.quantization.per_channel_dynamic_qconfig,
+        torch.quantization.default_dynamic_qconfig,
+    ]:
+        for ishape in [(16, 16), (10, 16, 16)]:
+            qspec = {nn.Linear: qconfig}
+            qmod = torch.quantization.quantize_dynamic(mod, qconfig_spec=qspec, dtype=torch.qint8)
+
+            inp = torch.randn(*ishape)
+            script_module = torch.jit.trace(qmod, inp).eval()
+
+            with torch.no_grad():
+                pt_result = script_module(inp.clone()).numpy()
+
+            input_name = "input"
+            runtime = get_tvm_runtime(script_module, "input", inp.shape)
+            runtime.set_input(input_name, inp.numpy().copy())
+            runtime.run()
+            tvm_result = runtime.get_output(0).asnumpy()
+
+            # Only compare with the PyTorch result for version v1.6 or newer.
+            # We have seen a strange accuracy problem with PyTorch 1.4 and 1.5:
+            # even with the manual random seed set, the same PyTorch version can
+            # output slightly different results depending on the environment.
+            # Outputs from v1.6 seem reliable. TVM's outputs are always the same.
+            if is_version_greater_than("1.5.1"):
+                tvm.testing.assert_allclose(tvm_result, pt_result, rtol=1e-4, atol=1e-4)

From 8062fdd154990438a4a94287b4c112a44b64704c Mon Sep 17 00:00:00 2001
From: ANSHUMAN TRIPATHY
Date: Thu, 29 Oct 2020 18:40:06 +0530
Subject: [PATCH 079/258] TFLite failures resulted from TF latest version
 upgrade resolved (#6774)

* TFLite failures resulted from TF latest version upgrade resolved

* [1] Review comments handled
---
 docker/install/ubuntu_install_tflite.sh      |   6 +-
 python/tvm/relay/frontend/tflite.py          |  15 ++-
 tests/python/frontend/tflite/test_forward.py | 115 ++++++++++---------
 3 files changed, 77 insertions(+), 59 deletions(-)

diff --git a/docker/install/ubuntu_install_tflite.sh b/docker/install/ubuntu_install_tflite.sh
index 123ff520d725..2dfbb0681a80 100755
--- a/docker/install/ubuntu_install_tflite.sh
+++ b/docker/install/ubuntu_install_tflite.sh
@@ -33,14 +33,14 @@ pip3 install flatbuffers
 # Build the TFLite static library, necessary for building with TFLite ON.
 # The library is built at:
 # tensorflow/tensorflow/lite/tools/make/gen/*/lib/libtensorflow-lite.a.
-git clone https://github.com/tensorflow/tensorflow --branch=r2.1
+git clone https://github.com/tensorflow/tensorflow --branch=r2.3
 ./tensorflow/tensorflow/lite/tools/make/download_dependencies.sh
 ./tensorflow/tensorflow/lite/tools/make/build_lib.sh
 
 # Setup tflite from schema
 mkdir tflite
 cd tflite
-wget -q https://raw.githubusercontent.com/tensorflow/tensorflow/r2.1/tensorflow/lite/schema/schema.fbs
+wget -q https://raw.githubusercontent.com/tensorflow/tensorflow/r2.3/tensorflow/lite/schema/schema.fbs
 flatc --python schema.fbs
 
 cat <<EOM >setup.py
@@ -48,7 +48,7 @@ import setuptools
 
 setuptools.setup(
     name="tflite",
-    version="2.1.0",
+    version="2.3.1",
     author="google",
     author_email="google@google.com",
     description="TFLite",
diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index f52c318c8e97..6da06ac4a20b 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -2770,7 +2770,7 @@ def convert_transpose_conv(self, op):
             raise ImportError("The tflite package must be installed")
 
         input_tensors = self.get_input_tensors(op)
-        assert len(input_tensors) == 3, "input tensors length should be 3"
+        assert len(input_tensors) >= 3, "input tensors length should be >= 3"
 
         # Input (data) Tensor. NHWC layout
        input_tensor = input_tensors[2]
@@ -2843,6 +2843,19 @@ def convert_transpose_conv(self, op):
             out_dtype=output_tensor_type_str,
         )
 
+        # if we have a bias input
+        if len(input_tensors) == 4:
+            bias_tensor = input_tensors[3]
+            bias_tensor_type = bias_tensor.tensor.Type()
+            # bias tensor type should be INT32 (quantization) or FLOAT32
+            assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32)
+            bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type)
+            bias_expr = self.exp_tab.new_const(
+                self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str
+            )
+            channel_axis = 3
+            out = _op.nn.bias_add(out, bias_expr, axis=channel_axis)
+
         return out
 
     def convert_quantize(self, op):
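The bias path above is plain channel-wise addition on the NHWC output; a small NumPy illustration (hypothetical shapes, not part of the frontend):

    import numpy as np

    # nn.bias_add(out, bias, axis=3) on an NHWC tensor is a broadcast add
    # over the channel axis.
    out = np.zeros((1, 8, 8, 16), dtype="float32")  # hypothetical N, H, W, C output
    bias = np.random.randn(16).astype("float32")    # one value per output channel
    with_bias = out + bias.reshape(1, 1, 1, -1)     # == bias_add along axis 3
    assert with_bias.shape == out.shape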
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index caa41806c8aa..3f860a3c6580 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -136,14 +136,20 @@ def vmobj_to_list(o):
         raise RuntimeError("Unknown object type: %s" % type(o))
 
 
-def _quantize_keras_model(keras_model, representative_data_gen):
+def _quantize_keras_model(
+    keras_model, representative_data_gen, is_float_input=False, is_float_output=False
+):
     """Utility function to quantize a Keras model using TFLite converter."""
     converter = interpreter_wrapper.TFLiteConverter.from_keras_model(keras_model)
     converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
     converter.representative_dataset = representative_data_gen
     converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
-    converter.inference_input_type = tf.uint8
-    converter.inference_output_type = tf.uint8
+    # NOTE: If a representative dataset is provided and the inference input type
+    # is not set, the converter will add quantize & dequantize ops by itself.
+    if not is_float_input:
+        converter.inference_input_type = tf.uint8
+    if not is_float_output:
+        converter.inference_output_type = tf.uint8
 
     return converter.convert()
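For context, the helper wraps TensorFlow's post-training quantization flow. A minimal standalone sketch (assuming a built Keras model and TF 2.x; the option choices here are illustrative, not the test's exact settings):

    import tensorflow as tf

    # Minimal post-training int8 quantization flow. `model` is any built Keras
    # model and `sample_inputs` a list of calibration arrays (both hypothetical).
    # Without inference_input_type/inference_output_type set, the converter
    # keeps float I/O and inserts quantize/dequantize ops at the boundaries.
    def to_int8_tflite(model, sample_inputs, float_io=True):
        converter = tf.lite.TFLiteConverter.from_keras_model(model)
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.representative_dataset = lambda: ([x] for x in sample_inputs)
        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        if not float_io:
            converter.inference_input_type = tf.uint8
            converter.inference_output_type = tf.uint8
        return converter.convert()  # serialized .tflite flatbuffer (bytes)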
@@ -973,6 +979,7 @@ def _test_convolution(
                 [out],
                 quantized=quantized,
                 input_range=input_range,
+                experimental_new_converter=True,
             )
         else:
             # Quantized the inputs and feed them to the convolution
@@ -1000,6 +1007,7 @@ def _test_convolution(
                 [out],
                 quantized=quantized,
                 input_range=input_range,
+                experimental_new_converter=True,
             )
     else:
         data_array = np.reshape(data_array, tensor_in_sizes).astype("float32")
@@ -1078,18 +1086,18 @@ def test_forward_convolution():
     )
 
     # TFLite2 quantized convolution testing
-    if package_version.parse(tf.VERSION) >= package_version.parse("2.1.0"):
-        _test_tflite2_quantized_convolution(
-            [1, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1], "SAME", "NHWC"
+    if package_version.parse(tf.VERSION) >= package_version.parse("2.3.0"):
+        _test_convolution(
+            [1, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1], "SAME", "NHWC", quantized=True
         )
-        _test_tflite2_quantized_convolution(
-            [1, 17, 17, 12], [3, 3, 12, 32], [1, 1], [2, 2], "VALID", "NHWC"
+        _test_convolution(
+            [1, 17, 17, 12], [3, 3, 12, 32], [1, 1], [2, 2], "VALID", "NHWC", quantized=True
        )
-        _test_tflite2_quantized_convolution(
-            [1, 17, 17, 19], [3, 3, 19, 19], [1, 1], [2, 2], "VALID", "NHWC"
+        _test_convolution(
+            [1, 17, 17, 19], [3, 3, 19, 19], [1, 1], [2, 2], "VALID", "NHWC", quantized=True
         )
-        _test_tflite2_quantized_convolution(
-            [1, 17, 17, 124], [1, 1, 124, 19], [1, 1], [1, 1], "SAME", "NHWC"
+        _test_convolution(
+            [1, 17, 17, 124], [1, 1, 124, 19], [1, 1], [1, 1], "SAME", "NHWC", quantized=True
         )
 
     # Disable as tests are flaky - https://github.com/apache/incubator-tvm/issues/6064
@@ -2280,7 +2288,7 @@ def representative_data_gen():
         for i in range(1):
             yield [data]
 
-    tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen)
+    tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen, True, True)
 
     tflite_output = run_tflite_graph(tflite_model_quant, data)
     tvm_output = run_tvm_graph(tflite_model_quant, data, input_name)
@@ -2307,7 +2315,7 @@ def representative_data_gen():
         for i in range(1):
             yield [data]
 
-    tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen)
+    tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen, True, True)
 
     tflite_output = run_tflite_graph(tflite_model_quant, data)
     tvm_output = run_tvm_graph(tflite_model_quant, data, input_name)
@@ -2548,14 +2556,17 @@ def test_forward_padv2():
             np.array([2], dtype=np.float32),
         ]
     )
-    _test_padv2(
-        [
-            np.arange(0, 256, dtype=np.uint8).reshape((1, 256)),
-            np.array([[1, 1], [2, 2]], dtype=np.int32),
-            np.array([2], dtype=np.uint8),
-        ],
-        quantized=True,
-    )
+    # NOTE: In versions > 2.1.0, there is a bug in the TensorFlow package for this
+    # scenario. Hence, it is disabled temporarily for TF versions > 2.1.0.
+ if package_version.parse(tf.VERSION) <= package_version.parse("2.1.0"): + _test_padv2( + [ + np.arange(0, 256, dtype=np.uint8).reshape((1, 256)), + np.array([[1, 1], [2, 2]], dtype=np.int32), + np.array([2], dtype=np.float32), + ], + quantized=True, + ) # Constant Values input can be scalar _test_padv2( @@ -2565,14 +2576,17 @@ def test_forward_padv2(): np.float32(2), ] ) - _test_padv2( - [ - np.arange(0, 256, dtype=np.uint8).reshape((1, 256)), - np.array([[1, 1], [2, 2]], dtype=np.int32), - np.uint8(10), - ], - quantized=True, - ) + # NOTE: In versions > 2.1.0, there is a bug in Tensorflow package for this scenario. + # Hence, it is disabled temporarily for TF versions > 2.1.0. + if package_version.parse(tf.VERSION) <= package_version.parse("2.1.0"): + _test_padv2( + [ + np.arange(0, 256, dtype=np.uint8).reshape((1, 256)), + np.array([[1, 1], [2, 2]], dtype=np.int32), + np.uint8(10), + ], + quantized=True, + ) ####################################################################### @@ -2870,37 +2884,28 @@ def test_forward_tanh(): def _test_relu(data, quantized=False): """ One iteration of ReLU """ - if quantized: - if package_version.parse(tf.VERSION) < package_version.parse("2.1.0"): - pytest.skip("Testcase requires tflite version >= 2.1.0") - data_in = tf.keras.layers.Input(shape=data.shape[1:]) - relu = tf.keras.layers.ReLU()(data_in) - keras_model = tf.keras.models.Model(inputs=data_in, outputs=relu) - input_name = data_in.name.split(":")[0] - - # To create quantized values with dynamic range of activations, needs representative dataset - def representative_data_gen(): - for i in range(1): - yield [data] - - tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen) - - tflite_output = run_tflite_graph(tflite_model_quant, data) - tvm_output = run_tvm_graph(tflite_model_quant, data, input_name) - tvm.testing.assert_allclose( - np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-5 - ) - else: - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) + with tf.Graph().as_default(): + in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") + + if quantized: + inq_data = tf.quantization.fake_quant_with_min_max_args( + in_data, min=-10, max=10, name="inq_0" + ) + input_range = {"inq_0": (-10, 10)} + out = nn_ops.relu(inq_data) + out = tf.quantization.fake_quant_with_min_max_args(out, min=0, max=6, name="out") + compare_tflite_with_tvm( + data, "inq_0:0", [inq_data], [out], quantized=True, input_range=input_range + ) + else: out = nn_ops.relu(in_data) - compare_tflite_with_tvm(data, "Placeholder:0", [in_data], [out]) + compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) def test_forward_relu(): """ ReLU """ _test_relu(np.arange(6.0, dtype=np.float32).reshape((1, 6))) - _test_relu(np.arange(6.0, dtype=np.float32).reshape((1, 6)), quantized=True) + _test_relu(np.random.uniform(0, 255, (3, 6)).astype(np.uint8), quantized=True) ####################################################################### From 3dd35f2c1a2f041185f563705100c096fafd58ea Mon Sep 17 00:00:00 2001 From: ZHANG Hao Date: Thu, 29 Oct 2020 21:52:17 +0800 Subject: [PATCH 080/258] [VTA] quant support for alu-only op (#6191) --- python/tvm/relay/quantize/_annotate.py | 1 + python/tvm/relay/quantize/_partition.py | 15 ++++++++++++- src/relay/quantize/realize.cc | 28 ++++++++++++++++++------- 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/quantize/_annotate.py 
b/python/tvm/relay/quantize/_annotate.py
index 329ba64aae00..b187387a56c2 100644
--- a/python/tvm/relay/quantize/_annotate.py
+++ b/python/tvm/relay/quantize/_annotate.py
@@ -284,6 +284,7 @@ def identity_rewrite(ref_call, new_args, ctx):
     return QAnnotateExpr(ret_expr, x_kind)
 
 
+register_annotate_function("reshape", identity_rewrite)
 register_annotate_function("clip", identity_rewrite)
 register_annotate_function("nn.relu", identity_rewrite)
 register_annotate_function("strided_slice", identity_rewrite)
diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py
index 6892e8612a94..563d28366874 100644
--- a/python/tvm/relay/quantize/_partition.py
+++ b/python/tvm/relay/quantize/_partition.py
@@ -82,7 +82,7 @@ def add_partition_generic(ref_call, new_args, ctx):
         #    ...
         lhs = new_args[0].realize()
         rhs = new_args[1].realize()
-        return _forward_op(ref_call, [lhs, rhs])
+        return QPartitionExpr(_forward_op(ref_call, [lhs, rhs]))
     if not lhs_cond and rhs_cond:
         # - introduced by residual connection in ResNet
         #    ...
@@ -130,6 +130,7 @@ def mul_partition_generic(ref_call, new_args, ctx):
 
     if lhs_cond:
         # introduced by bn: multiply(out, scale)
+        lhs = new_args[0].realize()
         return QPartitionExpr(_forward_op(ref_call, [lhs, rhs]))
 
     if not lhs_cond and not rhs_cond:
@@ -155,3 +156,15 @@ def add_partition_function(ref_call, new_args, ctx):
 def multiply_partition_function(ref_call, new_args, ctx):
     """Rewrite function for ewise multiply for partition"""
     return mul_partition_generic(ref_call, new_args, ctx)
+
+
+# add a cast after the global_avg_pool2d op to make it run on VTA
+@register_partition_function("nn.global_avg_pool2d")
+def global_avg_pool2d_partition_function(ref_call, new_args, ctx):
+    cond, expr = partition_expr_check(new_args[0])
+    if cond:
+        expr = new_args[0].realize()
+    else:
+        expr = QPartitionExpr(new_args[0]).realize()
+
+    return _forward_op(ref_call, [expr])
diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc
index c96a1b063e98..8db72a3f2b32 100644
--- a/src/relay/quantize/realize.cc
+++ b/src/relay/quantize/realize.cc
@@ -309,7 +309,8 @@ float ChooseDomScale(const std::vector<const QRealizeIntExprNode*>& nptrs) {
 
 /* \brief Unify the dom scale of arguments */
 Array<Expr> UnifyDTypeScale(const Array<Expr>& ref_args, const Array<Expr>& args,
-                            DataType* dtype_ptr, Expr* scale_ptr) {
+                            DataType* dtype_ptr, Expr* scale_ptr,
+                            DataType dtype = DataType::Void()) {
   static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize");
   const QConfig& cfg = QConfig::Current();
 
@@ -324,13 +325,15 @@ Array<Expr> UnifyDTypeScale(const Array<Expr>& ref_args, const Array<Expr>& args
 
   // unify the data type
   ICHECK_EQ(ref_args.size(), args.size());
 
-  DataType dtype;
-
-  if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) {
-    dtype = cfg->dtype_input;
-  } else {
-    dtype = cfg->dtype_activation;
+  if (dtype.is_void()) {
+    if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) {
+      dtype = cfg->dtype_input;
+    } else {
+      dtype = cfg->dtype_activation;
+    }
   }
 
+
   for (size_t i = 0; i < ret.size(); ++i) {
     auto ref_arg = ref_args[i].as<CallNode>();
     if (nptrs[i]->dtype != dtype) {
@@ -361,7 +364,16 @@ Expr AddRealize(const Call& ref_call, const Array<Expr>& new_args, const ObjectRef& ctx) {
   if (new_args[0].as<QRealizeIntExprNode>() && new_args[1].as<QRealizeIntExprNode>()) {
     DataType dtype;
     Expr dom_scale;
-    Array<Expr> ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale);
+    // execute the operation with the activation data type
+    const QConfig& cfg = QConfig::Current();
+    Array<Expr> ret_args =
+        UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale, cfg->dtype_activation);
+    for (size_t i = 0; i < ret_args.size(); ++i) {
+      // do not fuse float32 arg
+      if (new_args[i].as<QRealizeIntExprNode>()->dtype == DataType::Float(32)) {
+        ret_args.Set(i, StopFusion(ret_args[i]));
+      }
+    }
     Expr ret = ForwardOp(ref_call, ret_args);
     return QRealizeIntExpr(ret, dom_scale, dtype);
   }
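AddRealize above rescales both operands to one common dom_scale before the integer add and returns that scale with the result. A toy NumPy model of the idea (the scale choice here is an arbitrary stand-in for ChooseDomScale, not its actual heuristic):

    import numpy as np

    # Both operands are re-expressed against one common scale before the
    # integer add; the sum then carries that scale.
    def realize_add(lhs, lhs_scale, rhs, rhs_scale):
        dom_scale = min(lhs_scale, rhs_scale)  # stand-in for ChooseDomScale
        l = np.round(lhs * (lhs_scale / dom_scale)).astype("int32")
        r = np.round(rhs * (rhs_scale / dom_scale)).astype("int32")
        return l + r, dom_scale

    q, s = realize_add(np.array([10, 20]), 0.5, np.array([3, 4]), 0.25)
    print(q * s)  # ~ the real-valued sum: lhs * 0.5 + rhs * 0.25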
@@ -430,6 +442,8 @@ Expr IdentityRealize(const Call& ref_call, const Array<Expr>& new_args, const ObjectRef& ctx) {
 
 RELAY_REGISTER_OP("nn.relu").set_attr<FForwardRewrite>("FQRealizeRewrite", IdentityRealize);
 
+RELAY_REGISTER_OP("reshape").set_attr<FForwardRewrite>("FQRealizeRewrite", IdentityRealize);
+
 RELAY_REGISTER_OP("strided_slice").set_attr<FForwardRewrite>("FQRealizeRewrite", IdentityRealize);
 
 RELAY_REGISTER_OP("nn.batch_flatten")

From 9f17c31f08663ee7bcc115b12e228ce7c8dae6b0 Mon Sep 17 00:00:00 2001
From: Trevor Morris
Date: Thu, 29 Oct 2020 10:46:12 -0700
Subject: [PATCH 081/258] Only use thrust for cuda target (#6722)

---
 python/tvm/relay/op/strategy/cuda.py | 8 ++++++--
 python/tvm/topi/cuda/nms.py          | 7 ++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index d77361d906fb..187ea01c47b8 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -673,7 +673,9 @@ def argsort_strategy_cuda(attrs, inputs, out_type, target):
         wrap_topi_schedule(topi.cuda.schedule_argsort),
         name="argsort.cuda",
     )
-    if get_global_func("tvm.contrib.thrust.sort", allow_missing=True):
+    if target.kind.name == "cuda" and get_global_func(
+        "tvm.contrib.thrust.sort", allow_missing=True
+    ):
         strategy.add_implementation(
             wrap_compute_argsort(topi.cuda.argsort_thrust),
             wrap_topi_schedule(topi.cuda.schedule_argsort),
@@ -692,7 +694,9 @@ def topk_strategy_cuda(attrs, inputs, out_type, target):
         wrap_topi_schedule(topi.cuda.schedule_topk),
         name="topk.cuda",
     )
-    if get_global_func("tvm.contrib.thrust.sort", allow_missing=True):
+    if target.kind.name == "cuda" and get_global_func(
+        "tvm.contrib.thrust.sort", allow_missing=True
+    ):
         strategy.add_implementation(
             wrap_compute_topk(topi.cuda.topk_thrust),
             wrap_topi_schedule(topi.cuda.schedule_topk),
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 2041f4c232a2..ed6e8f086a0d 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -483,7 +483,12 @@ def non_max_suppression(
     score_axis = score_index
     score_shape = (batch_size, num_anchors)
     score_tensor = te.compute(score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE)
-    if tvm.get_global_func("tvm.contrib.thrust.sort_nms", allow_missing=True):
+    target = tvm.target.Target.current()
+    if (
+        target
+        and target.kind.name == "cuda"
+        and tvm.get_global_func("tvm.contrib.thrust.sort_nms", allow_missing=True)
+    ):
         sort_tensor = argsort_thrust(
             score_tensor, valid_count=None, axis=1, is_ascend=False, dtype=valid_count_dtype
         )

From eac0c9cd97c9b4d161cc4f355180b9b310503e5d Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 29 Oct 2020 13:46:31 -0400
Subject: [PATCH 082/258] [REFACTOR] Remainings of util => utils (#6778)

---
 apps/android_camera/models/prepare_model.py   |  2 +-
 apps/android_rpc/tests/android_rpc_test.py    |  4 +-
 apps/benchmark/arm_cpu_imagenet_bench.py      |  2 +-
 apps/benchmark/mobile_gpu_imagenet_bench.py   |  2 +-
 apps/ios_rpc/tests/ios_rpc_mobilenet.py       |  4 +-
 apps/ios_rpc/tests/ios_rpc_test.py            |  4 +-
 .../topi_recipe/conv/depthwise_conv2d_test.py |  2 +-
apps/topi_recipe/conv/test_conv2d_hwcn_map.py | 2 +- apps/topi_recipe/gemm/android_gemm_square.py | 4 +- docs/api/python/contrib.rst | 6 +-- .../introduction_to_module_serialization.rst | 4 +- golang/sample/deploy.py | 2 +- golang/src/{util.go => utils.go} | 2 +- jvm/README.md | 2 +- jvm/core/src/test/scripts/test_add_cpu.py | 2 +- jvm/core/src/test/scripts/test_add_gpu.py | 2 +- python/setup.py | 1 - python/tvm/auto_scheduler/auto_schedule.py | 2 +- python/tvm/autotvm/__init__.py | 2 +- .../autotvm/graph_tuner/base_graph_tuner.py | 2 +- python/tvm/autotvm/measure/measure_methods.py | 2 +- python/tvm/autotvm/task/space.py | 2 +- python/tvm/autotvm/task/task.py | 3 +- python/tvm/autotvm/tophub.py | 2 +- python/tvm/autotvm/tuner/callback.py | 2 +- python/tvm/autotvm/tuner/metric.py | 2 +- .../tvm/autotvm/tuner/sa_model_optimizer.py | 2 +- python/tvm/autotvm/tuner/tuner.py | 4 +- .../tvm/autotvm/tuner/xgboost_cost_model.py | 2 +- python/tvm/autotvm/{util.py => utils.py} | 0 .../tvm/contrib/{binutil.py => binutils.py} | 8 ++-- python/tvm/contrib/clang.py | 8 ++-- python/tvm/contrib/nvcc.py | 4 +- python/tvm/contrib/peak.py | 4 +- python/tvm/contrib/rocm.py | 6 +-- python/tvm/contrib/sdaccel.py | 4 +- python/tvm/contrib/spirv.py | 4 +- python/tvm/contrib/tar.py | 4 +- python/tvm/contrib/{util.py => utils.py} | 0 python/tvm/contrib/xcode.py | 6 +-- python/tvm/driver/tvmc/compiler.py | 4 +- python/tvm/error.py | 2 +- python/tvm/micro/build.py | 8 ++-- python/tvm/micro/compiler.py | 10 ++--- python/tvm/micro/micro_library.py | 4 +- python/tvm/relay/backend/vm.py | 2 +- python/tvm/relay/build_module.py | 4 +- python/tvm/relay/frontend/common.py | 2 +- python/tvm/relay/frontend/mxnet.py | 2 +- python/tvm/relay/frontend/onnx.py | 4 +- python/tvm/relay/frontend/pytorch.py | 2 +- python/tvm/relay/frontend/tensorflow.py | 2 +- .../tvm/relay/frontend/tensorflow_parser.py | 4 +- python/tvm/relay/op/_reduce.py | 2 +- python/tvm/relay/op/_tensor_grad.py | 7 ++-- python/tvm/relay/op/_transform.py | 2 +- python/tvm/relay/op/dyn/image/_image.py | 2 +- python/tvm/relay/op/image/_image.py | 2 +- python/tvm/relay/op/nn/_nn.py | 2 +- python/tvm/relay/op/nn/nn.py | 2 +- python/tvm/relay/op/nn/{util.py => utils.py} | 0 python/tvm/relay/op/strategy/arm_cpu.py | 2 +- python/tvm/relay/op/strategy/generic.py | 2 +- python/tvm/relay/op/vision/_rcnn.py | 2 +- python/tvm/relay/qnn/op/qnn.py | 2 +- python/tvm/relay/transform/transform.py | 2 +- python/tvm/rpc/client.py | 4 +- python/tvm/rpc/server.py | 8 ++-- python/tvm/runtime/module.py | 8 ++-- python/tvm/runtime/vm.py | 2 +- python/tvm/te/hybrid/module.py | 4 +- python/tvm/topi/__init__.py | 4 +- python/tvm/topi/arm_cpu/bitserial_conv2d.py | 4 +- python/tvm/topi/arm_cpu/bitserial_dense.py | 2 +- python/tvm/topi/arm_cpu/conv2d.py | 4 +- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 2 +- python/tvm/topi/arm_cpu/conv2d_gemm.py | 4 +- python/tvm/topi/arm_cpu/conv2d_int8.py | 2 +- .../tvm/topi/arm_cpu/conv2d_spatial_pack.py | 4 +- python/tvm/topi/arm_cpu/conv2d_transpose.py | 2 +- .../topi/arm_cpu/cortex_m7/conv2d/direct.py | 2 +- .../arm_cpu/cortex_m7/conv2d/direct_simd.py | 4 +- python/tvm/topi/arm_cpu/depthwise_conv2d.py | 6 +-- python/tvm/topi/arm_cpu/injective.py | 2 +- python/tvm/topi/arm_cpu/tensor_intrin.py | 4 +- python/tvm/topi/bifrost/conv2d.py | 2 +- python/tvm/topi/bifrost/dense.py | 2 +- python/tvm/topi/bifrost/depthwise_conv2d.py | 8 ++-- python/tvm/topi/bifrost/gemm.py | 20 ++++----- python/tvm/topi/cpp/__init__.py | 2 +- python/tvm/topi/cpp/{util.py => 
utils.py} | 2 +- python/tvm/topi/cuda/batch_matmul.py | 2 +- python/tvm/topi/cuda/conv1d.py | 2 +- python/tvm/topi/cuda/conv1d_transpose_ncw.py | 2 +- python/tvm/topi/cuda/conv2d.py | 4 +- python/tvm/topi/cuda/conv2d_alter_op.py | 2 +- python/tvm/topi/cuda/conv2d_direct.py | 2 +- .../tvm/topi/cuda/conv2d_hwnc_tensorcore.py | 4 +- python/tvm/topi/cuda/conv2d_int8.py | 4 +- python/tvm/topi/cuda/conv2d_nhwc.py | 2 +- .../tvm/topi/cuda/conv2d_nhwc_tensorcore.py | 4 +- python/tvm/topi/cuda/conv2d_nhwc_winograd.py | 2 +- python/tvm/topi/cuda/conv2d_transpose_nchw.py | 2 +- python/tvm/topi/cuda/conv2d_winograd.py | 2 +- python/tvm/topi/cuda/conv3d.py | 2 +- python/tvm/topi/cuda/conv3d_alter_op.py | 2 +- python/tvm/topi/cuda/conv3d_direct.py | 2 +- .../tvm/topi/cuda/conv3d_ndhwc_tensorcore.py | 4 +- .../tvm/topi/cuda/conv3d_transpose_ncdhw.py | 2 +- python/tvm/topi/cuda/conv3d_winograd.py | 2 +- python/tvm/topi/cuda/correlation.py | 2 +- python/tvm/topi/cuda/deformable_conv2d.py | 2 +- python/tvm/topi/cuda/dense.py | 2 +- python/tvm/topi/cuda/dense_tensorcore.py | 2 +- python/tvm/topi/cuda/depthwise_conv2d.py | 2 +- python/tvm/topi/cuda/group_conv2d_nchw.py | 4 +- python/tvm/topi/cuda/injective.py | 6 +-- python/tvm/topi/cuda/pooling.py | 2 +- python/tvm/topi/cuda/rcnn/proposal.py | 2 +- python/tvm/topi/cuda/sparse.py | 2 +- python/tvm/topi/generic/conv2d.py | 2 +- python/tvm/topi/image/dilation2d.py | 4 +- python/tvm/topi/image/resize.py | 2 +- python/tvm/topi/intel_graphics/conv2d.py | 12 +++--- .../topi/intel_graphics/conv2d_alter_op.py | 2 +- .../topi/intel_graphics/depthwise_conv2d.py | 2 +- python/tvm/topi/mali/conv2d.py | 2 +- python/tvm/topi/mali/dense.py | 2 +- python/tvm/topi/mali/depthwise_conv2d.py | 2 +- python/tvm/topi/nn/batch_matmul.py | 2 +- python/tvm/topi/nn/bitserial_conv2d.py | 4 +- python/tvm/topi/nn/bitserial_dense.py | 2 +- python/tvm/topi/nn/bitserial_util.py | 2 +- python/tvm/topi/nn/bnn.py | 2 +- python/tvm/topi/nn/conv1d.py | 4 +- python/tvm/topi/nn/conv1d_transpose.py | 4 +- python/tvm/topi/nn/conv2d.py | 4 +- python/tvm/topi/nn/conv2d_transpose.py | 4 +- python/tvm/topi/nn/conv3d.py | 4 +- python/tvm/topi/nn/conv3d_transpose.py | 4 +- python/tvm/topi/nn/correlation.py | 2 +- python/tvm/topi/nn/deformable_conv2d.py | 6 +-- python/tvm/topi/nn/depthwise_conv2d.py | 4 +- python/tvm/topi/nn/dilate.py | 4 +- python/tvm/topi/nn/elemwise.py | 2 +- python/tvm/topi/nn/pad.py | 2 +- python/tvm/topi/nn/sparse.py | 2 +- python/tvm/topi/nn/upsampling.py | 2 +- python/tvm/topi/nn/{util.py => utils.py} | 2 +- python/tvm/topi/nn/winograd_util.py | 2 +- python/tvm/topi/rocm/batch_matmul.py | 2 +- python/tvm/topi/rocm/conv2d.py | 4 +- python/tvm/topi/rocm/dense.py | 2 +- python/tvm/topi/sort.py | 2 +- python/tvm/topi/sparse/csrmm.py | 2 +- python/tvm/topi/sparse/dense.py | 2 +- .../topi/testing/bilinear_resize_python.py | 2 +- python/tvm/topi/testing/conv1d_ncw_python.py | 2 +- .../testing/conv1d_transpose_ncw_python.py | 2 +- python/tvm/topi/testing/conv2d_hwcn_python.py | 2 +- python/tvm/topi/testing/conv2d_nchw_python.py | 2 +- python/tvm/topi/testing/conv2d_nhwc_python.py | 2 +- .../topi/testing/conv2d_transpose_python.py | 2 +- .../tvm/topi/testing/conv3d_ncdhw_python.py | 2 +- .../tvm/topi/testing/conv3d_ndhwc_python.py | 2 +- .../testing/conv3d_transpose_ncdhw_python.py | 2 +- .../testing/deformable_conv2d_nchw_python.py | 2 +- python/tvm/topi/testing/upsampling_python.py | 2 +- python/tvm/topi/transform.py | 2 +- python/tvm/topi/{util.py => utils.py} | 2 +- 
python/tvm/topi/vision/rcnn/proposal.py | 2 +- python/tvm/topi/vision/rcnn/roi_align.py | 4 +- python/tvm/topi/vision/rcnn/roi_pool.py | 2 +- python/tvm/topi/x86/batch_matmul.py | 2 +- python/tvm/topi/x86/bitserial_conv2d.py | 4 +- python/tvm/topi/x86/bitserial_dense.py | 2 +- python/tvm/topi/x86/conv2d.py | 4 +- python/tvm/topi/x86/conv2d_alter_op.py | 4 +- python/tvm/topi/x86/conv2d_avx_1x1.py | 6 +-- python/tvm/topi/x86/conv2d_avx_common.py | 4 +- python/tvm/topi/x86/conv2d_int8.py | 4 +- python/tvm/topi/x86/conv2d_transpose.py | 2 +- python/tvm/topi/x86/conv3d.py | 8 ++-- python/tvm/topi/x86/conv3d_transpose.py | 2 +- python/tvm/topi/x86/dense.py | 4 +- python/tvm/topi/x86/depthwise_conv2d.py | 8 ++-- python/tvm/topi/x86/injective.py | 2 +- python/tvm/topi/x86/reduction.py | 2 +- python/tvm/topi/x86/roi_align.py | 2 +- python/tvm/topi/x86/sparse.py | 4 +- python/tvm/topi/x86/{util.py => utils.py} | 0 src/topi/schedule.cc | 4 +- tests/micro/qemu/test_zephyr.py | 4 +- tests/micro/test_runtime_micro_on_arm.py | 2 +- .../test_minimal_target_codegen_llvm.py | 4 +- .../test_arm_compute_lib/infrastructure.py | 4 +- .../{test_binutil.py => test_binutils.py} | 12 +++--- tests/python/contrib/test_coreml_runtime.py | 6 +-- tests/python/contrib/test_edgetpu_runtime.py | 2 +- .../contrib/test_ethosn/infrastructure.py | 6 +-- tests/python/contrib/test_nnpack.py | 2 +- tests/python/contrib/test_tflite_runtime.py | 6 +-- tests/python/contrib/test_util.py | 36 ++++++++-------- tests/python/frontend/caffe/test_forward.py | 2 +- .../frontend/tensorflow/test_forward.py | 4 +- tests/python/frontend/tflite/test_forward.py | 2 +- .../integration/test_winograd_nnpack.py | 2 +- tests/python/relay/test_any.py | 2 +- tests/python/relay/test_external_codegen.py | 4 +- tests/python/relay/test_json_runtime.py | 2 +- tests/python/relay/test_op_grad_level2.py | 6 +-- tests/python/relay/test_op_level2.py | 4 +- tests/python/relay/test_op_qnn_conv2d.py | 2 +- tests/python/relay/test_param_dict.py | 4 +- .../python/relay/test_pass_annotate_target.py | 4 +- tests/python/relay/test_pass_auto_quantize.py | 2 +- .../relay/test_pass_merge_compiler_regions.py | 2 +- .../python/relay/test_pass_partition_graph.py | 4 +- tests/python/relay/test_vm_serialization.py | 4 +- .../{util => utils}/assert_diagnostic.py | 0 tests/python/topi/python/test_topi_basic.py | 6 +-- .../topi/python/test_topi_batch_matmul.py | 2 +- .../topi/python/test_topi_bitserial_conv2d.py | 2 +- .../python/test_topi_bitserial_conv2d_rasp.py | 2 +- .../topi/python/test_topi_bitserial_dense.py | 2 +- tests/python/topi/python/test_topi_bnn.py | 2 +- tests/python/topi/python/test_topi_clip.py | 2 +- tests/python/topi/python/test_topi_conv1d.py | 2 +- .../python/test_topi_conv1d_transpose_ncw.py | 2 +- .../topi/python/test_topi_conv2d_NCHWc.py | 4 +- .../topi/python/test_topi_conv2d_hwcn.py | 2 +- .../test_topi_conv2d_hwnc_tensorcore.py | 4 +- .../topi/python/test_topi_conv2d_int8.py | 4 +- .../topi/python/test_topi_conv2d_nchw.py | 4 +- .../topi/python/test_topi_conv2d_nhwc.py | 2 +- .../python/test_topi_conv2d_nhwc_pack_int8.py | 2 +- .../test_topi_conv2d_nhwc_tensorcore.py | 4 +- .../python/test_topi_conv2d_nhwc_winograd.py | 4 +- .../python/test_topi_conv2d_transpose_nchw.py | 2 +- .../topi/python/test_topi_conv2d_winograd.py | 4 +- .../topi/python/test_topi_conv3d_ncdhw.py | 4 +- .../topi/python/test_topi_conv3d_ndhwc.py | 2 +- .../test_topi_conv3d_ndhwc_tensorcore.py | 4 +- .../test_topi_conv3d_transpose_ncdhw.py | 2 +- 
.../topi/python/test_topi_conv3d_winograd.py | 4 +- .../topi/python/test_topi_correlation.py | 2 +- .../python/test_topi_deformable_conv2d.py | 2 +- tests/python/topi/python/test_topi_dense.py | 2 +- .../topi/python/test_topi_dense_tensorcore.py | 2 +- .../topi/python/test_topi_depthwise_conv2d.py | 4 +- .../test_topi_depthwise_conv2d_back_input.py | 4 +- .../test_topi_depthwise_conv2d_back_weight.py | 4 +- tests/python/topi/python/test_topi_dilate.py | 2 +- .../topi/python/test_topi_group_conv2d.py | 2 +- .../test_topi_group_conv2d_NCHWc_int8.py | 2 +- tests/python/topi/python/test_topi_lrn.py | 2 +- tests/python/topi/python/test_topi_math.py | 6 +-- tests/python/topi/python/test_topi_matmul.py | 2 +- tests/python/topi/python/test_topi_pooling.py | 2 +- tests/python/topi/python/test_topi_relu.py | 2 +- tests/python/topi/python/test_topi_reorg.py | 2 +- tests/python/topi/python/test_topi_softmax.py | 2 +- tests/python/topi/python/test_topi_sparse.py | 2 +- .../topi/python/test_topi_upsampling.py | 2 +- tests/python/topi/python/test_topi_util.py | 2 +- tests/python/topi/python/test_topi_vision.py | 2 +- .../unittest/test_auto_scheduler_common.py | 2 +- tests/python/unittest/test_autotvm_record.py | 4 +- tests/python/unittest/test_crt.py | 2 +- .../python/unittest/test_format_si_prefix.py | 8 ++-- tests/python/unittest/test_micro_artifact.py | 8 ++-- tests/python/unittest/test_runtime_graph.py | 4 +- .../unittest/test_runtime_graph_debug.py | 4 +- .../unittest/test_runtime_heterogeneous.py | 4 +- tests/python/unittest/test_runtime_measure.py | 2 +- .../test_runtime_module_based_interface.py | 32 +++++++------- .../unittest/test_runtime_module_export.py | 20 ++++----- .../unittest/test_runtime_module_load.py | 12 +++--- tests/python/unittest/test_runtime_rpc.py | 12 +++--- .../unittest/test_target_codegen_blob.py | 8 ++-- .../unittest/test_target_codegen_c_host.py | 8 ++-- .../test_target_codegen_cross_llvm.py | 6 +-- .../unittest/test_target_codegen_device.py | 2 +- .../unittest/test_target_codegen_llvm.py | 4 +- tests/python/unittest/test_te_autodiff.py | 2 +- .../python/unittest/test_te_hybrid_script.py | 4 +- .../unittest/test_te_tensor_overload.py | 2 +- tests/python/unittest/test_tir_data_layout.py | 2 +- tests/python/unittest/test_tir_intrin.py | 2 +- tests/scripts/task_golang.sh | 2 + tutorials/autotvm/tune_relay_arm.py | 4 +- tutorials/autotvm/tune_relay_cuda.py | 4 +- tutorials/autotvm/tune_relay_mobile_gpu.py | 4 +- tutorials/frontend/deploy_model_on_android.py | 4 +- tutorials/frontend/deploy_model_on_rasp.py | 4 +- tutorials/frontend/deploy_prequantized.py | 2 +- tutorials/frontend/deploy_quantized.py | 2 +- tutorials/frontend/from_onnx.py | 2 +- .../get_started/cross_compilation_and_rpc.py | 4 +- tutorials/get_started/relay_quick_start.py | 4 +- .../get_started/tensor_expr_get_started.py | 4 +- tutorials/language/tensorize.py | 10 ++--- tutorials/micro/micro_tflite.py | 2 +- version.py | 2 +- vta/python/vta/__init__.py | 2 - vta/python/vta/testing/__init__.py | 2 +- vta/python/vta/testing/{util.py => utils.py} | 0 vta/python/vta/top/__init__.py | 2 +- vta/python/vta/top/bitpack.py | 4 +- vta/python/vta/top/op.py | 6 +-- vta/python/vta/top/{util.py => utils.py} | 0 vta/python/vta/top/vta_conv2d.py | 12 +++--- vta/python/vta/top/vta_conv2d_transpose.py | 6 +-- vta/python/vta/top/vta_dense.py | 6 +-- vta/python/vta/top/vta_group_conv2d.py | 10 ++--- vta/python/vta/transform.py | 42 +++++++++---------- vta/scripts/tune_resnet.py | 4 +- .../python/integration/test_benchmark_gemm.py | 
4 +- .../integration/test_benchmark_topi_conv2d.py | 6 +-- .../test_benchmark_topi_conv2d_transpose.py | 6 +-- .../integration/test_benchmark_topi_dense.py | 6 +-- .../test_benchmark_topi_group_conv2d.py | 6 +-- vta/tests/python/unittest/test_vta_insn.py | 14 +++---- vta/tutorials/autotvm/tune_relay_vta.py | 6 +-- .../frontend/deploy_classification.py | 4 +- .../frontend/legacy/deploy_detection.py | 2 +- vta/tutorials/matrix_multiply.py | 4 +- vta/tutorials/optimize/convolution_opt.py | 4 +- vta/tutorials/optimize/matrix_multiply_opt.py | 4 +- vta/tutorials/vta_get_started.py | 4 +- web/tests/python/webgpu_rpc_test.py | 4 +- web/tests/python/websock_rpc_test.py | 4 +- 332 files changed, 618 insertions(+), 619 deletions(-) rename golang/src/{util.go => utils.go} (98%) rename python/tvm/autotvm/{util.py => utils.py} (100%) rename python/tvm/contrib/{binutil.py => binutils.py} (98%) rename python/tvm/contrib/{util.py => utils.py} (100%) rename python/tvm/relay/op/nn/{util.py => utils.py} (100%) rename python/tvm/topi/cpp/{util.py => utils.py} (93%) rename python/tvm/topi/nn/{util.py => utils.py} (99%) rename python/tvm/topi/{util.py => utils.py} (99%) rename python/tvm/topi/x86/{util.py => utils.py} (100%) rename tests/python/contrib/{test_binutil.py => test_binutils.py} (96%) rename tests/python/relay/{util => utils}/assert_diagnostic.py (100%) rename vta/python/vta/testing/{util.py => utils.py} (100%) rename vta/python/vta/top/{util.py => utils.py} (100%) diff --git a/apps/android_camera/models/prepare_model.py b/apps/android_camera/models/prepare_model.py index 19be368c97e9..ab20e028c2ad 100644 --- a/apps/android_camera/models/prepare_model.py +++ b/apps/android_camera/models/prepare_model.py @@ -25,7 +25,7 @@ import tvm import tvm.relay as relay -from tvm.contrib import util, ndk, graph_runtime as runtime +from tvm.contrib import utils, ndk, graph_runtime as runtime from tvm.contrib.download import download_testdata, download target = "llvm -mtriple=arm64-linux-android" diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py index 2827c140ea92..9586bffeca0b 100644 --- a/apps/android_rpc/tests/android_rpc_test.py +++ b/apps/android_rpc/tests/android_rpc_test.py @@ -25,7 +25,7 @@ from tvm import te import os from tvm import rpc -from tvm.contrib import util, ndk +from tvm.contrib import utils, ndk import numpy as np # Set to be address of tvm proxy. 
@@ -50,7 +50,7 @@ def test_rpc_module(): A = te.placeholder((n,), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") a_np = np.random.uniform(size=1024).astype(A.dtype) - temp = util.tempdir() + temp = utils.tempdir() # Establish remote connection with target hardware tracker = rpc.connect_tracker(tracker_host, tracker_port) diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py index fb58819d3c5c..e7233370e6d6 100644 --- a/apps/benchmark/arm_cpu_imagenet_bench.py +++ b/apps/benchmark/arm_cpu_imagenet_bench.py @@ -23,7 +23,7 @@ import tvm from tvm import te -from tvm.contrib.util import tempdir +from tvm.contrib.utils import tempdir import tvm.contrib.graph_runtime as runtime from tvm import relay diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py index b57f6028ab73..cf78c66141d0 100644 --- a/apps/benchmark/mobile_gpu_imagenet_bench.py +++ b/apps/benchmark/mobile_gpu_imagenet_bench.py @@ -23,7 +23,7 @@ import tvm from tvm import te -from tvm.contrib.util import tempdir +from tvm.contrib.utils import tempdir import tvm.contrib.graph_runtime as runtime from tvm import relay diff --git a/apps/ios_rpc/tests/ios_rpc_mobilenet.py b/apps/ios_rpc/tests/ios_rpc_mobilenet.py index 132377ac4412..90ac6bfb9218 100644 --- a/apps/ios_rpc/tests/ios_rpc_mobilenet.py +++ b/apps/ios_rpc/tests/ios_rpc_mobilenet.py @@ -22,7 +22,7 @@ from tvm.relay import transform from tvm.relay.op.annotation import compiler_begin, compiler_end from tvm.relay.quantize.quantize import prerequisite_optimize -from tvm.contrib import util, xcode, graph_runtime, coreml_runtime +from tvm.contrib import utils, xcode, graph_runtime, coreml_runtime from tvm.contrib.target import coreml as _coreml import os @@ -98,7 +98,7 @@ def get_model(model_name, data_shape): def test_mobilenet(): - temp = util.tempdir() + temp = utils.tempdir() image, synset = prepare_input() model, params = get_model("mobilenetv2_1.0", image.shape) diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py index 620fe493771f..a967c2f75e61 100644 --- a/apps/ios_rpc/tests/ios_rpc_test.py +++ b/apps/ios_rpc/tests/ios_rpc_test.py @@ -26,7 +26,7 @@ import re import sys from tvm import rpc -from tvm.contrib import util, xcode +from tvm.contrib import utils, xcode import numpy as np # Set to be address of tvm proxy. 
@@ -59,7 +59,7 @@ def test_rpc_module(): n = tvm.runtime.convert(1024) A = te.placeholder((n,), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") - temp = util.tempdir() + temp = utils.tempdir() s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, te.thread_axis("threadIdx.x")) diff --git a/apps/topi_recipe/conv/depthwise_conv2d_test.py b/apps/topi_recipe/conv/depthwise_conv2d_test.py index 036f1a4240f2..94687edde5f9 100644 --- a/apps/topi_recipe/conv/depthwise_conv2d_test.py +++ b/apps/topi_recipe/conv/depthwise_conv2d_test.py @@ -22,7 +22,7 @@ from tvm.contrib import nvcc from tvm import topi -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from tvm.topi.cuda.depthwise_conv2d import ( schedule_depthwise_conv2d_nchw, schedule_depthwise_conv2d_nhwc, diff --git a/apps/topi_recipe/conv/test_conv2d_hwcn_map.py b/apps/topi_recipe/conv/test_conv2d_hwcn_map.py index 1d2032d5c405..d67bfdc8952e 100644 --- a/apps/topi_recipe/conv/test_conv2d_hwcn_map.py +++ b/apps/topi_recipe/conv/test_conv2d_hwcn_map.py @@ -22,7 +22,7 @@ from tvm import te from tvm.contrib import nvcc from tvm import topi -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple TASK = "conv2d_hwcn_map" USE_MANUAL_CODE = False diff --git a/apps/topi_recipe/gemm/android_gemm_square.py b/apps/topi_recipe/gemm/android_gemm_square.py index 522818842cfa..0e64dcd3844d 100644 --- a/apps/topi_recipe/gemm/android_gemm_square.py +++ b/apps/topi_recipe/gemm/android_gemm_square.py @@ -19,7 +19,7 @@ from tvm import te import os from tvm import rpc -from tvm.contrib import util, ndk +from tvm.contrib import utils, ndk import numpy as np # Set to be address of tvm proxy. @@ -121,7 +121,7 @@ def test_gemm_gpu(N, times, bn, num_block, num_thread): print(tvm.lower(s, [A, B, C], simple_mode=True)) f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu") - temp = util.tempdir() + temp = utils.tempdir() path_dso = temp.relpath("gemm_gpu.so") f.export_library(path_dso, ndk.create_shared) diff --git a/docs/api/python/contrib.rst b/docs/api/python/contrib.rst index 8ac4e1ff7d3a..0eb3024c2d08 100644 --- a/docs/api/python/contrib.rst +++ b/docs/api/python/contrib.rst @@ -122,9 +122,9 @@ tvm.contrib.tar :members: -tvm.contrib.util -~~~~~~~~~~~~~~~~ -.. automodule:: tvm.contrib.util +tvm.contrib.utils +~~~~~~~~~~~~~~~~~ +.. automodule:: tvm.contrib.utils :members: diff --git a/docs/dev/introduction_to_module_serialization.rst b/docs/dev/introduction_to_module_serialization.rst index 5451b84c9b8c..6b2f2addaf9a 100644 --- a/docs/dev/introduction_to_module_serialization.rst +++ b/docs/dev/introduction_to_module_serialization.rst @@ -32,7 +32,7 @@ Let us build one ResNet-18 workload for GPU as an example first. from tvm import relay from tvm.relay import testing - from tvm.contrib import util + from tvm.contrib import utils import tvm # Resnet18 workload @@ -43,7 +43,7 @@ Let us build one ResNet-18 workload for GPU as an example first. 
_, resnet18_lib, _ = relay.build_module.build(resnet18_mod, "cuda", params=resnet18_params) # create one tempory directory - temp = util.tempdir() + temp = utils.tempdir() # path lib file_name = "deploy.so" diff --git a/golang/sample/deploy.py b/golang/sample/deploy.py index a0553cfe0211..98820195511c 100644 --- a/golang/sample/deploy.py +++ b/golang/sample/deploy.py @@ -51,7 +51,7 @@ # Save Compiled Module # -------------------- from tvm.contrib import cc -from tvm.contrib import util +from tvm.contrib import utils fadd.save("deploy.o") cc.create_shared("deploy.so", ["deploy.o"]) diff --git a/golang/src/util.go b/golang/src/utils.go similarity index 98% rename from golang/src/util.go rename to golang/src/utils.go index d3846d1db452..2da4138a1e66 100644 --- a/golang/src/util.go +++ b/golang/src/utils.go @@ -19,7 +19,7 @@ /*! * \brief gotvm package source for common utilities - * \file util.go + * \file utils.go */ package gotvm diff --git a/jvm/README.md b/jvm/README.md index 348e941c16e4..320e769adb74 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -97,7 +97,7 @@ There's nothing special for this part. The following Python snippet generate add import os import tvm from tvm import te -from tvm.contrib import cc, util +from tvm.contrib import cc, utils def test_add(target_dir): n = te.var("n") diff --git a/jvm/core/src/test/scripts/test_add_cpu.py b/jvm/core/src/test/scripts/test_add_cpu.py index 40edd082466a..4725dcb8aa67 100644 --- a/jvm/core/src/test/scripts/test_add_cpu.py +++ b/jvm/core/src/test/scripts/test_add_cpu.py @@ -18,7 +18,7 @@ import tvm from tvm import te -from tvm.contrib import cc, util +from tvm.contrib import cc, utils def test_add(target_dir): diff --git a/jvm/core/src/test/scripts/test_add_gpu.py b/jvm/core/src/test/scripts/test_add_gpu.py index 7983930252bc..040a447c3c27 100644 --- a/jvm/core/src/test/scripts/test_add_gpu.py +++ b/jvm/core/src/test/scripts/test_add_gpu.py @@ -18,7 +18,7 @@ import tvm from tvm import te -from tvm.contrib import cc, util +from tvm.contrib import cc, utils def test_add(target_dir): diff --git a/python/setup.py b/python/setup.py index 24022e4ec7b5..5333da0da239 100644 --- a/python/setup.py +++ b/python/setup.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=invalid-name, exec-used """Setup TVM package.""" -from __future__ import absolute_import import os import shutil import sys diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py index a53c29d174d7..5bc13fec62a9 100644 --- a/python/tvm/auto_scheduler/auto_schedule.py +++ b/python/tvm/auto_scheduler/auto_schedule.py @@ -36,7 +36,7 @@ class HardwareParams(Object): """The parameters of target hardware used to guide the search policy TODO(jcf94): This is considered to be merged with the new Target specification: - https://discuss.tvm.ai/t/rfc-tvm-target-specification/6844 + https://discuss.tvm.apache.org/t/rfc-tvm-target-specification/6844 Parameters ---------- diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py index 7eb1c8b98bc2..a3c59252b01a 100644 --- a/python/tvm/autotvm/__init__.py +++ b/python/tvm/autotvm/__init__.py @@ -33,7 +33,7 @@ from . import record from . import task from . import tuner -from . import util +from . import utils from . import env from . 
import tophub diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index 40945edad57f..741b05f4c453 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -509,7 +509,7 @@ def _callback(_, inputs, results): # Rule out invalid layout transformations out = topi.layout_transform(data, in_layout, out_layout) out_flops = 1 - for i in topi.util.get_const_tuple(out.shape): + for i in topi.utils.get_const_tuple(out.shape): out_flops *= i if flops != out_flops: diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 7032db67a732..913d62b94427 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -41,7 +41,7 @@ from tvm.driver import build from tvm.contrib import nvcc, ndk, tar -from ..util import get_const_tuple +from ..utils import get_const_tuple from ..env import AutotvmGlobalScope from ..task.space import InstantiationError diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index fb8cf57ed7c7..cf9cd809aa8d 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -34,7 +34,7 @@ from tvm.te import schedule, thread_axis from tvm.tir import expr -from tvm.autotvm.util import get_const_int +from tvm.autotvm.utils import get_const_int Axis = namedtuple("Axis", ["space", "index"]) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 8822ba971e4c..c8b50ad33741 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -29,7 +29,8 @@ from tvm.te import placeholder, tensor from tvm.tir import expr -from ..util import get_const_int, get_const_tuple + +from ..utils import get_const_int, get_const_tuple from .dispatcher import ApplyConfig, DispatchContext from .space import ConfigSpace diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index 2076ee7d363c..e3170ba98f8a 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -30,7 +30,7 @@ from ..target import Target from ..contrib.download import download from .record import load_from_file -from .util import EmptyContext +from .utils import EmptyContext # environment variable to read TopHub location AUTOTVM_TOPHUB_LOC_VAR = "TOPHUB_LOCATION" diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index bb9dafaac112..dc75de206d05 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -23,7 +23,7 @@ import numpy as np from .. 
import record -from ..util import format_si_prefix +from ..utils import format_si_prefix logger = logging.getLogger("autotvm") diff --git a/python/tvm/autotvm/tuner/metric.py b/python/tvm/autotvm/tuner/metric.py index 1ed04ab22e3f..f6932f80d3e3 100644 --- a/python/tvm/autotvm/tuner/metric.py +++ b/python/tvm/autotvm/tuner/metric.py @@ -19,7 +19,7 @@ import numpy as np -from ..util import get_rank +from ..utils import get_rank def max_curve(trial_scores): diff --git a/python/tvm/autotvm/tuner/sa_model_optimizer.py b/python/tvm/autotvm/tuner/sa_model_optimizer.py index 5535246791b6..401eda8c276f 100644 --- a/python/tvm/autotvm/tuner/sa_model_optimizer.py +++ b/python/tvm/autotvm/tuner/sa_model_optimizer.py @@ -25,7 +25,7 @@ import numpy as np -from ..util import sample_ints +from ..utils import sample_ints from .model_based_tuner import ModelOptimizer, knob2point, point2knob logger = logging.getLogger("autotvm") diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index b769d345bc9b..ba54291ada67 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -22,7 +22,7 @@ import numpy as np from ..measure import MeasureInput, create_measure_batch -from ..util import format_si_prefix +from ..utils import format_si_prefix from ..env import GLOBAL_SCOPE @@ -107,7 +107,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr with no return value. These callback functions will be called on every measurement pair. See autotvm/tuner/callback.py for some examples. si_prefix: str - One of tvm.autotvm.util.SI_PREFIXES. The SI prefix to use when reporting FLOPS. + One of tvm.autotvm.utils.SI_PREFIXES. The SI prefix to use when reporting FLOPS. """ measure_batch = create_measure_batch(self.task, measure_option) n_parallel = getattr(measure_batch, "n_parallel", 1) diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index f66764c42520..14bc683c10b1 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -29,7 +29,7 @@ xgb = None from .. import feature -from ..util import get_rank +from ..utils import get_rank from .metric import max_curve, recall_curve, cover_curve from .model_based_tuner import CostModel, FeatureCache diff --git a/python/tvm/autotvm/util.py b/python/tvm/autotvm/utils.py similarity index 100% rename from python/tvm/autotvm/util.py rename to python/tvm/autotvm/utils.py diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutils.py similarity index 98% rename from python/tvm/contrib/binutil.py rename to python/tvm/contrib/binutils.py index b0f36c8277ed..646362a5587f 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutils.py @@ -19,7 +19,7 @@ import os import subprocess import tvm._ffi -from . import util +from . import utils # TODO does this file still belong in `contrib`. is it too µTVM-specific? 
@@ -217,7 +217,7 @@ def tvm_callback_relocate_binary( stack_pointer_init=stack_pointer_init, ) - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() rel_obj_path = tmp_dir.relpath("relocated.obj") rel_ld_script_path = tmp_dir.relpath("relocate.lds") with open(rel_ld_script_path, "w") as f: @@ -265,7 +265,7 @@ def tvm_callback_read_binary_section(binary, section, toolchain_prefix): section_bin : bytearray contents of the read section """ - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() tmp_bin = tmp_dir.relpath("temp.bin") tmp_section = tmp_dir.relpath("tmp_section.bin") with open(tmp_bin, "wb") as out_file: @@ -306,7 +306,7 @@ def tvm_callback_get_symbol_map(binary, toolchain_prefix): map of defined symbols to addresses, encoded as a series of alternating newline-separated keys and values """ - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() tmp_obj = tmp_dir.relpath("tmp_obj.bin") with open(tmp_obj, "wb") as out_file: out_file.write(bytes(binary)) diff --git a/python/tvm/contrib/clang.py b/python/tvm/contrib/clang.py index edc12004dc4d..989444730412 100644 --- a/python/tvm/contrib/clang.py +++ b/python/tvm/contrib/clang.py @@ -20,7 +20,7 @@ from tvm._ffi.base import py_str import tvm.target -from . import util +from . import utils def find_clang(required=True): @@ -49,7 +49,7 @@ def find_clang(required=True): cc_list += ["clang-%d" % major] cc_list += ["clang"] cc_list += ["clang.exe"] - valid_list = [util.which(x) for x in cc_list] + valid_list = [utils.which(x) for x in cc_list] valid_list = [x for x in valid_list if x] if not valid_list and required: raise RuntimeError("cannot find clang, candidates are: " + str(cc_list)) @@ -83,12 +83,12 @@ def create_llvm(inputs, output=None, options=None, cc=None): cc = cc if cc else find_clang()[0] cmd = [cc] cmd += ["-S", "-emit-llvm"] - temp = util.tempdir() + temp = utils.tempdir() output = output if output else temp.relpath("output.ll") inputs = [inputs] if isinstance(inputs, str) else inputs input_files = [] for i, code in enumerate(inputs): - if util.is_source_path(code): + if utils.is_source_path(code): input_files.append(code) else: temp_path = temp.relpath("input%d.cc" % i) diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index f958c1f8a0cf..53a507f2d79a 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -25,7 +25,7 @@ import tvm._ffi from tvm.runtime import ndarray as nd -from . import util +from . import utils from .._ffi.base import py_str @@ -54,7 +54,7 @@ def compile_cuda(code, target="ptx", arch=None, options=None, path_target=None): cubin : bytearray The bytearray of the cubin """ - temp = util.tempdir() + temp = utils.tempdir() if target not in ["cubin", "ptx", "fatbin"]: raise ValueError("target must be in cubin, ptx, fatbin") temp_code = temp.relpath("my_kernel.cu") diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py index d9dd5360da85..62ee9fea400b 100644 --- a/python/tvm/contrib/peak.py +++ b/python/tvm/contrib/peak.py @@ -20,13 +20,13 @@ import logging import tvm from tvm import te -from . import util +from . import utils from .. 
import rpc def _convert_to_remote(func, remote): """ convert module function to remote rpc function""" - temp = util.tempdir() + temp = utils.tempdir() path_dso = temp.relpath("tmp_func.tar") func.export_library(path_dso) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index 7b222f3bb20e..e69b2558c0b5 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -23,7 +23,7 @@ import tvm.runtime import tvm.target -from . import util +from . import utils def find_lld(required=True): @@ -51,7 +51,7 @@ def find_lld(required=True): lld_list += ["ld.lld-%d.0" % major] lld_list += ["ld.lld-%d" % major] lld_list += ["ld.lld"] - valid_list = [util.which(x) for x in lld_list] + valid_list = [utils.which(x) for x in lld_list] valid_list = [x for x in valid_list if x] if not valid_list and required: raise RuntimeError("cannot find ld.lld, candidates are: " + str(lld_list)) @@ -97,7 +97,7 @@ def callback_rocm_link(obj_bin): cobj_bin : bytearray The HSA Code Object """ - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() tmp_obj = tmp_dir.relpath("rocm_kernel.o") tmp_cobj = tmp_dir.relpath("rocm_kernel.co") with open(tmp_obj, "wb") as out_file: diff --git a/python/tvm/contrib/sdaccel.py b/python/tvm/contrib/sdaccel.py index b88fa4c840a2..930752c2bc6b 100644 --- a/python/tvm/contrib/sdaccel.py +++ b/python/tvm/contrib/sdaccel.py @@ -19,7 +19,7 @@ import os import tvm._ffi -from . import util +from . import utils @tvm._ffi.register_func("tvm_callback_sdaccel_compile") @@ -40,7 +40,7 @@ def compile_vhls(kernel_info, device_name): xclbin : bytearray The bytearray of the xclbin """ - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() sdk = os.environ.get("XILINX_SDX", None) xocc = os.path.join(sdk, "bin/xocc") if sdk else "xocc" diff --git a/python/tvm/contrib/spirv.py b/python/tvm/contrib/spirv.py index a5d847158c63..94b24d0c7b09 100644 --- a/python/tvm/contrib/spirv.py +++ b/python/tvm/contrib/spirv.py @@ -17,7 +17,7 @@ """Utility for Interacting with SPIRV Tools""" import subprocess import os -from . import util +from . import utils from .._ffi.base import py_str @@ -37,7 +37,7 @@ def optimize(spv_bin): The HSA Code Object """ - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() tmp_in = tmp_dir.relpath("input.spv") tmp_out = tmp_dir.relpath("output.spv") with open(tmp_in, "wb") as out_file: diff --git a/python/tvm/contrib/tar.py b/python/tvm/contrib/tar.py index bcc34a18637c..354887730f46 100644 --- a/python/tvm/contrib/tar.py +++ b/python/tvm/contrib/tar.py @@ -21,7 +21,7 @@ import os import shutil import subprocess -from . import util +from . import utils from .._ffi.base import py_str @@ -38,7 +38,7 @@ def tar(output, files): """ cmd = ["tar"] cmd += ["-czf"] - temp = util.tempdir() + temp = utils.tempdir() fset = set() for fname in files: base = os.path.basename(fname) diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/utils.py similarity index 100% rename from python/tvm/contrib/util.py rename to python/tvm/contrib/utils.py diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py index 13bd74762163..0c0dac1234e8 100644 --- a/python/tvm/contrib/xcode.py +++ b/python/tvm/contrib/xcode.py @@ -23,7 +23,7 @@ import subprocess import json from .._ffi.base import py_str -from . import util +from . 
import utils def xcrun(cmd): @@ -132,7 +132,7 @@ def compile_metal(code, path_target=None, sdk="macosx"): metallib : bytearray The bytearray of the metallib """ - temp = util.tempdir() + temp = utils.tempdir() temp_code = temp.relpath("my_lib.metal") temp_ir = temp.relpath("my_lib.air") temp_target = temp.relpath("my_lib.metallib") @@ -248,7 +248,7 @@ def popen_test_rpc(host, port, key, destination, libs=None, options=None): ) # Lock the path so only one file can run - lock = util.filelock(os.path.join(rpc_root, "ios_rpc.lock")) + lock = utils.filelock(os.path.join(rpc_root, "ios_rpc.lock")) with open(os.path.join(rpc_root, "rpc_config.txt"), "w") as fo: fo.write("%s %d %s\n" % (host, port, key)) diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index 8001ee29f757..eeb5d07fe051 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -26,7 +26,7 @@ from tvm import autotvm from tvm import relay from tvm.contrib import cc -from tvm.contrib import util +from tvm.contrib import utils from . import common, frontends from .main import register_parser @@ -238,7 +238,7 @@ def save_module(module_path, graph, lib, params, cross=None): lib_name = "mod.so" graph_name = "mod.json" param_name = "mod.params" - temp = util.tempdir() + temp = utils.tempdir() path_lib = temp.relpath(lib_name) if not cross: logger.debug("exporting library to %s", path_lib) diff --git a/python/tvm/error.py b/python/tvm/error.py index d7628a735145..819f06475e0a 100644 --- a/python/tvm/error.py +++ b/python/tvm/error.py @@ -50,7 +50,7 @@ def __init__(self, msg): if "TVM hint:" not in msg: msg += ( "\nTVM hint: You hit an internal error. " - + "Please open a thread on https://discuss.tvm.ai/ to report it." + + "Please open a thread on https://discuss.tvm.apache.org/ to report it." ) super(InternalError, self).__init__(msg) diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index 908bc9637dcf..d1a3c4163755 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -21,7 +21,7 @@ import logging import os import re -from tvm.contrib import util +from tvm.contrib import utils _LOG = logging.getLogger(__name__) @@ -32,11 +32,11 @@ class Workspace: def __init__(self, root=None, debug=False): if debug or root is not None: - with util.TempDirectory.set_keep_for_debug(): - self.tempdir = util.tempdir(custom_path=root) + with utils.TempDirectory.set_keep_for_debug(): + self.tempdir = utils.tempdir(custom_path=root) _LOG.info("Created debug mode workspace at: %s", self.tempdir.temp_dir) else: - self.tempdir = util.tempdir() + self.tempdir = utils.tempdir() def relpath(self, path): return self.tempdir.relpath(path) diff --git a/python/tvm/micro/compiler.py b/python/tvm/micro/compiler.py index 307c9809fc21..069f600a823e 100644 --- a/python/tvm/micro/compiler.py +++ b/python/tvm/micro/compiler.py @@ -22,7 +22,7 @@ import os import re -from tvm.contrib import binutil +from tvm.contrib import binutils import tvm.target from . import build from . 
import class_factory @@ -232,13 +232,13 @@ def library(self, output, sources, options=None): output_filename = f"{src_base}.o" output_abspath = os.path.join(output, output_filename) - binutil.run_cmd(args + ["-c", "-o", output_abspath, src]) + binutils.run_cmd(args + ["-c", "-o", output_abspath, src]) outputs.append(output_abspath) output_filename = f"{os.path.basename(output)}.a" output_abspath = os.path.join(output, output_filename) - binutil.run_cmd([prefix + "ar", "-r", output_abspath] + outputs) - binutil.run_cmd([prefix + "ranlib", output_abspath]) + binutils.run_cmd([prefix + "ar", "-r", output_abspath] + outputs) + binutils.run_cmd([prefix + "ranlib", output_abspath]) return tvm.micro.MicroLibrary(output, [output_filename]) @@ -273,7 +273,7 @@ def binary(self, output, objects, options=None, link_main=True, main_options=Non for lib_name in obj.library_files: args.append(obj.abspath(lib_name)) - binutil.run_cmd(args) + binutils.run_cmd(args) return tvm.micro.MicroBinary(output, output_filename, []) @property diff --git a/python/tvm/micro/micro_library.py b/python/tvm/micro/micro_library.py index b2876509708e..74687ede1235 100644 --- a/python/tvm/micro/micro_library.py +++ b/python/tvm/micro/micro_library.py @@ -17,7 +17,7 @@ """Defines an Artifact subclass that describes a compiled static library.""" -from tvm.contrib import util +from tvm.contrib import utils from . import artifact from . import compiler @@ -79,7 +79,7 @@ def create_micro_library(output, objects, options=None): options : Optional[List[str]] If given, additional command-line flags for the compiler. """ - temp_dir = util.tempdir() + temp_dir = utils.tempdir() comp = compiler.DefaultCompiler() output = temp_dir.relpath("micro-library.o") comp.library(output, objects, options=options) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index b0a5e9899b02..0f7875a9202e 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -223,7 +223,7 @@ def _tophub_context(self, target): if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): tophub_context = autotvm.tophub.context(list(target.values())) else: - tophub_context = autotvm.util.EmptyContext() + tophub_context = autotvm.utils.EmptyContext() return tophub_context diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index bd0c3e5f4d73..35bd8e6d3d4d 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -253,7 +253,7 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): tophub_context = autotvm.tophub.context(list(target.values())) else: - tophub_context = autotvm.util.EmptyContext() + tophub_context = autotvm.utils.EmptyContext() with tophub_context: bld_mod = BuildModule() @@ -307,7 +307,7 @@ def optimize(mod, target=None, params=None): if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): tophub_context = autotvm.tophub.context(list(target.values())) else: - tophub_context = autotvm.util.EmptyContext() + tophub_context = autotvm.utils.EmptyContext() with tophub_context: bld_mod = BuildModule() diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index b27c759b8d03..ae51f2155402 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -22,7 +22,7 @@ import tvm from tvm.ir import IRModule -from tvm.topi.util import get_const_tuple +from tvm.topi.utils 
import get_const_tuple from .. import expr as _expr from .. import function as _function diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index a543f78bd949..2242be1bcdeb 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -23,7 +23,7 @@ from tvm.ir import IRModule from tvm import relay -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from .. import analysis from .. import expr as _expr from .. import function as _function diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index ccf644e82c57..fa404efc39cf 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -21,7 +21,7 @@ import numpy as np import tvm from tvm.ir import IRModule -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from ... import nd as _nd from .. import analysis @@ -2649,7 +2649,7 @@ def from_onnx(model, shape=None, dtype="float32", opset=None, freeze_params=Fals retains that dynamism upon import, and the compiler attempts to convert the model into a static shapes at compile time. If this fails, there may still be dynamic operations in the model. Not all TVM kernels currently support - dynamic shapes, please file an issue on discuss.tvm.ai + dynamic shapes, please file an issue on discuss.tvm.apache.org if you hit an error with dynamic kernels. Parameters diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 8d164314ecc8..d8c0769e24ea 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -25,7 +25,7 @@ import numpy as np import tvm -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from .. import analysis as _analysis from .. import expr as _expr diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 89b36256152e..218c6e57f995 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -27,7 +27,7 @@ from tvm.ir import IRModule from tvm.relay.prelude import Prelude, StaticTensorArrayOps, get_tensor_array_shape from tvm.relay.transform import InferType -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from .. import analysis from .. import expr as _expr diff --git a/python/tvm/relay/frontend/tensorflow_parser.py b/python/tvm/relay/frontend/tensorflow_parser.py index 3c1d342ac248..b1b10eb81f56 100644 --- a/python/tvm/relay/frontend/tensorflow_parser.py +++ b/python/tvm/relay/frontend/tensorflow_parser.py @@ -18,7 +18,7 @@ # pylint: disable=import-outside-toplevel, assignment-from-no-return import os -from tvm.contrib import util +from tvm.contrib import utils class TFParser(object): @@ -45,7 +45,7 @@ class TFParser(object): def __init__(self, model_dir, outputs=None): from tensorflow.core.framework import graph_pb2 - self._tmp_dir = util.tempdir() + self._tmp_dir = utils.tempdir() self._model_dir = model_dir self._graph = graph_pb2.GraphDef() self._outputs = outputs or [] diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py index 604098f30ad9..2872640109d3 100644 --- a/python/tvm/relay/op/_reduce.py +++ b/python/tvm/relay/op/_reduce.py @@ -19,7 +19,7 @@ from tvm.runtime import convert from tvm.te.hybrid import script -from tvm.topi.util import get_const_int, get_const_tuple +from tvm.topi.utils import get_const_int, get_const_tuple from . 
import op as _reg _reg.register_reduce_schedule("argmax") diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index 9f4f20c9000c..b070d9f5b3ff 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -16,12 +16,11 @@ # under the License. # pylint: disable=invalid-name, unused-argument """Backend compiler related feature registration""" -from __future__ import absolute_import - -from tvm.topi.nn.util import get_pad_tuple -from tvm.topi.util import get_const_tuple +from tvm.topi.nn.utils import get_pad_tuple +from tvm.topi.utils import get_const_tuple from tvm.error import OpError + from ..expr import Tuple, TupleGetItem, const, Var from ..ty import TensorType from ..loops import while_loop diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 4ee6f2ebb5c1..a4da896e6111 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -22,7 +22,7 @@ from tvm.te.hybrid import script from tvm.runtime import convert from tvm import topi -from tvm.topi.util import get_const_int, get_const_tuple +from tvm.topi.utils import get_const_int, get_const_tuple from . import op as _reg from . import strategy from .op import OpPattern diff --git a/python/tvm/relay/op/dyn/image/_image.py b/python/tvm/relay/op/dyn/image/_image.py index cc0099836e2e..e3415795712e 100644 --- a/python/tvm/relay/op/dyn/image/_image.py +++ b/python/tvm/relay/op/dyn/image/_image.py @@ -21,7 +21,7 @@ import tvm.topi from tvm.runtime import convert from tvm.te.hybrid import script -from tvm.topi.util import nchw_pack_layout, nchw_xc_layout +from tvm.topi.utils import nchw_pack_layout, nchw_xc_layout from ... import op as reg diff --git a/python/tvm/relay/op/image/_image.py b/python/tvm/relay/op/image/_image.py index adbed84713fe..c0cdf64c621a 100644 --- a/python/tvm/relay/op/image/_image.py +++ b/python/tvm/relay/op/image/_image.py @@ -22,7 +22,7 @@ from tvm.runtime import convert from tvm import topi -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from .. import op as reg from .. import strategy from ..op import OpPattern diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index c9926647989e..c235f87d1e99 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -19,7 +19,7 @@ from __future__ import absolute_import from tvm import topi -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from tvm.runtime import convert from tvm.te.hybrid import script diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 1aad4e7125fd..0d012540343f 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -20,7 +20,7 @@ from . 
import _make from ..dyn.nn import _make as _dyn_make -from .util import get_pad_tuple1d, get_pad_tuple2d, get_pad_tuple3d +from .utils import get_pad_tuple1d, get_pad_tuple2d, get_pad_tuple3d from ...expr import const, Expr diff --git a/python/tvm/relay/op/nn/util.py b/python/tvm/relay/op/nn/utils.py similarity index 100% rename from python/tvm/relay/op/nn/util.py rename to python/tvm/relay/op/nn/utils.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 6759a54d0b80..985124e305ee 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -180,7 +180,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): # This schedule has incorrect result on some hardware platforms (like NV Jetson TX2) # Let us comment it out but not remove. # see discussion: - # https://discuss.tvm.ai/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu/6088 + # https://discuss.tvm.apache.org/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu/6088 # strategy.add_implementation( # wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack), # wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack), diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 34d1999707e9..273bee41cf75 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -20,7 +20,7 @@ import re from tvm import topi -from tvm.topi.util import get_const_int, get_const_float, get_const_tuple, get_float_tuple +from tvm.topi.utils import get_const_int, get_const_float, get_const_tuple, get_float_tuple from .. import op as _op from ....target import generic_func, override_native_generic_func diff --git a/python/tvm/relay/op/vision/_rcnn.py b/python/tvm/relay/op/vision/_rcnn.py index a5cc266f1566..46eb3cbc2e53 100644 --- a/python/tvm/relay/op/vision/_rcnn.py +++ b/python/tvm/relay/op/vision/_rcnn.py @@ -17,7 +17,7 @@ # pylint: disable=invalid-name, unused-argument """Faster R-CNN and Mask R-CNN operations.""" from tvm import topi -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from .. import op as reg from .. import strategy from ..op import OpPattern diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index e4a6cbf43383..3f23d6895b43 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -19,7 +19,7 @@ from __future__ import absolute_import as _abs from tvm.relay.expr import Tuple, TupleWrapper -from tvm.relay.op.nn.util import get_pad_tuple2d +from tvm.relay.op.nn.utils import get_pad_tuple2d from . import _make diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index 060547e4c4d7..f0f55f60d0e3 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -422,7 +422,7 @@ def ConvertLayout(desired_layouts): parser and relay.build call. This is very helpful for hardware backends that support/prefer only type of data layout. - RFC - https://discuss.tvm.ai/t/layout-conversion-pass/4009 + RFC - https://discuss.tvm.apache.org/t/layout-conversion-pass/4009 This pass uses most of the AlterOpLayout and InferCorrectLayout infrastructure. We can define new layouts for conv2d ops for now. 
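The hunk above only swaps the RFC link, but the ConvertLayout docstring it sits in is the one substantive piece of API documentation in this patch. A minimal usage sketch of the pass as documented (not part of the patch; the model, shapes, and layout choice are illustrative):

import tvm
from tvm import relay

# Build a small NHWC conv2d so the pass has something to rewrite.
x = relay.var("x", shape=(1, 56, 56, 64))
w = relay.var("w", shape=(3, 3, 64, 32))
y = relay.nn.conv2d(x, w, kernel_size=(3, 3), data_layout="NHWC", kernel_layout="HWIO")
mod = tvm.IRModule.from_expr(relay.Function([x, w], y))

# Request NCHW for conv2d; other operators adapt to their new input layouts.
desired_layouts = {"nn.conv2d": ["NCHW", "default"]}
with tvm.transform.PassContext(opt_level=3):
    mod = relay.transform.ConvertLayout(desired_layouts)(mod)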
Most of the other operators try to adapt to their input diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index b9ad94d87327..a50f3b856800 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -22,7 +22,7 @@ import time import tvm._ffi -from tvm.contrib import util +from tvm.contrib import utils from tvm._ffi.base import TVMError from tvm.runtime import ndarray as nd @@ -244,7 +244,7 @@ def __init__(self): @tvm._ffi.register_func("rpc.PopenSession") def _popen_session(binary): - temp = util.tempdir() + temp = utils.tempdir() if isinstance(binary, (bytes, bytearray)): path_exec = temp.relpath("server.minrpc") diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py index 728723432aa8..9489a734eb8b 100644 --- a/python/tvm/rpc/server.py +++ b/python/tvm/rpc/server.py @@ -42,7 +42,7 @@ from tvm._ffi.base import py_str from tvm._ffi.libinfo import find_lib_path from tvm.runtime.module import load_module as _load_module -from tvm.contrib import util +from tvm.contrib import utils from . import _ffi_api from . import base from .base import TrackerCode @@ -55,7 +55,7 @@ def _server_env(load_library, work_path=None): if work_path: temp = work_path else: - temp = util.tempdir() + temp = utils.tempdir() # pylint: disable=unused-variable @tvm._ffi.register_func("tvm.rpc.server.workpath", override=True) @@ -89,7 +89,7 @@ def download_linked_module(file_name): # Extra dependencies during runtime. from tvm.contrib import cc as _cc, tar as _tar - tar_temp = util.tempdir(custom_path=path.replace(".tar", "")) + tar_temp = utils.tempdir(custom_path=path.replace(".tar", "")) _tar.untar(path, tar_temp.temp_dir) files = [tar_temp.relpath(x) for x in tar_temp.listdir()] _cc.create_shared(path + ".so", files, cc=cc) @@ -230,7 +230,7 @@ def _accept_conn(listen_sock, tracker_conn, ping_period=2): raise exc # step 3: serving - work_path = util.tempdir() + work_path = utils.tempdir() logger.info("connection from %s", addr) server_proc = multiprocessing.Process( target=_serve_loop, args=(conn, addr, load_library, work_path) diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 5b613a60ffc3..9cf636895541 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -278,7 +278,7 @@ def export_library(self, file_name, fcompile=None, addons=None, **kwargs): raise RuntimeError("Cannot call export_library in runtime only mode") # Extra dependencies during runtime. from pathlib import Path - from tvm.contrib import cc as _cc, tar as _tar, util as _util + from tvm.contrib import cc as _cc, tar as _tar, utils as _utils if isinstance(file_name, Path): file_name = str(file_name) @@ -293,7 +293,7 @@ def export_library(self, file_name, fcompile=None, addons=None, **kwargs): return modules = self._collect_dso_modules() - temp = _util.tempdir() + temp = _utils.tempdir() files = addons if addons else [] is_system_lib = False has_c_module = False @@ -409,9 +409,9 @@ def load_module(path, fmt=""): path += ".so" elif path.endswith(".tar"): # Extra dependencies during runtime. 
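Nearly every hunk in this patch is the same mechanical `util` → `utils` import rename. A minimal sketch (not part of the patch; the kernel is a stand-in) of the renamed tvm.contrib.utils doing the tempdir/export/load round trip that rpc/server.py and runtime/module.py rely on:

import tvm
from tvm import te
from tvm.contrib import utils  # previously: from tvm.contrib import util

# Stand-in compute so there is something to export.
n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
mod = tvm.build(te.create_schedule(B.op), [A, B], target="llvm")

temp = utils.tempdir()          # previously: util.tempdir()
path = temp.relpath("mod.so")
mod.export_library(path)
loaded = tvm.runtime.load_module(path)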
- from tvm.contrib import cc as _cc, util as _util, tar as _tar + from tvm.contrib import cc as _cc, utils as _utils, tar as _tar - tar_temp = _util.tempdir(custom_path=path.replace(".tar", "")) + tar_temp = _utils.tempdir(custom_path=path.replace(".tar", "")) _tar.untar(path, tar_temp.temp_dir) files = [tar_temp.relpath(x) for x in tar_temp.listdir()] _cc.create_shared(path + ".so", files, cc=cc) diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index 81a909b2e1c0..448cb137cc9b 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -120,7 +120,7 @@ def save(self): executable = relay.vm.compile(mod, target) code, lib = executable.save() # save and load the code and lib file. - tmp = tvm.contrib.util.tempdir() + tmp = tvm.contrib.utils.tempdir() path_lib = tmp.relpath("lib.so") lib.export_library(path_lib) with open(tmp.relpath("code.ro"), "wb") as fo: diff --git a/python/tvm/te/hybrid/module.py b/python/tvm/te/hybrid/module.py index 2af67853ca5b..beea8844f78c 100644 --- a/python/tvm/te/hybrid/module.py +++ b/python/tvm/te/hybrid/module.py @@ -23,7 +23,7 @@ import ast -from tvm.contrib import util +from tvm.contrib import utils from .utils import _internal_assert from .utils import _is_tvm_arg_types from .parser import source_to_op @@ -48,7 +48,7 @@ def __init__(self, src=None, name=None): """ self.src_ = self.name = self.func_ = self.root_ = None if src is not None: - temp = util.tempdir() + temp = utils.tempdir() dst = temp.relpath("script.py") with open(dst, "w") as f: f.write("import tvm\n@tvm.te.hybrid.script\n%s" % src) diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py index 0b2174cb3cb0..555717854ed6 100644 --- a/python/tvm/topi/__init__.py +++ b/python/tvm/topi/__init__.py @@ -49,7 +49,7 @@ from . import mali from . import bifrost from . import intel_graphics -from . import util +from . import utils from . import rocm from . import vision from . import image @@ -57,7 +57,7 @@ from . import hls # error reporting -from .util import InvalidShapeError +from .utils import InvalidShapeError # not import testing by default # because testing can have extra deps that are not necessary diff --git a/python/tvm/topi/arm_cpu/bitserial_conv2d.py b/python/tvm/topi/arm_cpu/bitserial_conv2d.py index fb22930b5681..88940a000397 100644 --- a/python/tvm/topi/arm_cpu/bitserial_conv2d.py +++ b/python/tvm/topi/arm_cpu/bitserial_conv2d.py @@ -25,8 +25,8 @@ from ..nn.pad import pad from ..nn.bitserial_conv2d import bitserial_conv2d_legalize from ..nn.bitserial_util import bitpack, binary_op_multiplier -from ..nn.util import get_pad_tuple -from ..util import get_const_int, get_const_tuple +from ..nn.utils import get_pad_tuple +from ..utils import get_const_int, get_const_tuple def _kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC, use_bitpack=True): diff --git a/python/tvm/topi/arm_cpu/bitserial_dense.py b/python/tvm/topi/arm_cpu/bitserial_dense.py index 61778b7eb544..8ceab5153889 100644 --- a/python/tvm/topi/arm_cpu/bitserial_dense.py +++ b/python/tvm/topi/arm_cpu/bitserial_dense.py @@ -20,7 +20,7 @@ import tvm from tvm import te from tvm import autotvm -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from .. 
import tag from .bitserial_conv2d import _intrin_popcount from ..nn.pad import pad diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py index b7f94f7e390c..7dbbf9d3d447 100644 --- a/python/tvm/topi/arm_cpu/conv2d.py +++ b/python/tvm/topi/arm_cpu/conv2d.py @@ -23,9 +23,9 @@ from tvm import autotvm import tvm.contrib.nnpack -from ..util import traverse_inline, get_const_tuple +from ..utils import traverse_inline, get_const_tuple from .. import nn -from ..nn.util import get_const_int, get_pad_tuple +from ..nn.utils import get_const_int, get_pad_tuple from ..nn.winograd_util import winograd_transform_matrices from .conv2d_spatial_pack import ( conv2d_spatial_pack_nchw, diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index a64bc413e0c6..c7c572c81110 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -25,7 +25,7 @@ from tvm import autotvm from ..nn import conv2d_alter_layout -from ..util import get_const_tuple +from ..utils import get_const_tuple from ..x86.conv2d import _get_default_config as _get_x86_default_config from .arm_utils import get_tiling_B_interleaved_t diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index b40fb89b5d33..81326f169260 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -21,8 +21,8 @@ from tvm import te from tvm.topi import nn from tvm.autotvm.task.space import AnnotateEntity, ReorderEntity, OtherOptionEntity -from ..util import get_const_tuple, get_const_int -from ..nn.util import get_pad_tuple +from ..utils import get_const_tuple, get_const_int +from ..nn.utils import get_pad_tuple from .tensor_intrin import ( gemm_quantized, gemm_quantized_impl, diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 43fe80178bd3..445b9ec0c113 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -19,7 +19,7 @@ from tvm import te from tvm import autotvm from .. import tag -from ..util import traverse_inline, get_const_tuple +from ..utils import traverse_inline, get_const_tuple from ..generic import conv2d as conv2d_generic from .. import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py index 2e664433c15b..f4cd9d899b73 100644 --- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py +++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py @@ -21,8 +21,8 @@ from tvm import te from tvm import autotvm from .. 
import nn -from ..util import get_const_tuple -from ..nn.util import get_const_int, get_pad_tuple +from ..utils import get_const_tuple +from ..nn.utils import get_const_int, get_pad_tuple def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile): diff --git a/python/tvm/topi/arm_cpu/conv2d_transpose.py b/python/tvm/topi/arm_cpu/conv2d_transpose.py index bab32ab9def6..c9f1e1efddfc 100644 --- a/python/tvm/topi/arm_cpu/conv2d_transpose.py +++ b/python/tvm/topi/arm_cpu/conv2d_transpose.py @@ -23,7 +23,7 @@ from tvm import autotvm from ..nn import dilate, pad, get_pad_tuple -from ..util import get_const_tuple, traverse_inline +from ..utils import get_const_tuple, traverse_inline from .conv2d_spatial_pack import schedule_conv2d_spatial_pack_nchw diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py index b084066fc152..4f721da5420c 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py +++ b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py @@ -21,7 +21,7 @@ from tvm import autotvm from tvm.autotvm.task import deserialize_args from tvm.topi.nn.conv2d import conv2d_nchw, conv2d_nhwc -from tvm.topi.util import get_const_tuple, get_const_int, traverse_inline +from tvm.topi.utils import get_const_tuple, get_const_int, traverse_inline def conv2d_direct(*args, **kwargs): diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py index 61dca8a37962..988c3a99c059 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py +++ b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py @@ -20,9 +20,9 @@ from tvm import autotvm from tvm.autotvm.task import deserialize_args from tvm import te -from tvm.topi.util import simplify, traverse_inline +from tvm.topi.utils import simplify, traverse_inline from tvm.topi.nn.pad import pad -from tvm.topi.nn.util import get_pad_tuple +from tvm.topi.nn.utils import get_pad_tuple from ..micro_kernel.gemm import ( intrin_gemm_MxKxN, diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py index b71c0c92864c..3c32d3e1f3f2 100644 --- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py +++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py @@ -23,8 +23,8 @@ from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from .. import nn -from ..util import traverse_inline, get_const_tuple, get_const_int -from ..nn.util import get_pad_tuple +from ..utils import traverse_inline, get_const_tuple, get_const_int +from ..nn.utils import get_pad_tuple @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu") @@ -151,7 +151,7 @@ def _callback(op): # This schedule has incorrect result on some hardware platforms (like NV Jetson TX2) # Let us comment it out but not remove. 
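The TOPI hunks all follow the same pattern, moving `..util` to `..utils` and `..nn.util` to `..nn.utils`. A minimal sketch of the two most commonly renamed helpers under their new paths (not part of the patch; shapes and padding are illustrative):

from tvm import te
from tvm.topi.utils import get_const_tuple    # previously: tvm.topi.util
from tvm.topi.nn.utils import get_pad_tuple   # previously: tvm.topi.nn.util

data = te.placeholder((1, 32, 56, 56), name="data")
# Constant shapes come back as a tuple of Python ints.
n, c, h, w = get_const_tuple(data.shape)
# (pad_top, pad_left, pad_down, pad_right) for a 3x3 kernel with SAME padding.
pads = get_pad_tuple("SAME", (3, 3))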
# see discussion: -# https://discuss.tvm.ai/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu/6088 +# https://discuss.tvm.apache.org/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu/6088 @autotvm.register_topi_compute("depthwise_conv2d_nchw_spatial_pack.arm_cpu") def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): """TOPI compute callback for depthwise_conv2d nchw diff --git a/python/tvm/topi/arm_cpu/injective.py b/python/tvm/topi/arm_cpu/injective.py index aec86bc6d525..55f47c5dee4d 100644 --- a/python/tvm/topi/arm_cpu/injective.py +++ b/python/tvm/topi/arm_cpu/injective.py @@ -18,7 +18,7 @@ """Schedule for pooling operators""" import tvm from tvm import te -from ..util import is_empty_shape +from ..utils import is_empty_shape def schedule_injective_from_existing(sch, out): diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 73cfacb62079..196f788e6b8c 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -19,7 +19,7 @@ import tvm from tvm import te -from tvm.contrib import util, clang +from tvm.contrib import utils, clang def gemm_quantized_4_4_batched(): @@ -372,7 +372,7 @@ def gemm_quantized_impl(M, N, K, unroll, interleave, data_type="uint8"): cc_code = cc_code.replace("umull", "smull") cc_code = cc_code.replace("uadalp", "sadalp") - temp = util.tempdir() + temp = utils.tempdir() ll_path = temp.relpath("temp.ll") # Create LLVM ir from c source code ll_code = clang.create_llvm( diff --git a/python/tvm/topi/bifrost/conv2d.py b/python/tvm/topi/bifrost/conv2d.py index a3be906c250f..3b6cca6aaea4 100644 --- a/python/tvm/topi/bifrost/conv2d.py +++ b/python/tvm/topi/bifrost/conv2d.py @@ -25,7 +25,7 @@ from .gemm import decl_winograd_gemm, schedule_gemm from .transforms import tile_and_bind, tile_and_bind3d -from ..util import traverse_inline, get_const_int, get_const_tuple +from ..utils import traverse_inline, get_const_int, get_const_tuple from .. import nn from ..nn.winograd_util import winograd_transform_matrices diff --git a/python/tvm/topi/bifrost/dense.py b/python/tvm/topi/bifrost/dense.py index 85703f19ca2f..9ab8b4ebea62 100644 --- a/python/tvm/topi/bifrost/dense.py +++ b/python/tvm/topi/bifrost/dense.py @@ -20,7 +20,7 @@ from tvm import autotvm from .. import nn -from ..util import traverse_inline +from ..utils import traverse_inline @autotvm.register_topi_compute("dense.bifrost") diff --git a/python/tvm/topi/bifrost/depthwise_conv2d.py b/python/tvm/topi/bifrost/depthwise_conv2d.py index 35da5a594ec2..625c274213ad 100644 --- a/python/tvm/topi/bifrost/depthwise_conv2d.py +++ b/python/tvm/topi/bifrost/depthwise_conv2d.py @@ -22,7 +22,7 @@ import tvm from tvm import te -from .. import util +from .. import utils from .. 
import tag @@ -70,12 +70,12 @@ def tile_and_bind3d(tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): VH = 1 VW = 1 num_thread = 4 - while util.get_const_int(conv.shape[3]) % (VW * 2) == 0 and VW * 2 <= 4: + while utils.get_const_int(conv.shape[3]) % (VW * 2) == 0 and VW * 2 <= 4: VW = VW * 2 - while util.get_const_int(conv.shape[2]) % (VH * 2) == 0 and VH * 2 <= 2: + while utils.get_const_int(conv.shape[2]) % (VH * 2) == 0 and VH * 2 <= 2: VH = VH * 2 if raw_data.dtype == "float16": - if util.get_const_int(conv.shape[3]) % (VW * 2) == 0: + if utils.get_const_int(conv.shape[3]) % (VW * 2) == 0: VW *= 2 num_thread *= 2 else: diff --git a/python/tvm/topi/bifrost/gemm.py b/python/tvm/topi/bifrost/gemm.py index c06f62323817..6224493109ef 100644 --- a/python/tvm/topi/bifrost/gemm.py +++ b/python/tvm/topi/bifrost/gemm.py @@ -19,7 +19,7 @@ from tvm import te from .transforms import tile_and_bind, tile_and_bind3d, interleave_transpose, transpose_interleave -from .. import util +from .. import utils def decl_gemm(cfg, A, B): @@ -50,10 +50,10 @@ def decl_gemm(cfg, A, B): cfg.define_knob("split_k_factor", [1, 4, 16]) # Mutual k axis must be of equal extent - assert util.get_const_int(A.shape[1]) == util.get_const_int(B.shape[0]) + assert utils.get_const_int(A.shape[1]) == utils.get_const_int(B.shape[0]) n = A.shape[0] m = B.shape[1] - k_size = util.get_const_int(A.shape[1]) + k_size = utils.get_const_int(A.shape[1]) unroll_gemm = cfg["split_k_factor"].val if unroll_gemm == 1: # No unrolling case must have the same set of tensors to keep scheduling consistent @@ -120,8 +120,8 @@ def decl_batched_gemm(cfg, A, B): """ # Mutual b and k axis must be of equal extent - assert util.get_const_int(A.shape[2]) == util.get_const_int(B.shape[1]) - assert util.get_const_int(A.shape[0]) == util.get_const_int(B.shape[0]) + assert utils.get_const_int(A.shape[2]) == utils.get_const_int(B.shape[1]) + assert utils.get_const_int(A.shape[0]) == utils.get_const_int(B.shape[0]) cfg.define_knob("work_group_x", [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 64]) cfg.define_knob("work_group_y", [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 64]) @@ -131,8 +131,8 @@ def decl_batched_gemm(cfg, A, B): n = A.shape[1] m = B.shape[2] - k_size = util.get_const_int(A.shape[2]) - b_size = util.get_const_int(A.shape[0]) + k_size = utils.get_const_int(A.shape[2]) + b_size = utils.get_const_int(A.shape[0]) # Declare a batched GEMM k = te.reduce_axis((0, k_size), name="k") @@ -163,9 +163,9 @@ def decl_winograd_gemm(cfg, A, B): ------- """ - alpha = util.get_const_int(A.shape[0]) - n = util.get_const_int(A.shape[2]) - k = util.get_const_int(A.shape[3]) + alpha = utils.get_const_int(A.shape[0]) + n = utils.get_const_int(A.shape[2]) + k = utils.get_const_int(A.shape[3]) A_3D = te.compute( (alpha * alpha, n, k), lambda b, i, j: A[b // alpha][b % alpha][i][j], name="A_3D" diff --git a/python/tvm/topi/cpp/__init__.py b/python/tvm/topi/cpp/__init__.py index 62e274c4e768..bad6f0e8d452 100644 --- a/python/tvm/topi/cpp/__init__.py +++ b/python/tvm/topi/cpp/__init__.py @@ -23,4 +23,4 @@ from . import x86 from . import generic from . import rocm -from . import util +from . 
import utils diff --git a/python/tvm/topi/cpp/util.py b/python/tvm/topi/cpp/utils.py similarity index 93% rename from python/tvm/topi/cpp/util.py rename to python/tvm/topi/cpp/utils.py index ca0b86e5a353..60a2747f9abb 100644 --- a/python/tvm/topi/cpp/util.py +++ b/python/tvm/topi/cpp/utils.py @@ -17,4 +17,4 @@ """FFI for TOPI utility functions""" import tvm._ffi -tvm._ffi._init_api("topi.util", "tvm.topi.cpp.util") +tvm._ffi._init_api("topi.utils", "tvm.topi.cpp.utils") diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index ee94420066dd..8d34b2996593 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -22,7 +22,7 @@ from tvm.contrib import cublas from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from .. import nn -from ..util import traverse_inline, get_const_tuple, get_max_power2_factor +from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor @autotvm.register_topi_compute("batch_matmul.cuda") diff --git a/python/tvm/topi/cuda/conv1d.py b/python/tvm/topi/cuda/conv1d.py index 416e4803a7f0..e50913d88df2 100644 --- a/python/tvm/topi/cuda/conv1d.py +++ b/python/tvm/topi/cuda/conv1d.py @@ -21,7 +21,7 @@ from tvm import autotvm from .. import nn -from ..util import traverse_inline, get_const_tuple +from ..utils import traverse_inline, get_const_tuple @autotvm.register_topi_compute("conv1d_ncw.cuda") diff --git a/python/tvm/topi/cuda/conv1d_transpose_ncw.py b/python/tvm/topi/cuda/conv1d_transpose_ncw.py index c827007f3aec..1ddbdcca9b36 100644 --- a/python/tvm/topi/cuda/conv1d_transpose_ncw.py +++ b/python/tvm/topi/cuda/conv1d_transpose_ncw.py @@ -21,7 +21,7 @@ from tvm import te from tvm import autotvm from .. import nn -from ..util import get_const_tuple, traverse_inline +from ..utils import get_const_tuple, traverse_inline @autotvm.task.register_topi_compute("conv1d_transpose_nchw.cuda") diff --git a/python/tvm/topi/cuda/conv2d.py b/python/tvm/topi/cuda/conv2d.py index cf335acfb98d..ce9cebc3c963 100644 --- a/python/tvm/topi/cuda/conv2d.py +++ b/python/tvm/topi/cuda/conv2d.py @@ -22,8 +22,8 @@ from tvm.contrib import cudnn from .. import nn, generic -from ..nn.util import get_pad_tuple -from ..util import get_const_tuple, traverse_inline +from ..nn.utils import get_pad_tuple +from ..utils import get_const_tuple, traverse_inline from .conv2d_direct import schedule_direct_cuda from .conv2d_nhwc import schedule_conv2d_nhwc_direct diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py index 9bac87c32cff..609ead3e6398 100644 --- a/python/tvm/topi/cuda/conv2d_alter_op.py +++ b/python/tvm/topi/cuda/conv2d_alter_op.py @@ -24,7 +24,7 @@ from tvm import autotvm from .. 
import nn -from ..util import get_const_tuple +from ..utils import get_const_tuple from .conv2d_winograd import _infer_tile_size from ..nn import conv2d_legalize diff --git a/python/tvm/topi/cuda/conv2d_direct.py b/python/tvm/topi/cuda/conv2d_direct.py index e1f3d82cb3e9..2dc6635e680e 100644 --- a/python/tvm/topi/cuda/conv2d_direct.py +++ b/python/tvm/topi/cuda/conv2d_direct.py @@ -19,7 +19,7 @@ import tvm from tvm import te from tvm import autotvm -from ..util import get_const_tuple +from ..utils import get_const_tuple def schedule_direct_cuda(cfg, s, conv): diff --git a/python/tvm/topi/cuda/conv2d_hwnc_tensorcore.py b/python/tvm/topi/cuda/conv2d_hwnc_tensorcore.py index db5a6c9863f2..e2d3cd927a6e 100644 --- a/python/tvm/topi/cuda/conv2d_hwnc_tensorcore.py +++ b/python/tvm/topi/cuda/conv2d_hwnc_tensorcore.py @@ -22,9 +22,9 @@ from tvm import autotvm from tvm.target import Target from tvm.topi.cuda.injective import schedule_injective_from_existing -from ..util import get_const_tuple, traverse_inline, simplify, tag +from ..utils import get_const_tuple, traverse_inline, simplify, tag from ..nn.pad import pad -from ..nn.util import get_pad_tuple +from ..nn.utils import get_pad_tuple from .tensor_intrin import intrin_wmma_load_matrix_A from .tensor_intrin import intrin_wmma_load_matrix_W from .tensor_intrin import intrin_wmma_store_matrix diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py index deeec50f6d71..50a0e8b71661 100644 --- a/python/tvm/topi/cuda/conv2d_int8.py +++ b/python/tvm/topi/cuda/conv2d_int8.py @@ -25,8 +25,8 @@ from .tensor_intrin import dp4a from ..nn.pad import pad from ..nn.conv2d import unpack_NCHWc_to_nchw -from ..nn.util import get_pad_tuple -from ..util import get_const_tuple, traverse_inline +from ..nn.utils import get_pad_tuple +from ..utils import get_const_tuple, traverse_inline def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype="int32"): diff --git a/python/tvm/topi/cuda/conv2d_nhwc.py b/python/tvm/topi/cuda/conv2d_nhwc.py index b25634586a69..a08d217696e2 100644 --- a/python/tvm/topi/cuda/conv2d_nhwc.py +++ b/python/tvm/topi/cuda/conv2d_nhwc.py @@ -19,7 +19,7 @@ import tvm from tvm import te from tvm import autotvm -from ..util import get_const_tuple +from ..utils import get_const_tuple def schedule_conv2d_nhwc_direct(cfg, s, Conv): diff --git a/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py b/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py index a33092d2ff22..f665cc779dc5 100644 --- a/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py +++ b/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py @@ -21,9 +21,9 @@ import tvm from tvm import te from tvm import autotvm -from ..util import get_const_tuple, traverse_inline, simplify +from ..utils import get_const_tuple, traverse_inline, simplify from ..nn.pad import pad -from ..nn.util import get_pad_tuple +from ..nn.utils import get_pad_tuple from .tensor_intrin import intrin_wmma_load_matrix_A from .tensor_intrin import intrin_wmma_load_matrix_W from .tensor_intrin import intrin_wmma_store_matrix diff --git a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py b/python/tvm/topi/cuda/conv2d_nhwc_winograd.py index 246437a26146..1e368f585354 100644 --- a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py +++ b/python/tvm/topi/cuda/conv2d_nhwc_winograd.py @@ -23,7 +23,7 @@ from tvm import te from tvm import autotvm from .. 
import nn -from ..util import get_const_int, get_const_tuple, traverse_inline +from ..utils import get_const_int, get_const_tuple, traverse_inline from ..nn.winograd_util import winograd_transform_matrices from .tensor_intrin import intrin_wmma_load_matrix_A from .tensor_intrin import intrin_wmma_load_matrix_W diff --git a/python/tvm/topi/cuda/conv2d_transpose_nchw.py b/python/tvm/topi/cuda/conv2d_transpose_nchw.py index 915e6cdecae2..609d1acc78bd 100644 --- a/python/tvm/topi/cuda/conv2d_transpose_nchw.py +++ b/python/tvm/topi/cuda/conv2d_transpose_nchw.py @@ -22,7 +22,7 @@ from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from .. import nn -from ..util import get_const_tuple, traverse_inline +from ..utils import get_const_tuple, traverse_inline @autotvm.register_topi_compute("conv2d_transpose_nchw.cuda") diff --git a/python/tvm/topi/cuda/conv2d_winograd.py b/python/tvm/topi/cuda/conv2d_winograd.py index 11502e134fd5..407f05e64912 100644 --- a/python/tvm/topi/cuda/conv2d_winograd.py +++ b/python/tvm/topi/cuda/conv2d_winograd.py @@ -23,7 +23,7 @@ from tvm import autotvm from .. import nn -from ..util import get_const_int, get_const_tuple, traverse_inline +from ..utils import get_const_int, get_const_tuple, traverse_inline from ..nn.winograd_util import winograd_transform_matrices diff --git a/python/tvm/topi/cuda/conv3d.py b/python/tvm/topi/cuda/conv3d.py index 98f351bb53d4..e5a3a53a89ff 100644 --- a/python/tvm/topi/cuda/conv3d.py +++ b/python/tvm/topi/cuda/conv3d.py @@ -21,7 +21,7 @@ from tvm.contrib import cudnn from .. import nn, generic -from ..util import get_const_tuple, traverse_inline +from ..utils import get_const_tuple, traverse_inline from .conv3d_direct import schedule_direct_conv3d_cuda diff --git a/python/tvm/topi/cuda/conv3d_alter_op.py b/python/tvm/topi/cuda/conv3d_alter_op.py index 2dfba508e281..faf73e77255a 100644 --- a/python/tvm/topi/cuda/conv3d_alter_op.py +++ b/python/tvm/topi/cuda/conv3d_alter_op.py @@ -24,7 +24,7 @@ from tvm import autotvm from .. 
import nn -from ..util import get_const_tuple +from ..utils import get_const_tuple from .conv3d_winograd import _infer_tile_size logger = logging.getLogger("topi") diff --git a/python/tvm/topi/cuda/conv3d_direct.py b/python/tvm/topi/cuda/conv3d_direct.py index aa13e6b9a0f4..faccb75badd6 100644 --- a/python/tvm/topi/cuda/conv3d_direct.py +++ b/python/tvm/topi/cuda/conv3d_direct.py @@ -19,7 +19,7 @@ import tvm from tvm import te from tvm import autotvm -from ..util import get_const_tuple +from ..utils import get_const_tuple def schedule_direct_conv3d_cuda(cfg, s, conv, layout, workload_name): diff --git a/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py b/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py index b253130268aa..a5c4e81a4dc3 100644 --- a/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py +++ b/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py @@ -21,9 +21,9 @@ import tvm from tvm import te from tvm import autotvm -from ..util import get_const_tuple, traverse_inline, simplify +from ..utils import get_const_tuple, traverse_inline, simplify from ..nn.pad import pad -from ..nn.util import get_pad_tuple3d +from ..nn.utils import get_pad_tuple3d from .tensor_intrin import intrin_wmma_load_matrix_A from .tensor_intrin import intrin_wmma_load_matrix_W from .tensor_intrin import intrin_wmma_store_matrix diff --git a/python/tvm/topi/cuda/conv3d_transpose_ncdhw.py b/python/tvm/topi/cuda/conv3d_transpose_ncdhw.py index 69c0e0f733ea..3ad85b9bbee7 100644 --- a/python/tvm/topi/cuda/conv3d_transpose_ncdhw.py +++ b/python/tvm/topi/cuda/conv3d_transpose_ncdhw.py @@ -21,7 +21,7 @@ from tvm import te from tvm import autotvm from .. import nn -from ..util import get_const_tuple, traverse_inline +from ..utils import get_const_tuple, traverse_inline from .conv3d_direct import schedule_direct_conv3d_cuda diff --git a/python/tvm/topi/cuda/conv3d_winograd.py b/python/tvm/topi/cuda/conv3d_winograd.py index 7f4f13979976..2134ee9178b8 100644 --- a/python/tvm/topi/cuda/conv3d_winograd.py +++ b/python/tvm/topi/cuda/conv3d_winograd.py @@ -23,7 +23,7 @@ from tvm import autotvm from .. import nn -from ..util import get_const_int, get_const_tuple, traverse_inline, simplify +from ..utils import get_const_int, get_const_tuple, traverse_inline, simplify from ..nn.winograd_util import winograd_transform_matrices logger = logging.getLogger("conv3d_winograd") diff --git a/python/tvm/topi/cuda/correlation.py b/python/tvm/topi/cuda/correlation.py index 12f564409c12..9b1698329fd3 100644 --- a/python/tvm/topi/cuda/correlation.py +++ b/python/tvm/topi/cuda/correlation.py @@ -20,7 +20,7 @@ from tvm import autotvm from .. import nn -from ..util import traverse_inline +from ..utils import traverse_inline @autotvm.register_topi_compute("correlation_nchw.cuda") diff --git a/python/tvm/topi/cuda/deformable_conv2d.py b/python/tvm/topi/cuda/deformable_conv2d.py index 365fde541501..911588cad5a3 100644 --- a/python/tvm/topi/cuda/deformable_conv2d.py +++ b/python/tvm/topi/cuda/deformable_conv2d.py @@ -20,7 +20,7 @@ from tvm import te from tvm import autotvm from .. import nn -from ..util import traverse_inline +from ..utils import traverse_inline @autotvm.register_topi_compute("deformable_conv2d_nchw.cuda") diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 727992d76529..47b9db4f390a 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -25,7 +25,7 @@ from .. import nn from .. import tag from .. 
import generic -from ..util import traverse_inline, get_const_tuple +from ..utils import traverse_inline, get_const_tuple logger = logging.getLogger("topi") diff --git a/python/tvm/topi/cuda/dense_tensorcore.py b/python/tvm/topi/cuda/dense_tensorcore.py index 99f28a1fc4e6..a59ebd7347bb 100644 --- a/python/tvm/topi/cuda/dense_tensorcore.py +++ b/python/tvm/topi/cuda/dense_tensorcore.py @@ -21,7 +21,7 @@ from tvm import te import tvm.autotvm as autotvm from .. import tag -from ..util import traverse_inline, get_const_tuple +from ..utils import traverse_inline, get_const_tuple from .tensor_intrin import ( intrin_wmma_load_matrix_A, intrin_wmma_load_matrix_W, diff --git a/python/tvm/topi/cuda/depthwise_conv2d.py b/python/tvm/topi/cuda/depthwise_conv2d.py index 2908439f0a20..90a7371cb70b 100644 --- a/python/tvm/topi/cuda/depthwise_conv2d.py +++ b/python/tvm/topi/cuda/depthwise_conv2d.py @@ -19,7 +19,7 @@ import tvm from tvm import te from tvm import autotvm -from ..util import traverse_inline +from ..utils import traverse_inline from .. import tag from .. import nn diff --git a/python/tvm/topi/cuda/group_conv2d_nchw.py b/python/tvm/topi/cuda/group_conv2d_nchw.py index 35d511977fa9..2af011700235 100644 --- a/python/tvm/topi/cuda/group_conv2d_nchw.py +++ b/python/tvm/topi/cuda/group_conv2d_nchw.py @@ -23,8 +23,8 @@ from .injective import schedule_injective_from_existing from .tensor_intrin import dp4a from ..nn.pad import pad -from ..nn.util import get_pad_tuple -from ..util import traverse_inline, get_const_tuple, get_const_int +from ..nn.utils import get_pad_tuple +from ..utils import traverse_inline, get_const_tuple, get_const_int from .. import nn diff --git a/python/tvm/topi/cuda/injective.py b/python/tvm/topi/cuda/injective.py index 8a5f618f76ab..60fb12e4975e 100644 --- a/python/tvm/topi/cuda/injective.py +++ b/python/tvm/topi/cuda/injective.py @@ -18,7 +18,7 @@ """Schedule for composition of injective operator""" import tvm from tvm import te -from .. import util +from .. import utils def schedule_injective_from_existing(sch, out): @@ -45,7 +45,7 @@ def schedule_injective_from_existing(sch, out): vector_width = 4 if out.dtype == "float16" else 1 try: - const_size = util.get_const_int(util.prod(out.shape)) + const_size = utils.get_const_int(utils.prod(out.shape)) need_block_split = const_size > max_block * num_thread * vector_width except ValueError: need_block_split = False @@ -87,7 +87,7 @@ def schedule_injective(outs): tvm.te.schedule.AutoInlineInjective(s) for out in outs: - if not util.is_empty_shape(out.shape): + if not utils.is_empty_shape(out.shape): schedule_injective_from_existing(s, out) return s diff --git a/python/tvm/topi/cuda/pooling.py b/python/tvm/topi/cuda/pooling.py index a3caf5f45151..f2a6aadb659f 100644 --- a/python/tvm/topi/cuda/pooling.py +++ b/python/tvm/topi/cuda/pooling.py @@ -19,7 +19,7 @@ import tvm from tvm import te from .. 
import tag -from ..util import traverse_inline +from ..utils import traverse_inline def schedule_adaptive_pool(outs, layout="NCHW"): diff --git a/python/tvm/topi/cuda/rcnn/proposal.py b/python/tvm/topi/cuda/rcnn/proposal.py index 119b7bd2a74f..5b7884c7363b 100644 --- a/python/tvm/topi/cuda/rcnn/proposal.py +++ b/python/tvm/topi/cuda/rcnn/proposal.py @@ -20,7 +20,7 @@ import tvm from tvm import te from ...vision.rcnn import generate_anchor, reg_bbox, reg_iou -from ...util import get_const_tuple, get_const_int +from ...utils import get_const_tuple, get_const_int def predict_bbox_ir( diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py index 3fd6fbebc62f..d125423968a9 100644 --- a/python/tvm/topi/cuda/sparse.py +++ b/python/tvm/topi/cuda/sparse.py @@ -23,7 +23,7 @@ from tvm import relay, te from .. import nn -from ..util import traverse_inline +from ..utils import traverse_inline def sparse_dense(data, weight_data, weight_indices, weight_indptr): diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 122d7d27b60d..f23cff3bef84 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -20,7 +20,7 @@ from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from ..util import get_const_tuple +from ..utils import get_const_tuple def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): diff --git a/python/tvm/topi/image/dilation2d.py b/python/tvm/topi/image/dilation2d.py index b3887822c452..7aad50623164 100644 --- a/python/tvm/topi/image/dilation2d.py +++ b/python/tvm/topi/image/dilation2d.py @@ -19,9 +19,9 @@ """Dilation2D operators""" from __future__ import absolute_import as _abs from tvm import te -from tvm.topi.util import simplify +from tvm.topi.utils import simplify from ..nn.pad import pad -from ..nn.util import get_pad_tuple +from ..nn.utils import get_pad_tuple def dilation2d_nchw(input, filter, stride, padding, dilations, out_dtype=None): diff --git a/python/tvm/topi/image/resize.py b/python/tvm/topi/image/resize.py index ca9904492239..103850de4923 100644 --- a/python/tvm/topi/image/resize.py +++ b/python/tvm/topi/image/resize.py @@ -19,7 +19,7 @@ from __future__ import absolute_import import tvm from tvm import te -from tvm.topi.util import nchw_pack_layout, nchw_xc_layout +from tvm.topi.utils import nchw_pack_layout, nchw_xc_layout from .. import tag diff --git a/python/tvm/topi/intel_graphics/conv2d.py b/python/tvm/topi/intel_graphics/conv2d.py index 340a6ccc8d23..fa1fd776b79c 100644 --- a/python/tvm/topi/intel_graphics/conv2d.py +++ b/python/tvm/topi/intel_graphics/conv2d.py @@ -25,8 +25,8 @@ from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from .. import nn -from .. import util -from ..util import simplify, get_const_tuple, traverse_inline +from .. 
import utils +from ..utils import simplify, get_const_tuple, traverse_inline def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False): @@ -486,8 +486,8 @@ def _callback(op): def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype="float16"): - batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] - num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] + batch, in_channel, in_height, in_width = [utils.get_const_int(x) for x in data.shape] + num_filter, channel, kernel_h, kernel_w = [utils.get_const_int(x) for x in kernel.shape] pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_h, kernel_w)) if isinstance(stride, (tuple, list)): @@ -573,7 +573,7 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype="float16"): def _schedule_cl_spatialpack(s, op): output = op.output(0) - _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] + _, _, out_height, out_width = [utils.get_const_int(x) for x in output.shape] conv = op.input_tensors[0] temp = s[conv].op.input_tensors[0] @@ -583,7 +583,7 @@ def _schedule_cl_spatialpack(s, op): conv_L = s.cache_write(conv, "local") kernel_L = s.cache_read(kernel_vec, "local", [conv_L]) - _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape] + _, in_channel, temp_h, temp_w = [utils.get_const_int(x) for x in temp.shape] attrs = s[conv].op.attrs OUTPUT_BLOCK_HEIGHT = attrs["block_h"] diff --git a/python/tvm/topi/intel_graphics/conv2d_alter_op.py b/python/tvm/topi/intel_graphics/conv2d_alter_op.py index 46802bba806f..0b59a849c2c9 100644 --- a/python/tvm/topi/intel_graphics/conv2d_alter_op.py +++ b/python/tvm/topi/intel_graphics/conv2d_alter_op.py @@ -22,7 +22,7 @@ from tvm import relay from tvm import autotvm -from ..util import get_const_tuple +from ..utils import get_const_tuple from ..nn import conv2d_alter_layout, conv2d_infer_layout from .conv2d import _get_default_config diff --git a/python/tvm/topi/intel_graphics/depthwise_conv2d.py b/python/tvm/topi/intel_graphics/depthwise_conv2d.py index e2367798d6cb..fabd63b8778c 100644 --- a/python/tvm/topi/intel_graphics/depthwise_conv2d.py +++ b/python/tvm/topi/intel_graphics/depthwise_conv2d.py @@ -19,7 +19,7 @@ import tvm from tvm import te from tvm import autotvm -from ..util import traverse_inline +from ..utils import traverse_inline from .. import tag from .. import nn from ..nn.depthwise_conv2d import depthwise_conv2d_infer_layout diff --git a/python/tvm/topi/mali/conv2d.py b/python/tvm/topi/mali/conv2d.py index 0ccf1e671e8c..eb4005eb37c7 100644 --- a/python/tvm/topi/mali/conv2d.py +++ b/python/tvm/topi/mali/conv2d.py @@ -22,7 +22,7 @@ from tvm import autotvm from tvm.autotvm.task.space import get_factors -from ..util import traverse_inline, get_const_int, get_const_tuple +from ..utils import traverse_inline, get_const_int, get_const_tuple from .. import nn from ..nn.winograd_util import winograd_transform_matrices diff --git a/python/tvm/topi/mali/dense.py b/python/tvm/topi/mali/dense.py index 7605acebe7c6..53f76219bacd 100644 --- a/python/tvm/topi/mali/dense.py +++ b/python/tvm/topi/mali/dense.py @@ -20,7 +20,7 @@ from tvm import autotvm from .. 
import nn -from ..util import traverse_inline +from ..utils import traverse_inline @autotvm.register_topi_compute("dense.mali") diff --git a/python/tvm/topi/mali/depthwise_conv2d.py b/python/tvm/topi/mali/depthwise_conv2d.py index b64135c969e6..55fcb1de9c4a 100644 --- a/python/tvm/topi/mali/depthwise_conv2d.py +++ b/python/tvm/topi/mali/depthwise_conv2d.py @@ -22,7 +22,7 @@ from tvm import autotvm from .. import nn -from ..util import traverse_inline +from ..utils import traverse_inline # register original implementation of depthwise_conv2d_nchw since we don't need to change this part @autotvm.register_topi_compute("depthwise_conv2d_nchw.mali") diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py index 9b926a1182d8..6e60f27eab5d 100644 --- a/python/tvm/topi/nn/batch_matmul.py +++ b/python/tvm/topi/nn/batch_matmul.py @@ -17,7 +17,7 @@ """Binary Neural Network (BNN) Operators""" # pylint: disable=invalid-name from tvm import te -from ..util import get_const_tuple +from ..utils import get_const_tuple def batch_matmul(x, y, oshape=None): diff --git a/python/tvm/topi/nn/bitserial_conv2d.py b/python/tvm/topi/nn/bitserial_conv2d.py index d10451902789..78d05d027659 100644 --- a/python/tvm/topi/nn/bitserial_conv2d.py +++ b/python/tvm/topi/nn/bitserial_conv2d.py @@ -20,9 +20,9 @@ import tvm from tvm import te from .pad import pad -from .util import get_pad_tuple +from .utils import get_pad_tuple from .bitserial_util import bitpack -from ..util import get_const_tuple +from ..utils import get_const_tuple def bitserial_conv2d_nchw( diff --git a/python/tvm/topi/nn/bitserial_dense.py b/python/tvm/topi/nn/bitserial_dense.py index 0b86e2e17392..32154ac81910 100644 --- a/python/tvm/topi/nn/bitserial_dense.py +++ b/python/tvm/topi/nn/bitserial_dense.py @@ -19,7 +19,7 @@ from __future__ import absolute_import import tvm from tvm import te -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from .bitserial_util import bitpack diff --git a/python/tvm/topi/nn/bitserial_util.py b/python/tvm/topi/nn/bitserial_util.py index ae43668484b3..3a55422493d4 100644 --- a/python/tvm/topi/nn/bitserial_util.py +++ b/python/tvm/topi/nn/bitserial_util.py @@ -20,7 +20,7 @@ import tvm from tvm import te from tvm.topi.transform import concatenate -from ..util import get_const_int +from ..utils import get_const_int def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): diff --git a/python/tvm/topi/nn/bnn.py b/python/tvm/topi/nn/bnn.py index 6c36b375f157..50539de10052 100644 --- a/python/tvm/topi/nn/bnn.py +++ b/python/tvm/topi/nn/bnn.py @@ -19,7 +19,7 @@ import tvm from tvm import te from .. 
import tag -from ..util import simplify, get_const_int +from ..utils import simplify, get_const_int def binarize_pack(data, axis=None, name="PackedInput"): diff --git a/python/tvm/topi/nn/conv1d.py b/python/tvm/topi/nn/conv1d.py index cffed669f20a..8fdf3f8918ee 100644 --- a/python/tvm/topi/nn/conv1d.py +++ b/python/tvm/topi/nn/conv1d.py @@ -18,8 +18,8 @@ """1D convolution operators.""" from tvm import te from .pad import pad -from ..util import simplify -from .util import get_pad_tuple1d +from ..utils import simplify +from .utils import get_pad_tuple1d def conv1d(data, kernel, strides=1, padding="VALID", dilation=1, layout="NCW", out_dtype=None): diff --git a/python/tvm/topi/nn/conv1d_transpose.py b/python/tvm/topi/nn/conv1d_transpose.py index 813377e76ca6..6f040409f47c 100644 --- a/python/tvm/topi/nn/conv1d_transpose.py +++ b/python/tvm/topi/nn/conv1d_transpose.py @@ -19,8 +19,8 @@ from tvm import te from .dilate import dilate from .pad import pad -from ..util import simplify -from .util import get_pad_tuple1d +from ..utils import simplify +from .utils import get_pad_tuple1d def conv1d_transpose_ncw(data, kernel, stride, padding, out_dtype, output_padding): diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index c0e941c4acc7..2e147fc148de 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -23,8 +23,8 @@ from tvm import te from .pad import pad -from .util import get_pad_tuple -from ..util import simplify, get_const_tuple, get_const_int, tag +from .utils import get_pad_tuple +from ..utils import simplify, get_const_tuple, get_const_int, tag from .winograd_util import winograd_transform_matrices # workload description of conv2d diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py index f67f9c9c8a5a..22188bcd45a4 100644 --- a/python/tvm/topi/nn/conv2d_transpose.py +++ b/python/tvm/topi/nn/conv2d_transpose.py @@ -21,8 +21,8 @@ from tvm import relay from .dilate import dilate from .pad import pad -from .util import get_pad_tuple -from ..util import simplify +from .utils import get_pad_tuple +from ..utils import simplify def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype, output_padding): diff --git a/python/tvm/topi/nn/conv3d.py b/python/tvm/topi/nn/conv3d.py index 1696ac663e95..f3cda2896f63 100644 --- a/python/tvm/topi/nn/conv3d.py +++ b/python/tvm/topi/nn/conv3d.py @@ -21,8 +21,8 @@ from tvm import te from .pad import pad -from .util import get_pad_tuple3d -from ..util import simplify, get_const_tuple +from .utils import get_pad_tuple3d +from ..utils import simplify, get_const_tuple from .winograd_util import winograd_transform_matrices diff --git a/python/tvm/topi/nn/conv3d_transpose.py b/python/tvm/topi/nn/conv3d_transpose.py index 9a8828f7cbbd..9f5c01a1fc3b 100644 --- a/python/tvm/topi/nn/conv3d_transpose.py +++ b/python/tvm/topi/nn/conv3d_transpose.py @@ -21,8 +21,8 @@ from tvm import relay from .dilate import dilate from .pad import pad -from .util import get_pad_tuple3d -from ..util import simplify +from .utils import get_pad_tuple3d +from ..utils import simplify def conv3d_transpose_ncdhw(Input, Filter, strides, padding, out_dtype, output_padding): diff --git a/python/tvm/topi/nn/correlation.py b/python/tvm/topi/nn/correlation.py index 583002e7fc83..d7d650cd965e 100644 --- a/python/tvm/topi/nn/correlation.py +++ b/python/tvm/topi/nn/correlation.py @@ -18,7 +18,7 @@ from tvm import te from .pad import pad -from ..util import get_const_tuple +from ..utils import get_const_tuple 
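The nn/ hunks above also pick up `simplify` from the renamed module. A minimal sketch (not part of the patch; sizes are illustrative) of the output-extent arithmetic these convolution operators do with it:

from tvm import te
from tvm.topi.utils import simplify  # previously: from ..util import simplify

in_w = te.var("in_w")
kernel_w, stride, pad = 3, 1, 1
# The usual conv1d/conv2d output-extent formula, folded by simplify.
out_w = simplify((in_w - kernel_w + 2 * pad) // stride + 1)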

 def correlation_nchw(
diff --git a/python/tvm/topi/nn/deformable_conv2d.py b/python/tvm/topi/nn/deformable_conv2d.py
index 3d2b7ce3f14e..a8c2745b1c77 100644
--- a/python/tvm/topi/nn/deformable_conv2d.py
+++ b/python/tvm/topi/nn/deformable_conv2d.py
@@ -19,9 +19,9 @@
 import tvm
 from tvm import te

-from .util import get_pad_tuple
-from ..util import get_const_tuple
-from ..cpp.util import bilinear_sample_nchw
+from .utils import get_pad_tuple
+from ..utils import get_const_tuple
+from ..cpp.utils import bilinear_sample_nchw


 def deformable_conv2d_nchw(
diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py
index c863a157025b..72356821770d 100644
--- a/python/tvm/topi/nn/depthwise_conv2d.py
+++ b/python/tvm/topi/nn/depthwise_conv2d.py
@@ -23,8 +23,8 @@
 from .dilate import dilate
 from .pad import pad
-from .util import get_pad_tuple
-from ..util import simplify
+from .utils import get_pad_tuple
+from ..utils import simplify

 # workload description of depthwise-conv2d
 Workload = namedtuple(
diff --git a/python/tvm/topi/nn/dilate.py b/python/tvm/topi/nn/dilate.py
index 6980fea58173..6b2222e4a779 100644
--- a/python/tvm/topi/nn/dilate.py
+++ b/python/tvm/topi/nn/dilate.py
@@ -18,7 +18,7 @@
 """Dilation operators"""
 import tvm
 from tvm import te
-from .. import util
+from .. import utils
 from .. import tag

@@ -57,7 +57,7 @@ def _dilate(*indices):
         idxdiv = tvm.tir.indexdiv
         idxmod = tvm.tir.indexmod
         for i in range(n):
-            if not util.equal_const_int(strides[i], 1):
+            if not utils.equal_const_int(strides[i], 1):
                 index_tuple.append(idxdiv(indices[i], strides[i]))
                 not_zero.append(idxmod(indices[i], strides[i]).equal(0))
             else:
diff --git a/python/tvm/topi/nn/elemwise.py b/python/tvm/topi/nn/elemwise.py
index 03fffc76ab99..a80047d900f3 100644
--- a/python/tvm/topi/nn/elemwise.py
+++ b/python/tvm/topi/nn/elemwise.py
@@ -19,7 +19,7 @@
 import tvm
 from tvm import te
 from .. import tag
-from ..util import get_const_int
+from ..utils import get_const_int


 @tvm.te.tag_scope(tag=tag.ELEMWISE)
diff --git a/python/tvm/topi/nn/pad.py b/python/tvm/topi/nn/pad.py
index ec20ef6f0a13..78e41b5af92a 100644
--- a/python/tvm/topi/nn/pad.py
+++ b/python/tvm/topi/nn/pad.py
@@ -18,7 +18,7 @@
 from __future__ import absolute_import as _abs
 import tvm
 from tvm import te
-from ..util import equal_const_int
+from ..utils import equal_const_int
 from .. import tag

diff --git a/python/tvm/topi/nn/sparse.py b/python/tvm/topi/nn/sparse.py
index 74a9ad5fd650..55b3e6a7d1e5 100644
--- a/python/tvm/topi/nn/sparse.py
+++ b/python/tvm/topi/nn/sparse.py
@@ -20,7 +20,7 @@
 import tvm
 from tvm import te

-from ..util import get_const_tuple
+from ..utils import get_const_tuple


 def sparse_dense(data, weight_data, weight_indices, weight_indptr):
diff --git a/python/tvm/topi/nn/upsampling.py b/python/tvm/topi/nn/upsampling.py
index b390b803b516..b95835f6e103 100644
--- a/python/tvm/topi/nn/upsampling.py
+++ b/python/tvm/topi/nn/upsampling.py
@@ -17,7 +17,7 @@
 """TVM operator upsampling compute."""
 from tvm import topi
 from tvm import te
-from ..util import simplify
+from ..utils import simplify


 def upsampling(
diff --git a/python/tvm/topi/nn/util.py b/python/tvm/topi/nn/utils.py
similarity index 99%
rename from python/tvm/topi/nn/util.py
rename to python/tvm/topi/nn/utils.py
index 0894656dcf54..ff00441e9850 100644
--- a/python/tvm/topi/nn/util.py
+++ b/python/tvm/topi/nn/utils.py
@@ -19,7 +19,7 @@
 from __future__ import absolute_import
 import tvm
-from ..util import get_const_int
+from ..utils import get_const_int


 def infer_pad(data, data_pad):
diff --git a/python/tvm/topi/nn/winograd_util.py b/python/tvm/topi/nn/winograd_util.py
index d43586dbd0d7..c0f7097a6315 100644
--- a/python/tvm/topi/nn/winograd_util.py
+++ b/python/tvm/topi/nn/winograd_util.py
@@ -26,7 +26,7 @@
 from functools import reduce
 import numpy as np
 from tvm.contrib.pickle_memoize import memoize
-from ..util import const_matrix
+from ..utils import const_matrix

 # pylint: disable=invalid-name
diff --git a/python/tvm/topi/rocm/batch_matmul.py b/python/tvm/topi/rocm/batch_matmul.py
index fa4dd457f3ed..7f35f4b55620 100644
--- a/python/tvm/topi/rocm/batch_matmul.py
+++ b/python/tvm/topi/rocm/batch_matmul.py
@@ -19,7 +19,7 @@
 from tvm import autotvm
 from tvm.contrib import rocblas
 from .. import generic
-from ..util import get_const_tuple
+from ..utils import get_const_tuple


 @autotvm.register_topi_compute("batch_matmul_rocblas.rocm")
diff --git a/python/tvm/topi/rocm/conv2d.py b/python/tvm/topi/rocm/conv2d.py
index 0857d093535c..fac77f02b456 100644
--- a/python/tvm/topi/rocm/conv2d.py
+++ b/python/tvm/topi/rocm/conv2d.py
@@ -20,8 +20,8 @@
 from tvm.contrib import miopen

 from .. import generic
-from ..util import get_const_tuple
-from ..nn.util import get_pad_tuple
+from ..utils import get_const_tuple
+from ..nn.utils import get_pad_tuple


 @autotvm.register_topi_compute("conv2d_nchw_miopen.rocm")
diff --git a/python/tvm/topi/rocm/dense.py b/python/tvm/topi/rocm/dense.py
index 4a771c602f59..2f3ce77cc7ba 100644
--- a/python/tvm/topi/rocm/dense.py
+++ b/python/tvm/topi/rocm/dense.py
@@ -21,7 +21,7 @@
 from tvm.contrib import rocblas
 from .. import generic, nn
 from .. import tag
-from ..util import traverse_inline
+from ..utils import traverse_inline


 @autotvm.register_topi_compute("dense.rocm")
diff --git a/python/tvm/topi/sort.py b/python/tvm/topi/sort.py
index 86e2bad591d9..98a1080660fb 100644
--- a/python/tvm/topi/sort.py
+++ b/python/tvm/topi/sort.py
@@ -18,7 +18,7 @@
 """Argsort operator"""
 import tvm
 from tvm import te
-from .util import get_const_tuple
+from .utils import get_const_tuple


 def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
diff --git a/python/tvm/topi/sparse/csrmm.py b/python/tvm/topi/sparse/csrmm.py
index 954f9dd955f0..f578e6001351 100644
--- a/python/tvm/topi/sparse/csrmm.py
+++ b/python/tvm/topi/sparse/csrmm.py
@@ -19,7 +19,7 @@
 import tvm
 from tvm import te
 from .. import tag
-from ..util import simplify
+from ..utils import simplify


 def csrmm_default(data, indices, indptr, weight, bias=None):
diff --git a/python/tvm/topi/sparse/dense.py b/python/tvm/topi/sparse/dense.py
index d86f5dd4bfce..d1516d0c20fc 100644
--- a/python/tvm/topi/sparse/dense.py
+++ b/python/tvm/topi/sparse/dense.py
@@ -19,7 +19,7 @@
 import tvm
 from tvm import te
 from .. import tag
-from ..util import simplify
+from ..utils import simplify


 def dense_si(data, indices, indptr, weight, bias=None):
diff --git a/python/tvm/topi/testing/bilinear_resize_python.py b/python/tvm/topi/testing/bilinear_resize_python.py
index 8d78f13ddf76..844546e0643f 100644
--- a/python/tvm/topi/testing/bilinear_resize_python.py
+++ b/python/tvm/topi/testing/bilinear_resize_python.py
@@ -18,7 +18,7 @@
 """Bilinear Scale in python"""
 import math
 import numpy as np
-from tvm.topi.util import nchw_pack_layout
+from tvm.topi.utils import nchw_pack_layout


 def bilinear_resize_python(image, out_size, layout, coordinate_transformation_mode="align_corners"):
diff --git a/python/tvm/topi/testing/conv1d_ncw_python.py b/python/tvm/topi/testing/conv1d_ncw_python.py
index 1405adb54f46..190e1c664610 100644
--- a/python/tvm/topi/testing/conv1d_ncw_python.py
+++ b/python/tvm/topi/testing/conv1d_ncw_python.py
@@ -17,7 +17,7 @@
 # pylint: disable=unused-variable, invalid-name
 """1D convolution in python"""
 import numpy as np
-from tvm.topi.nn.util import get_pad_tuple1d
+from tvm.topi.nn.utils import get_pad_tuple1d


 def dilate_np(x, dilation):
diff --git a/python/tvm/topi/testing/conv1d_transpose_ncw_python.py b/python/tvm/topi/testing/conv1d_transpose_ncw_python.py
index 3a1bc61419ff..85e1410c0cd8 100644
--- a/python/tvm/topi/testing/conv1d_transpose_ncw_python.py
+++ b/python/tvm/topi/testing/conv1d_transpose_ncw_python.py
@@ -19,7 +19,7 @@
 import numpy as np
 import scipy
 import tvm.topi.testing
-from tvm.topi.nn.util import get_pad_tuple1d
+from tvm.topi.nn.utils import get_pad_tuple1d


 def conv1d_transpose_ncw_python(a_np, w_np, stride, padding, output_padding):
diff --git a/python/tvm/topi/testing/conv2d_hwcn_python.py b/python/tvm/topi/testing/conv2d_hwcn_python.py
index 9a06edd82c3b..9ee66df51541 100644
--- a/python/tvm/topi/testing/conv2d_hwcn_python.py
+++ b/python/tvm/topi/testing/conv2d_hwcn_python.py
@@ -18,7 +18,7 @@
 """Convolution in python"""
 import numpy as np
 import scipy.signal
-from tvm.topi.nn.util import get_pad_tuple
+from tvm.topi.nn.utils import get_pad_tuple


 def conv2d_hwcn_python(a_np, w_np, stride, padding):
diff --git a/python/tvm/topi/testing/conv2d_nchw_python.py b/python/tvm/topi/testing/conv2d_nchw_python.py
index 38bed4a00c49..ce5d981cc651 100644
--- a/python/tvm/topi/testing/conv2d_nchw_python.py
+++ b/python/tvm/topi/testing/conv2d_nchw_python.py
@@ -18,7 +18,7 @@
 """Convolution in python"""
 import numpy as np
 import scipy.signal
-from tvm.topi.nn.util import get_pad_tuple
+from tvm.topi.nn.utils import get_pad_tuple


 def _conv2d_nchw_python(a_np, w_np, stride, padding):
diff --git a/python/tvm/topi/testing/conv2d_nhwc_python.py b/python/tvm/topi/testing/conv2d_nhwc_python.py
index 136fb6b8834a..68ef8c1b283e 100644
--- a/python/tvm/topi/testing/conv2d_nhwc_python.py
+++ b/python/tvm/topi/testing/conv2d_nhwc_python.py
@@ -18,7 +18,7 @@
 """Convolution in python"""
 import numpy as np
 import scipy.signal
-from tvm.topi.nn.util import get_pad_tuple
+from tvm.topi.nn.utils import get_pad_tuple


 def _conv2d_nhwc_python(a_np, w_np, stride, padding):
diff --git a/python/tvm/topi/testing/conv2d_transpose_python.py b/python/tvm/topi/testing/conv2d_transpose_python.py
index 04e60a71dd60..c7c0d9f2529a 100644
--- a/python/tvm/topi/testing/conv2d_transpose_python.py
+++ b/python/tvm/topi/testing/conv2d_transpose_python.py
@@ -19,7 +19,7 @@
 import numpy as np
 import scipy
 import tvm.topi.testing
-from tvm.topi.nn.util import get_pad_tuple
+from tvm.topi.nn.utils import get_pad_tuple


 def conv2d_transpose_nchw_python(a_np, w_np, stride, padding, output_padding):
diff --git a/python/tvm/topi/testing/conv3d_ncdhw_python.py b/python/tvm/topi/testing/conv3d_ncdhw_python.py
index 11b0e2351288..a10d9ed42d1c 100644
--- a/python/tvm/topi/testing/conv3d_ncdhw_python.py
+++ b/python/tvm/topi/testing/conv3d_ncdhw_python.py
@@ -18,7 +18,7 @@
 """Convolution 3D in python"""
 import numpy as np
 import scipy.signal
-from tvm.topi.nn.util import get_pad_tuple3d
+from tvm.topi.nn.utils import get_pad_tuple3d


 def _conv3d_ncdhw_python(a_np, w_np, stride, padding):
diff --git a/python/tvm/topi/testing/conv3d_ndhwc_python.py b/python/tvm/topi/testing/conv3d_ndhwc_python.py
index 52974d488a71..46f04f630863 100644
--- a/python/tvm/topi/testing/conv3d_ndhwc_python.py
+++ b/python/tvm/topi/testing/conv3d_ndhwc_python.py
@@ -18,7 +18,7 @@
 """Convolution 3D in python"""
 import numpy as np
 import scipy.signal
-from tvm.topi.nn.util import get_pad_tuple3d
+from tvm.topi.nn.utils import get_pad_tuple3d


 def conv3d_ndhwc_python(a_np, w_np, stride, padding):
diff --git a/python/tvm/topi/testing/conv3d_transpose_ncdhw_python.py b/python/tvm/topi/testing/conv3d_transpose_ncdhw_python.py
index 779371af9895..38b8bc51bc70 100644
--- a/python/tvm/topi/testing/conv3d_transpose_ncdhw_python.py
+++ b/python/tvm/topi/testing/conv3d_transpose_ncdhw_python.py
@@ -18,7 +18,7 @@
 """Convolution 3D transpose in python"""
 import numpy as np
 import tvm.topi.testing
-from tvm.topi.nn.util import get_pad_tuple3d
+from tvm.topi.nn.utils import get_pad_tuple3d


 def conv3d_transpose_ncdhw_python(a_np, w_np, stride, padding, output_padding):
diff --git a/python/tvm/topi/testing/deformable_conv2d_nchw_python.py b/python/tvm/topi/testing/deformable_conv2d_nchw_python.py
index cc66c5f12906..6a7afb4b96f3 100644
--- a/python/tvm/topi/testing/deformable_conv2d_nchw_python.py
+++ b/python/tvm/topi/testing/deformable_conv2d_nchw_python.py
@@ -18,7 +18,7 @@
 """Deformable convolution in python"""
 import itertools
 import numpy as np
-from tvm.topi.nn.util import get_pad_tuple
+from tvm.topi.nn.utils import get_pad_tuple


 def deformable_conv2d_nchw_python(
diff --git a/python/tvm/topi/testing/upsampling_python.py b/python/tvm/topi/testing/upsampling_python.py
index 203e804d3338..7f48aa47b8d1 100644
--- a/python/tvm/topi/testing/upsampling_python.py
+++ b/python/tvm/topi/testing/upsampling_python.py
@@ -18,7 +18,7 @@
"""Upsampling in python""" import math import numpy as np -from tvm.topi.util import nchw_pack_layout +from tvm.topi.utils import nchw_pack_layout def upsample_nearest(arr, scale): diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py index c4e51a8858d1..b4a7d1c414da 100644 --- a/python/tvm/topi/transform.py +++ b/python/tvm/topi/transform.py @@ -22,7 +22,7 @@ from tvm import topi from . import cpp from . import tag -from .util import within_index, make_idx +from .utils import within_index, make_idx def expand_dims(a, axis, num_newaxis=1): diff --git a/python/tvm/topi/util.py b/python/tvm/topi/utils.py similarity index 99% rename from python/tvm/topi/util.py rename to python/tvm/topi/utils.py index 0a5c93c632de..ea08f3a94fad 100644 --- a/python/tvm/topi/util.py +++ b/python/tvm/topi/utils.py @@ -450,4 +450,4 @@ def is_empty_shape(shape): is_empty: bool Whether input shape is empty or has dimesion with size 0. """ - return cpp.util.is_empty_shape(shape) + return cpp.utils.is_empty_shape(shape) diff --git a/python/tvm/topi/vision/rcnn/proposal.py b/python/tvm/topi/vision/rcnn/proposal.py index cda7522321fe..89726efd5d0e 100644 --- a/python/tvm/topi/vision/rcnn/proposal.py +++ b/python/tvm/topi/vision/rcnn/proposal.py @@ -19,7 +19,7 @@ import math import tvm from tvm import te -from ...util import get_const_tuple, get_const_int +from ...utils import get_const_tuple, get_const_int from ...sort import argsort diff --git a/python/tvm/topi/vision/rcnn/roi_align.py b/python/tvm/topi/vision/rcnn/roi_align.py index eafdc21089bd..a51ba33a6c45 100644 --- a/python/tvm/topi/vision/rcnn/roi_align.py +++ b/python/tvm/topi/vision/rcnn/roi_align.py @@ -18,8 +18,8 @@ """Roi align operator""" import tvm from tvm import te -from ...util import get_const_tuple -from ...cpp.util import bilinear_sample_nchw +from ...utils import get_const_tuple +from ...cpp.utils import bilinear_sample_nchw def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): diff --git a/python/tvm/topi/vision/rcnn/roi_pool.py b/python/tvm/topi/vision/rcnn/roi_pool.py index 2254b74f4bdf..dd1429bcb3c5 100644 --- a/python/tvm/topi/vision/rcnn/roi_pool.py +++ b/python/tvm/topi/vision/rcnn/roi_pool.py @@ -18,7 +18,7 @@ """ROI pool operator""" import tvm from tvm import te -from ...util import get_const_tuple +from ...utils import get_const_tuple def roi_pool_nchw(data, rois, pooled_size, spatial_scale): diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index 100bdf205165..166c79a4c93b 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ b/python/tvm/topi/x86/batch_matmul.py @@ -21,7 +21,7 @@ from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cblas, mkl from .. import generic -from ..util import traverse_inline, get_const_tuple, get_max_power2_factor +from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor @autotvm.register_topi_compute("batch_matmul.x86") diff --git a/python/tvm/topi/x86/bitserial_conv2d.py b/python/tvm/topi/x86/bitserial_conv2d.py index 5fcc9e119c4e..18f305094754 100644 --- a/python/tvm/topi/x86/bitserial_conv2d.py +++ b/python/tvm/topi/x86/bitserial_conv2d.py @@ -20,9 +20,9 @@ from tvm import te from tvm import autotvm from .. 
import tag -from ..util import get_const_int, get_const_tuple +from ..utils import get_const_int, get_const_tuple from ..nn.pad import pad -from ..nn.util import get_pad_tuple +from ..nn.utils import get_pad_tuple from ..nn.bitserial_util import bitpack, binary_op_multiplier diff --git a/python/tvm/topi/x86/bitserial_dense.py b/python/tvm/topi/x86/bitserial_dense.py index e9546ac1ee2e..7af18f602234 100644 --- a/python/tvm/topi/x86/bitserial_dense.py +++ b/python/tvm/topi/x86/bitserial_dense.py @@ -20,7 +20,7 @@ import tvm from tvm import te from tvm import autotvm -from tvm.topi.util import get_const_int, get_const_tuple +from tvm.topi.utils import get_const_int, get_const_tuple from .. import tag from ..nn.bitserial_util import bitpack, binary_op_multiplier diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index a52e27a42755..7b9da8a90ede 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -27,8 +27,8 @@ from ..nn.conv2d import conv2d_infer_layout, _get_workload as _get_conv2d_workload from ..nn.conv2d import unpack_NCHWc_to_nchw from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload -from ..nn.util import get_pad_tuple -from ..util import get_const_tuple, traverse_inline +from ..nn.utils import get_pad_tuple +from ..utils import get_const_tuple, traverse_inline from . import conv2d_avx_1x1, conv2d_avx_common logger = logging.getLogger("topi") diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py index 1c90841eb2de..db3c232b6a7f 100644 --- a/python/tvm/topi/x86/conv2d_alter_op.py +++ b/python/tvm/topi/x86/conv2d_alter_op.py @@ -26,9 +26,9 @@ from tvm import autotvm from .conv2d import _get_default_config from .conv2d_int8 import is_int8_hw_support, _get_default_config_int8 -from ..util import get_const_tuple +from ..utils import get_const_tuple from ..nn import conv2d_legalize, conv2d_alter_layout -from ..nn.util import get_pad_tuple +from ..nn.utils import get_pad_tuple logger = logging.getLogger("topi") diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 8ca20be262bc..b4a966e1db13 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -22,11 +22,11 @@ from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..nn.pad import pad -from ..nn.util import get_pad_tuple +from ..nn.utils import get_pad_tuple from ..generic import conv2d as conv2d_generic -from ..util import get_const_tuple, simplify +from ..utils import get_const_tuple, simplify from .tensor_intrin import dot_16x1x16_uint8_int8_int32 -from .util import get_fp32_len +from .utils import get_fp32_len def _fallback_schedule(cfg, wkl): diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 28a698c342d8..8d707445be05 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -20,9 +20,9 @@ from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..generic import conv2d as conv2d_generic -from ..util import get_const_tuple +from ..utils import get_const_tuple from .tensor_intrin import dot_16x1x16_uint8_int8_int32 -from .util import get_fp32_len +from .utils import get_fp32_len def _fallback_schedule(cfg, wkl): diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py index e2862ec11ef7..905ada68f277 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ 
b/python/tvm/topi/x86/conv2d_int8.py @@ -24,10 +24,10 @@ from ..nn.conv2d import _get_workload as _get_conv2d_workload from .. import tag from ..generic import conv2d as conv2d_generic -from ..nn.util import get_pad_tuple +from ..nn.utils import get_pad_tuple from ..nn.conv2d import unpack_NCHWc_to_nchw from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload -from ..util import get_const_tuple, traverse_inline +from ..utils import get_const_tuple, traverse_inline from .. import nn from . import conv2d_avx_1x1, conv2d_avx_common diff --git a/python/tvm/topi/x86/conv2d_transpose.py b/python/tvm/topi/x86/conv2d_transpose.py index 105c45526085..865b62bb3e87 100644 --- a/python/tvm/topi/x86/conv2d_transpose.py +++ b/python/tvm/topi/x86/conv2d_transpose.py @@ -17,7 +17,7 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Conv2D Transpose schedule on x86""" from tvm import te -from ..util import traverse_inline +from ..utils import traverse_inline from .. import nn from .conv2d import conv2d_nchw, schedule_conv2d_nchw diff --git a/python/tvm/topi/x86/conv3d.py b/python/tvm/topi/x86/conv3d.py index 479a27b296a4..cb202f5257af 100644 --- a/python/tvm/topi/x86/conv3d.py +++ b/python/tvm/topi/x86/conv3d.py @@ -22,11 +22,11 @@ from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from ..util import traverse_inline -from ..nn.util import get_pad_tuple3d, infer_pad3d +from ..utils import traverse_inline +from ..nn.utils import get_pad_tuple3d, infer_pad3d from ..nn.pad import pad -from ..util import get_const_tuple, simplify, get_const_int -from .util import get_fp32_len +from ..utils import get_const_tuple, simplify, get_const_int +from .utils import get_fp32_len Workload3D = namedtuple( "Workload", diff --git a/python/tvm/topi/x86/conv3d_transpose.py b/python/tvm/topi/x86/conv3d_transpose.py index f986ccfaa3d9..e743f02fb063 100644 --- a/python/tvm/topi/x86/conv3d_transpose.py +++ b/python/tvm/topi/x86/conv3d_transpose.py @@ -19,7 +19,7 @@ """Conv3D Transpose schedule on x86""" from tvm import te -from ..util import traverse_inline +from ..utils import traverse_inline from .. import nn from .conv3d import conv3d_ncdhw, schedule_conv3d_ncdhw diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index b0cc71acb232..15d7a1a310d6 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -25,9 +25,9 @@ from tvm.contrib import mkl from tvm.contrib import mkldnn -from .util import get_fp32_len +from .utils import get_fp32_len from .. 
import generic, tag -from ..util import traverse_inline, get_const_tuple +from ..utils import traverse_inline, get_const_tuple def _schedule_dense_pack_template(cfg, s, C): diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index 1921f7fecf29..badba1a248e9 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -22,12 +22,12 @@ from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..nn.pad import pad -from ..util import get_const_tuple -from ..nn.util import get_pad_tuple +from ..utils import get_const_tuple +from ..nn.utils import get_pad_tuple from ..nn.depthwise_conv2d import _get_workload, depthwise_conv2d_infer_layout from ..nn.conv2d import unpack_NCHWc_to_nchw -from ..util import traverse_inline -from .util import get_fp32_len +from ..utils import traverse_inline +from .utils import get_fp32_len def _fallback_schedule(cfg, wkl): diff --git a/python/tvm/topi/x86/injective.py b/python/tvm/topi/x86/injective.py index a5e521b8eb20..29f903fd4e35 100644 --- a/python/tvm/topi/x86/injective.py +++ b/python/tvm/topi/x86/injective.py @@ -17,7 +17,7 @@ # pylint: disable=invalid-name """x86 declaration and schedules.""" from tvm import te -from ..util import is_empty_shape +from ..utils import is_empty_shape def schedule_injective_from_existing(sch, out): diff --git a/python/tvm/topi/x86/reduction.py b/python/tvm/topi/x86/reduction.py index 69659de7d8a9..db3ea81b7358 100644 --- a/python/tvm/topi/x86/reduction.py +++ b/python/tvm/topi/x86/reduction.py @@ -20,7 +20,7 @@ from tvm import te from .injective import schedule_injective_from_existing from .. import tag -from ..util import get_const_tuple +from ..utils import get_const_tuple def _schedule_reduce(sch, op, is_idx_reduce=False): diff --git a/python/tvm/topi/x86/roi_align.py b/python/tvm/topi/x86/roi_align.py index baa23ad2a135..ac2146b558f9 100644 --- a/python/tvm/topi/x86/roi_align.py +++ b/python/tvm/topi/x86/roi_align.py @@ -21,7 +21,7 @@ from tvm.te import hybrid from ..tensor import full -from ..util import get_const_tuple +from ..utils import get_const_tuple @hybrid.script diff --git a/python/tvm/topi/x86/sparse.py b/python/tvm/topi/x86/sparse.py index 8c4a387b8052..b6291083c8c1 100644 --- a/python/tvm/topi/x86/sparse.py +++ b/python/tvm/topi/x86/sparse.py @@ -18,8 +18,8 @@ """sparse_dense schedule on x86""" from tvm import te -from ..util import traverse_inline, get_const_int -from .util import get_fp32_len +from ..utils import traverse_inline, get_const_int +from .utils import get_fp32_len def schedule_sparse_dense(outs): diff --git a/python/tvm/topi/x86/util.py b/python/tvm/topi/x86/utils.py similarity index 100% rename from python/tvm/topi/x86/util.py rename to python/tvm/topi/x86/utils.py diff --git a/src/topi/schedule.cc b/src/topi/schedule.cc index 83457ced9f16..c315d40be277 100644 --- a/src/topi/schedule.cc +++ b/src/topi/schedule.cc @@ -182,11 +182,11 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_lrn").set_body([](TVMArgs args, TVMRetVa }); /* Utility functions */ -TVM_REGISTER_GLOBAL("topi.util.is_empty_shape").set_body([](TVMArgs args, TVMRetValue* rv) { +TVM_REGISTER_GLOBAL("topi.utils.is_empty_shape").set_body([](TVMArgs args, TVMRetValue* rv) { *rv = topi::detail::is_empty_shape(args[0]); }); -TVM_REGISTER_GLOBAL("topi.util.bilinear_sample_nchw").set_body([](TVMArgs args, TVMRetValue* rv) { +TVM_REGISTER_GLOBAL("topi.utils.bilinear_sample_nchw").set_body([](TVMArgs args, TVMRetValue* rv) { *rv = 
detail::bilinear_sample_nchw(args[0], args[1], args[2], args[3]); }); diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py index 67943ce2896f..a5fff781ddef 100644 --- a/tests/micro/qemu/test_zephyr.py +++ b/tests/micro/qemu/test_zephyr.py @@ -30,7 +30,7 @@ import tvm.relay from tvm.micro.contrib import zephyr -from tvm.contrib import util +from tvm.contrib import utils BUILD = True DEBUG = False @@ -93,7 +93,7 @@ def _make_session(mod): os.unlink(prev_build) session_kw["binary"].archive(prev_build, metadata_only=True) else: - unarchive_dir = util.tempdir() + unarchive_dir = utils.tempdir() session_kw["binary"] = tvm.micro.MicroBinary.unarchive( prev_build, unarchive_dir.relpath("binary") ) diff --git a/tests/micro/test_runtime_micro_on_arm.py b/tests/micro/test_runtime_micro_on_arm.py index cc4066d5adda..45ca8e74323c 100644 --- a/tests/micro/test_runtime_micro_on_arm.py +++ b/tests/micro/test_runtime_micro_on_arm.py @@ -19,7 +19,7 @@ import numpy as np import tvm from tvm import te -from tvm.contrib import graph_runtime, util +from tvm.contrib import graph_runtime, utils from tvm import relay import tvm.micro as micro from tvm.micro import create_micro_mod diff --git a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py index 6c2897b1b561..9861a1c39740 100644 --- a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py +++ b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py @@ -20,7 +20,7 @@ import tvm.testing from tvm import te from tvm import topi -from tvm.contrib import util, clang +from tvm.contrib import utils, clang import numpy as np import ctypes import math @@ -86,7 +86,7 @@ def check_llvm(use_file): if not clang.find_clang(required=False): print("skip because clang is not available") return - temp = util.tempdir() + temp = utils.tempdir() ll_path = temp.relpath("temp.ll") ll_code = clang.create_llvm(cc_code, output=ll_path) s = te.create_schedule(B.op) diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py index 42b111bcad2e..0e444809b014 100644 --- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py +++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py @@ -26,7 +26,7 @@ from tvm import rpc from tvm.contrib import graph_runtime from tvm.relay.op.contrib import arm_compute_lib -from tvm.contrib import util +from tvm.contrib import utils from tvm.autotvm.measure import request_remote @@ -226,7 +226,7 @@ def build_and_run( def update_lib(lib, device, cross_compile): """Export the library to the remote/local device.""" lib_name = "mod.so" - temp = util.tempdir() + temp = utils.tempdir() lib_path = temp.relpath(lib_name) if cross_compile: lib.export_library(lib_path, cc=cross_compile) diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutils.py similarity index 96% rename from tests/python/contrib/test_binutil.py rename to tests/python/contrib/test_binutils.py index 83b220f03b10..f0aa2d157aed 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutils.py @@ -26,9 +26,9 @@ import tvm from tvm import te import subprocess -from tvm.contrib import util +from tvm.contrib import utils from tvm.contrib import cc -from tvm.contrib.binutil import * +from tvm.contrib.binutils import * TOOLCHAIN_PREFIX = "" @@ -39,7 +39,7 @@ def make_binary(): int b = 5; \ return 0; \ 
}" - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() tmp_source = tmp_dir.relpath("source.c") tmp_obj = tmp_dir.relpath("obj.obj") with open(tmp_source, "w") as f: @@ -52,7 +52,7 @@ def make_binary(): def test_tvm_callback_get_section_size(binary=None): if binary is None: binary = make_binary() - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: f.write(binary) @@ -76,7 +76,7 @@ def verify(): def test_tvm_callback_relocate_binary(): binary = make_binary() - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: f.write(binary) @@ -133,7 +133,7 @@ def verify(): def test_tvm_callback_get_symbol_map(): binary = make_binary() - tmp_dir = util.tempdir() + tmp_dir = utils.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: f.write(binary) diff --git a/tests/python/contrib/test_coreml_runtime.py b/tests/python/contrib/test_coreml_runtime.py index f6b9d9eb58e4..c0076d6eb12f 100644 --- a/tests/python/contrib/test_coreml_runtime.py +++ b/tests/python/contrib/test_coreml_runtime.py @@ -18,7 +18,7 @@ from tvm import te import numpy as np from tvm import rpc -from tvm.contrib import util, xcode, coreml_runtime +from tvm.contrib import utils, xcode, coreml_runtime import pytest import os @@ -82,7 +82,7 @@ def verify(coreml_model, model_path, ctx): np.testing.assert_almost_equal(c_out, t_out, 3) def check_remote(coreml_model): - temp = util.tempdir() + temp = utils.tempdir() compiled_model = xcode.compile_coreml(coreml_model, out_dir=temp.temp_dir) xcode.popen_test_rpc( proxy_host, proxy_port, key, destination=destination, libs=[compiled_model] @@ -93,7 +93,7 @@ def check_remote(coreml_model): verify(coreml_model, compiled_model, ctx) def check_local(coreml_model): - temp = util.tempdir() + temp = utils.tempdir() compiled_model = xcode.compile_coreml(coreml_model, out_dir=temp.temp_dir) ctx = tvm.cpu(0) verify(coreml_model, compiled_model, ctx) diff --git a/tests/python/contrib/test_edgetpu_runtime.py b/tests/python/contrib/test_edgetpu_runtime.py index eef9e0b06349..8c6113cee3d4 100644 --- a/tests/python/contrib/test_edgetpu_runtime.py +++ b/tests/python/contrib/test_edgetpu_runtime.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import rpc -from tvm.contrib import util, tflite_runtime +from tvm.contrib import utils, tflite_runtime # import tflite_runtime.interpreter as tflite diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index 8f07a372ab6b..905d066ce7a3 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -20,7 +20,7 @@ from __future__ import absolute_import, print_function import tvm from tvm import relay -from tvm.contrib import util, graph_runtime, download +from tvm.contrib import utils, graph_runtime, download from hashlib import md5 from itertools import zip_longest, combinations import numpy as np @@ -61,7 +61,7 @@ def assert_lib_hash(lib, golden): if isinstance(golden, str): golden = {golden} - temp = util.tempdir() + temp = utils.tempdir() path = temp.relpath("lib.cmm") hash_set = set() for mod in lib.imported_modules: @@ -207,7 +207,7 @@ def run(lib, inputs, outputs, npu=True): """ # Export and load lib to confirm this works lib_name = "mod.so" - temp = util.tempdir() + temp = utils.tempdir() lib_path = temp.relpath(lib_name) lib.export_library(lib_path) lib = 
tvm.runtime.load_module(lib_path) diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py index a92b9301f7cc..bcb4358596b2 100644 --- a/tests/python/contrib/test_nnpack.py +++ b/tests/python/contrib/test_nnpack.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np import scipy.signal -from tvm.topi.nn.util import get_pad_tuple +from tvm.topi.nn.utils import get_pad_tuple from tvm.contrib import nnpack import pytest diff --git a/tests/python/contrib/test_tflite_runtime.py b/tests/python/contrib/test_tflite_runtime.py index c24747db8238..39d8881f4040 100644 --- a/tests/python/contrib/test_tflite_runtime.py +++ b/tests/python/contrib/test_tflite_runtime.py @@ -20,7 +20,7 @@ from tvm import te import numpy as np from tvm import rpc -from tvm.contrib import util, tflite_runtime +from tvm.contrib import utils, tflite_runtime def _create_tflite_model(): @@ -70,7 +70,7 @@ def test_local(): tflite_fname = "model.tflite" tflite_model = _create_tflite_model() - temp = util.tempdir() + temp = utils.tempdir() tflite_model_path = temp.relpath(tflite_fname) open(tflite_model_path, "wb").write(tflite_model) @@ -111,7 +111,7 @@ def test_remote(): tflite_fname = "model.tflite" tflite_model = _create_tflite_model() - temp = util.tempdir() + temp = utils.tempdir() tflite_model_path = temp.relpath(tflite_fname) open(tflite_model_path, "wb").write(tflite_model) diff --git a/tests/python/contrib/test_util.py b/tests/python/contrib/test_util.py index 29c6fbf6c897..5078450cedb9 100644 --- a/tests/python/contrib/test_util.py +++ b/tests/python/contrib/test_util.py @@ -19,7 +19,7 @@ import datetime import os import shutil -from tvm.contrib import util +from tvm.contrib import utils def validate_debug_dir_path(temp_dir, expected_basename): @@ -32,55 +32,55 @@ def validate_debug_dir_path(temp_dir, expected_basename): def test_tempdir(): - assert util.TempDirectory._KEEP_FOR_DEBUG == False, "don't submit with KEEP_FOR_DEBUG == True" + assert utils.TempDirectory._KEEP_FOR_DEBUG == False, "don't submit with KEEP_FOR_DEBUG == True" - temp_dir = util.tempdir() + temp_dir = utils.tempdir() assert os.path.exists(temp_dir.temp_dir) - old_debug_mode = util.TempDirectory._KEEP_FOR_DEBUG - old_tempdirs = util.TempDirectory.TEMPDIRS + old_debug_mode = utils.TempDirectory._KEEP_FOR_DEBUG + old_tempdirs = utils.TempDirectory.TEMPDIRS try: for temp_dir_number in range(0, 3): - with util.TempDirectory.set_keep_for_debug(): - debug_temp_dir = util.tempdir() + with utils.TempDirectory.set_keep_for_debug(): + debug_temp_dir = utils.tempdir() try: validate_debug_dir_path(debug_temp_dir, "0000" + str(temp_dir_number)) finally: shutil.rmtree(debug_temp_dir.temp_dir) - with util.TempDirectory.set_keep_for_debug(): + with utils.TempDirectory.set_keep_for_debug(): # Create 2 temp_dir within the same session. - debug_temp_dir = util.tempdir() + debug_temp_dir = utils.tempdir() try: validate_debug_dir_path(debug_temp_dir, "00003") finally: shutil.rmtree(debug_temp_dir.temp_dir) - debug_temp_dir = util.tempdir() + debug_temp_dir = utils.tempdir() try: validate_debug_dir_path(debug_temp_dir, "00004") finally: shutil.rmtree(debug_temp_dir.temp_dir) - with util.TempDirectory.set_keep_for_debug(False): - debug_temp_dir = util.tempdir() # This one should get deleted. + with utils.TempDirectory.set_keep_for_debug(False): + debug_temp_dir = utils.tempdir() # This one should get deleted. 
         # Simulate atexit hook
-        util.TempDirectory.remove_tempdirs()
+        utils.TempDirectory.remove_tempdirs()

         # Calling twice should be a no-op.
-        util.TempDirectory.remove_tempdirs()
+        utils.TempDirectory.remove_tempdirs()

         # Creating a new TempDirectory should fail now
         try:
-            util.tempdir()
+            utils.tempdir()
             assert False, "creation should fail"
-        except util.DirectoryCreatedPastAtExit:
+        except utils.DirectoryCreatedPastAtExit:
             pass
     finally:
-        util.TempDirectory.DEBUG_MODE = old_debug_mode
-        util.TempDirectory.TEMPDIRS = old_tempdirs
+        utils.TempDirectory.DEBUG_MODE = old_debug_mode
+        utils.TempDirectory.TEMPDIRS = old_tempdirs


 if __name__ == "__main__":
diff --git a/tests/python/frontend/caffe/test_forward.py b/tests/python/frontend/caffe/test_forward.py
index 2005090d7b1a..d75ecd83a285 100644
--- a/tests/python/frontend/caffe/test_forward.py
+++ b/tests/python/frontend/caffe/test_forward.py
@@ -36,7 +36,7 @@
 import tvm
 from tvm import relay
-from tvm.contrib import util, graph_runtime
+from tvm.contrib import utils, graph_runtime
 from tvm.contrib.download import download_testdata

 CURRENT_DIR = os.path.join(os.path.expanduser("~"), ".tvm_test_data", "caffe_test")
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 143030d080c9..fc50d771c71d 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -2863,11 +2863,11 @@ def test_forward_inception_v1():
         # Build an image from random data.
         from PIL import Image
-        from tvm.contrib import util
+        from tvm.contrib import utils

         img_array = np.random.uniform(size=(1, 600, 600, 3)).astype("uint8")
         img = Image.frombuffer("RGB", (600, 600), img_array.tostring(), "raw", "RGB", 0, 1)
-        temp = util.tempdir()
+        temp = utils.tempdir()
         img_path = temp.relpath("tf-test.jpg")
         img.save(img_path)

diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 3f860a3c6580..89ae34899331 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -3724,7 +3724,7 @@ def test_forward_qnn_coco_ssd_mobilenet_v1():
     """Test the quantized Coco SSD Mobilenet V1 TF Lite model."""
     pytest.skip(
         "LLVM bug - getExtendedVectorNumElements - "
-        + "https://discuss.tvm.ai/t/segfault-in-llvm/3567. The workaround is to use a "
+        + "https://discuss.tvm.apache.org/t/segfault-in-llvm/3567. The workaround is to use a "
         + "specific target, for example, llvm -mcpu=core-avx2"
     )

diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py
index f29d1fba8f1e..d32699375050 100644
--- a/tests/python/integration/test_winograd_nnpack.py
+++ b/tests/python/integration/test_winograd_nnpack.py
@@ -23,7 +23,7 @@
 from tvm.contrib.pickle_memoize import memoize
 from tvm import topi
 import tvm.topi.testing
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
 from pytest import skip
 import tvm.testing

diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index d39e04ecd7c1..8784b97a31fa 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -22,7 +22,7 @@
 from tvm import relay
 from tvm.relay.loops import while_loop
 from tvm.relay.testing import run_infer_type as infer_type
-from util.assert_diagnostic import DiagnosticTesting
+from utils.assert_diagnostic import DiagnosticTesting

 import tvm.topi.testing

diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py
index c919e7ce1a7c..b2c4dd98721c 100644
--- a/tests/python/relay/test_external_codegen.py
+++ b/tests/python/relay/test_external_codegen.py
@@ -25,7 +25,7 @@
 import tvm.relay.transform
 from tvm import relay
 from tvm import runtime
-from tvm.contrib import util
+from tvm.contrib import utils


 def check_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", ctx=tvm.cpu()):
@@ -40,7 +40,7 @@ def update_lib(lib):
         kwargs = {}
         kwargs["options"] = ["-O2", "-std=c++14", "-I" + contrib_path]
-        tmp_path = util.tempdir()
+        tmp_path = utils.tempdir()
         lib_name = "lib.so"
         lib_path = tmp_path.relpath(lib_name)
         lib.export_library(lib_path, fcompile=False, **kwargs)
diff --git a/tests/python/relay/test_json_runtime.py b/tests/python/relay/test_json_runtime.py
index c09dab34be1e..df4dff81b03e 100644
--- a/tests/python/relay/test_json_runtime.py
+++ b/tests/python/relay/test_json_runtime.py
@@ -24,7 +24,7 @@
 import tvm.relay.op as reg
 import tvm.relay.testing
 from tvm import relay, runtime
-from tvm.contrib import util
+from tvm.contrib import utils
 from tvm.relay import transform
 from tvm.relay.backend import compile_engine
 from tvm.relay.build_module import bind_params_by_name
diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py
index 85332da64221..80a567d9cb65 100644
--- a/tests/python/relay/test_op_grad_level2.py
+++ b/tests/python/relay/test_op_grad_level2.py
@@ -38,7 +38,7 @@ def verify_max_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode):
     data = np.random.rand(*x_shape).astype("float32")
     ph, pw = padding
-    y_shape = topi.util.get_const_tuple(fwd_func.ret_type.shape)
+    y_shape = topi.utils.get_const_tuple(fwd_func.ret_type.shape)
     out_grad = np.ones(shape=y_shape)
     ref_grad = tvm.topi.testing.pool_grad_nchw(
         data,
@@ -87,7 +87,7 @@ def verify_avg_pool2d_grad(
     data = np.random.rand(*x_shape).astype(dtype)
     ph, pw = padding
-    y_shape = topi.util.get_const_tuple(fwd_func.ret_type.shape)
+    y_shape = topi.utils.get_const_tuple(fwd_func.ret_type.shape)
     out_grad = np.ones(shape=y_shape)
     ref_grad = tvm.topi.testing.pool_grad_nchw(
         data,
@@ -143,7 +143,7 @@ def verify_global_avg_pool2d_grad(x_shape):
     bwd_func = run_infer_type(gradient(fwd_func))

     data = np.random.rand(*x_shape).astype("float32")
-    y_shape = topi.util.get_const_tuple(fwd_func.ret_type.shape)
+    y_shape = topi.utils.get_const_tuple(fwd_func.ret_type.shape)
     out_grad = np.ones(shape=y_shape)
     ref_grad = tvm.topi.testing.pool_grad_nchw(
         data,
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 546ea6019e56..a2b791d8d33f 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -23,7 +23,7 @@
 from tvm import relay
 from tvm.relay import transform
 from tvm.relay.testing import run_infer_type
-from tvm.contrib import util
+from tvm.contrib import utils
 import tvm.topi.testing
 from tvm.topi.cuda.conv3d_winograd import _infer_tile_size
 import tvm.testing
@@ -258,7 +258,7 @@ def compile_test_conv2d_arm_cpu(
         ["data_pad_inline", "ot", 4], ["data_vec_inline", "ot", 1], \
         ["conv_inline", "ot", 0]]}], "r": [[0.0002933163], \
         0, 3.1976189613342285, 1570811630.6058347], "v": 0.1}'
-    temp = util.tempdir()
+    temp = utils.tempdir()
     with open(temp.relpath("temp.log"), "w") as log_file:
         log_file.write(test_schedule)
     with autotvm.apply_history_best(temp.relpath("temp.log")):
diff --git a/tests/python/relay/test_op_qnn_conv2d.py b/tests/python/relay/test_op_qnn_conv2d.py
index e14bba2d7f78..67d4c6f0b807 100644
--- a/tests/python/relay/test_op_qnn_conv2d.py
+++ b/tests/python/relay/test_op_qnn_conv2d.py
@@ -176,7 +176,7 @@ def get_funcs(
 def verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype):
     def get_inputs(data_shape, data_dtype, kernel_shape, kernel_dtype):
         # Keeping inputs multiple of 4 because of a bug in Average Pool2d
-        # https://discuss.tvm.ai/t/pool2d-gives-bad-output-for-integer-inputs/3377
+        # https://discuss.tvm.apache.org/t/pool2d-gives-bad-output-for-integer-inputs/3377
         low = -128
         high = 127
         if data_dtype == "uint8":
diff --git a/tests/python/relay/test_param_dict.py b/tests/python/relay/test_param_dict.py
index 68e4b5028c06..74c9ebcaa355 100644
--- a/tests/python/relay/test_param_dict.py
+++ b/tests/python/relay/test_param_dict.py
@@ -24,7 +24,7 @@
 from tvm.relay.op import add
 from tvm import relay
 from tvm import rpc
-from tvm.contrib import util, graph_runtime
+from tvm.contrib import utils, graph_runtime


 def test_save_load():
@@ -70,7 +70,7 @@ def verify_graph_runtime(remote, target, shape, dtype):
     params = {"x": x_in}
     graph, lib, params = relay.build(func, target=target, params=params)

-    temp = util.tempdir()
+    temp = utils.tempdir()
     path_dso = temp.relpath("dev_lib.o")
     lib.save(path_dso)
     remote.upload(path_dso)
diff --git a/tests/python/relay/test_pass_annotate_target.py b/tests/python/relay/test_pass_annotate_target.py
index 30b35fb8044b..b99e3bc02ba4 100644
--- a/tests/python/relay/test_pass_annotate_target.py
+++ b/tests/python/relay/test_pass_annotate_target.py
@@ -25,7 +25,7 @@
 import tvm.relay.transform as transform
 from tvm import relay
 from tvm import runtime
-from tvm.contrib import util
+from tvm.contrib import utils


 def check_result(
@@ -42,7 +42,7 @@ def update_lib(lib):
         kwargs = {}
         kwargs["options"] = ["-O2", "-std=c++14", "-I" + contrib_path]
-        tmp_path = util.tempdir()
+        tmp_path = utils.tempdir()
         lib_name = "lib.so"
         lib_path = tmp_path.relpath(lib_name)
         lib.export_library(lib_path, fcompile=False, **kwargs)
diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py
index 034cb48de186..8a7c4cbfbbd6 100644
--- a/tests/python/relay/test_pass_auto_quantize.py
+++ b/tests/python/relay/test_pass_auto_quantize.py
@@ -22,7 +22,7 @@
 from tvm import relay
 from tvm.relay import testing
 from tvm.relay.expr import Call
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple


 def quantize_and_build(out):
diff --git a/tests/python/relay/test_pass_merge_compiler_regions.py b/tests/python/relay/test_pass_merge_compiler_regions.py
index 8447eeffa6ba..ba94021d3f66 100644
--- a/tests/python/relay/test_pass_merge_compiler_regions.py
+++ b/tests/python/relay/test_pass_merge_compiler_regions.py
@@ -89,7 +89,7 @@ def expected():

 def test_example_graph():
     """This tests the merging algorithm on the example used in the RFC.
-    See the RFC here: https://discuss.tvm.ai/t/relay-improved-graph-partitioning-algorithm/5830
+    See the RFC here: https://discuss.tvm.apache.org/t/relay-improved-graph-partitioning-algorithm/5830
     Blue nodes are adds (target: test), red nodes are subtracts (target: default).
     """

diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py
index 2fd440e1c2c9..8d0e2d5e22e0 100644
--- a/tests/python/relay/test_pass_partition_graph.py
+++ b/tests/python/relay/test_pass_partition_graph.py
@@ -27,7 +27,7 @@
 from tvm import relay
 from tvm import runtime
 from tvm.relay import transform
-from tvm.contrib import util
+from tvm.contrib import utils
 from tvm.relay.backend import compile_engine
 from tvm.relay.expr_functor import ExprMutator
 from tvm.relay.op.annotation import compiler_begin, compiler_end
@@ -186,7 +186,7 @@ def update_lib(lib):
         kwargs = {}
         kwargs["options"] = ["-O2", "-std=c++14", "-I" + contrib_path]
-        tmp_path = util.tempdir()
+        tmp_path = utils.tempdir()
         lib_name = "lib.so"
         lib_path = tmp_path.relpath(lib_name)
         lib.export_library(lib_path, fcompile=False, **kwargs)
diff --git a/tests/python/relay/test_vm_serialization.py b/tests/python/relay/test_vm_serialization.py
index 14f003e3500b..b2a695dc5434 100644
--- a/tests/python/relay/test_vm_serialization.py
+++ b/tests/python/relay/test_vm_serialization.py
@@ -27,7 +27,7 @@
 from tvm.relay.scope_builder import ScopeBuilder
 from tvm.relay import transform
 from tvm.relay.prelude import Prelude
-from tvm.contrib import util
+from tvm.contrib import utils
 from tvm.relay import testing


@@ -129,7 +129,7 @@ def test_save_load():
     assert isinstance(code, bytearray)

     # save and load the code and lib file.
-    tmp = util.tempdir()
+    tmp = utils.tempdir()
     path_lib = tmp.relpath("lib.so")
     lib.export_library(path_lib)
     with open(tmp.relpath("code.ro"), "wb") as fo:
diff --git a/tests/python/relay/util/assert_diagnostic.py b/tests/python/relay/utils/assert_diagnostic.py
similarity index 100%
rename from tests/python/relay/util/assert_diagnostic.py
rename to tests/python/relay/utils/assert_diagnostic.py
diff --git a/tests/python/topi/python/test_topi_basic.py b/tests/python/topi/python/test_topi_basic.py
index 074319895bcb..108b92d903d9 100644
--- a/tests/python/topi/python/test_topi_basic.py
+++ b/tests/python/topi/python/test_topi_basic.py
@@ -17,13 +17,13 @@
 import tvm
 from tvm import te
 from tvm import topi
-from tvm.topi import util
+from tvm.topi import utils


 def test_util():
     x = tvm.tir.const(100, "int32")
-    assert util.get_const_int(x) == 100
-    assert util.get_const_tuple((x, x)) == (100, 100)
+    assert utils.get_const_int(x) == 100
+    assert utils.get_const_tuple((x, x)) == (100, 100)


 def test_ewise():
diff --git a/tests/python/topi/python/test_topi_batch_matmul.py b/tests/python/topi/python/test_topi_batch_matmul.py
index 0d82ee69fa26..e939f6c21e37 100644
--- a/tests/python/topi/python/test_topi_batch_matmul.py
+++ b/tests/python/topi/python/test_topi_batch_matmul.py
@@ -20,7 +20,7 @@
 from tvm import te
 from tvm import topi
 import tvm.topi.testing
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_bitserial_conv2d.py b/tests/python/topi/python/test_topi_bitserial_conv2d.py
index 8e55c7bd4306..b0bce44a03f9 100644
--- a/tests/python/topi/python/test_topi_bitserial_conv2d.py
+++ b/tests/python/topi/python/test_topi_bitserial_conv2d.py
@@ -20,7 +20,7 @@
 from tvm import topi
 import tvm.testing
 import tvm.topi.testing
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize

diff --git a/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py b/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py
index 76f342255541..1cd982db5450 100644
--- a/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py
+++ b/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py
@@ -21,7 +21,7 @@
 from tvm import te
 from tvm import topi
 import tvm.topi.testing
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple


 def generate_quantized_np(shape, bits, out_dtype):
diff --git a/tests/python/topi/python/test_topi_bitserial_dense.py b/tests/python/topi/python/test_topi_bitserial_dense.py
index 8b565ac52da5..a624b1b1fede 100644
--- a/tests/python/topi/python/test_topi_bitserial_dense.py
+++ b/tests/python/topi/python/test_topi_bitserial_dense.py
@@ -22,7 +22,7 @@
 from tvm import topi
 import tvm.testing
 import tvm.topi.testing
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize

 _bitserial_dense_implement = {
diff --git a/tests/python/topi/python/test_topi_bnn.py b/tests/python/topi/python/test_topi_bnn.py
index 1bec2d0507cf..fbd9ac5d66c6 100644
--- a/tests/python/topi/python/test_topi_bnn.py
+++ b/tests/python/topi/python/test_topi_bnn.py
@@ -20,7 +20,7 @@
 import tvm.testing
 from tvm import te
 from tvm import topi
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize

diff --git a/tests/python/topi/python/test_topi_clip.py b/tests/python/topi/python/test_topi_clip.py
index bee31a50a209..704ffe7e6843 100644
--- a/tests/python/topi/python/test_topi_clip.py
+++ b/tests/python/topi/python/test_topi_clip.py
@@ -21,7 +21,7 @@
 from tvm import topi
 import tvm.testing
 import tvm.topi.testing
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize

diff --git a/tests/python/topi/python/test_topi_conv1d.py b/tests/python/topi/python/test_topi_conv1d.py
index 533910308ce8..aad029ce3ce5 100644
--- a/tests/python/topi/python/test_topi_conv1d.py
+++ b/tests/python/topi/python/test_topi_conv1d.py
@@ -23,7 +23,7 @@
 import tvm.testing
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple


 _conv1d_ncw_implement = {
diff --git a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py
index 5b02d586a801..c251283f8011 100644
--- a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py
+++ b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py
@@ -22,7 +22,7 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
 import tvm.testing

 _conv1d_transpose_ncw_implement = {
diff --git a/tests/python/topi/python/test_topi_conv2d_NCHWc.py b/tests/python/topi/python/test_topi_conv2d_NCHWc.py
index a21790d61861..b1955ef5fa3b 100644
--- a/tests/python/topi/python/test_topi_conv2d_NCHWc.py
+++ b/tests/python/topi/python/test_topi_conv2d_NCHWc.py
@@ -24,8 +24,8 @@
 import tvm.testing
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.util import get_pad_tuple
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple
+from tvm.topi.utils import get_const_tuple


 def _transform_data(data, bn):
diff --git a/tests/python/topi/python/test_topi_conv2d_hwcn.py b/tests/python/topi/python/test_topi_conv2d_hwcn.py
index a16499ace46d..bd88839c9c15 100644
--- a/tests/python/topi/python/test_topi_conv2d_hwcn.py
+++ b/tests/python/topi/python/test_topi_conv2d_hwcn.py
@@ -22,7 +22,7 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py
index 758976469bfd..9d63175d2e84 100644
--- a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py
+++ b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py
@@ -25,8 +25,8 @@
 from tvm import te, autotvm, topi
 from tvm.contrib.pickle_memoize import memoize
 from tvm.contrib import nvcc
-from tvm.topi.nn.util import get_pad_tuple
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple
+from tvm.topi.utils import get_const_tuple

 _conv2d_hwnc_tensorcore_implement = {
     "cuda": (topi.cuda.conv2d_hwnc_tensorcore, topi.cuda.schedule_conv2d_hwnc_tensorcore)
diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py
index b2f9835c3d66..e5b0689da008 100644
--- a/tests/python/topi/python/test_topi_conv2d_int8.py
+++ b/tests/python/topi/python/test_topi_conv2d_int8.py
@@ -24,8 +24,8 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.util import get_pad_tuple
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple
+from tvm.topi.utils import get_const_tuple
 from tvm.topi.arm_cpu.conv2d_gemm import is_aarch64_arm

 from common import Int8Fallback

diff --git a/tests/python/topi/python/test_topi_conv2d_nchw.py b/tests/python/topi/python/test_topi_conv2d_nchw.py
index fef46e371da7..1b7575211dac 100644
--- a/tests/python/topi/python/test_topi_conv2d_nchw.py
+++ b/tests/python/topi/python/test_topi_conv2d_nchw.py
@@ -23,8 +23,8 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.util import get_pad_tuple
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple
+from tvm.topi.utils import get_const_tuple

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc.py b/tests/python/topi/python/test_topi_conv2d_nhwc.py
index 747bd4f4429b..8c3b9e931eea 100644
--- a/tests/python/topi/python/test_topi_conv2d_nhwc.py
+++ b/tests/python/topi/python/test_topi_conv2d_nhwc.py
@@ -22,7 +22,7 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py b/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py
index f6617379b549..c547ec7d0272 100644
--- a/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py
+++ b/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py
@@ -25,7 +25,7 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple


 def verify_conv2d_1x1_nhwc_pack_int8(
diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py b/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py
index 8d881b01250c..eab73410dbe6 100644
--- a/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py
+++ b/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py
@@ -24,8 +24,8 @@
 from tvm import te
 from tvm.contrib.pickle_memoize import memoize
 from tvm.contrib import nvcc
-from tvm.topi.nn.util import get_pad_tuple
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple
+from tvm.topi.utils import get_const_tuple

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py
index 3ffa4ac21f15..ac29fd6d0cff 100644
--- a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py
+++ b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py
@@ -24,8 +24,8 @@
 import tvm.topi.testing
 from tvm import te
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.util import get_pad_tuple
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple
+from tvm.topi.utils import get_const_tuple

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py
index 267cfbe4c990..5cc2c2eb6f5d 100644
--- a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py
+++ b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py
@@ -21,7 +21,7 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_conv2d_winograd.py b/tests/python/topi/python/test_topi_conv2d_winograd.py
index 69ef4f78723a..9f07dbd1be46 100644
--- a/tests/python/topi/python/test_topi_conv2d_winograd.py
+++ b/tests/python/topi/python/test_topi_conv2d_winograd.py
@@ -24,8 +24,8 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.util import get_pad_tuple
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple
+from tvm.topi.utils import get_const_tuple

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_conv3d_ncdhw.py b/tests/python/topi/python/test_topi_conv3d_ncdhw.py
index 58f30fbf6003..094a71074fa0 100644
--- a/tests/python/topi/python/test_topi_conv3d_ncdhw.py
+++ b/tests/python/topi/python/test_topi_conv3d_ncdhw.py
@@ -24,8 +24,8 @@
 import tvm.testing
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.util import get_pad_tuple3d
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple3d
+from tvm.topi.utils import get_const_tuple

 _conv3d_ncdhw_implement = {
     "generic": (topi.nn.conv3d_ncdhw, topi.generic.schedule_conv3d_ncdhw),
diff --git a/tests/python/topi/python/test_topi_conv3d_ndhwc.py b/tests/python/topi/python/test_topi_conv3d_ndhwc.py
index 2baacc07de75..2d2541af5979 100644
--- a/tests/python/topi/python/test_topi_conv3d_ndhwc.py
+++ b/tests/python/topi/python/test_topi_conv3d_ndhwc.py
@@ -23,7 +23,7 @@
 import tvm.testing
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple


 _conv3d_ndhwc_implement = {
diff --git a/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py b/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py
index 9a7d99ae6893..1e027aba4cd3 100644
--- a/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py
+++ b/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py
@@ -24,8 +24,8 @@
 from tvm import te
 from tvm.contrib.pickle_memoize import memoize
 from tvm.contrib import nvcc
-from tvm.topi.nn.util import get_pad_tuple3d
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple3d
+from tvm.topi.utils import get_const_tuple

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py b/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py
index 480ec193ccf4..2ac7ccc708ec 100644
--- a/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py
+++ b/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py
@@ -22,7 +22,7 @@
 import tvm.testing
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple


 _conv3d_transpose_ncdhw_implement = {
diff --git a/tests/python/topi/python/test_topi_conv3d_winograd.py b/tests/python/topi/python/test_topi_conv3d_winograd.py
index fbb2995603c4..d00249ba4392 100644
--- a/tests/python/topi/python/test_topi_conv3d_winograd.py
+++ b/tests/python/topi/python/test_topi_conv3d_winograd.py
@@ -24,8 +24,8 @@
 import tvm.testing
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.nn.util import get_pad_tuple3d
-from tvm.topi.util import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple3d
+from tvm.topi.utils import get_const_tuple


 _conv3d_ncdhw_implement = {
diff --git a/tests/python/topi/python/test_topi_correlation.py b/tests/python/topi/python/test_topi_correlation.py
index 17c21d6cb3f9..4709fb7d68f9 100644
--- a/tests/python/topi/python/test_topi_correlation.py
+++ b/tests/python/topi/python/test_topi_correlation.py
@@ -23,7 +23,7 @@
 import tvm.testing
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple

 _correlation_implement = {
     "generic": (topi.nn.correlation_nchw, topi.generic.schedule_correlation_nchw),
diff --git a/tests/python/topi/python/test_topi_deformable_conv2d.py b/tests/python/topi/python/test_topi_deformable_conv2d.py
index f57f42139395..34bfae7bad68 100644
--- a/tests/python/topi/python/test_topi_deformable_conv2d.py
+++ b/tests/python/topi/python/test_topi_deformable_conv2d.py
@@ -21,7 +21,7 @@
 from tvm import topi
 import tvm.topi.testing
 from tvm.contrib.pickle_memoize import memoize
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_dense.py b/tests/python/topi/python/test_topi_dense.py
index f46a271ced53..95ebce43497b 100644
--- a/tests/python/topi/python/test_topi_dense.py
+++ b/tests/python/topi/python/test_topi_dense.py
@@ -20,7 +20,7 @@
 from tvm import te
 from tvm import topi
 import tvm.topi.testing
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize

 from common import Int8Fallback

diff --git a/tests/python/topi/python/test_topi_dense_tensorcore.py b/tests/python/topi/python/test_topi_dense_tensorcore.py
index 07dab352da86..3ffdea50d660 100644
--- a/tests/python/topi/python/test_topi_dense_tensorcore.py
+++ b/tests/python/topi/python/test_topi_dense_tensorcore.py
@@ -20,7 +20,7 @@
 import tvm
 from tvm import topi
 import tvm.topi.testing
-from tvm.topi.util import get_const_tuple
+from tvm.topi.utils import get_const_tuple
 from tvm import te
 from tvm.contrib.pickle_memoize import memoize
 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py
index 07ddeab2e897..d5fbef98593c 100644
--- a/tests/python/topi/python/test_topi_depthwise_conv2d.py
+++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py
@@ -20,8 +20,8 @@
 from tvm import topi
 import tvm.topi.testing
 import numpy as np
-from tvm.topi.util import get_const_tuple
-from tvm.topi.nn.util import get_pad_tuple
+from tvm.topi.utils import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple
 from tvm.contrib.pickle_memoize import memoize

 import tvm.testing

diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py b/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py
index 8b4575fa396d..72ad1e29004a 100644
--- a/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py
+++ b/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py
@@ -20,8 +20,8 @@
 import numpy as np
 from tvm.contrib.pickle_memoize import memoize
 from scipy import signal
-from tvm.topi.util import get_const_tuple
-from tvm.topi.nn.util import get_pad_tuple
+from tvm.topi.utils import get_const_tuple
+from tvm.topi.nn.utils import get_pad_tuple
 import tvm.topi.testing
 from tvm.topi.cuda.depthwise_conv2d import
schedule_depthwise_conv2d_backward_input_nhwc import tvm.testing diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py b/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py index 3826f6f1f473..53328113aa71 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py @@ -21,8 +21,8 @@ import numpy as np from tvm.contrib.pickle_memoize import memoize from scipy import signal -from tvm.topi.util import get_const_tuple -from tvm.topi.nn.util import get_pad_tuple +from tvm.topi.utils import get_const_tuple +from tvm.topi.nn.utils import get_pad_tuple from tvm.topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc import tvm.testing diff --git a/tests/python/topi/python/test_topi_dilate.py b/tests/python/topi/python/test_topi_dilate.py index 0ee51a6c7bf4..27e71735c565 100644 --- a/tests/python/topi/python/test_topi_dilate.py +++ b/tests/python/topi/python/test_topi_dilate.py @@ -39,7 +39,7 @@ def _test_dilate(input_size, strides, dilation_value=None): else: output_np = tvm.topi.testing.dilate_python(input_np, strides, dilation_value) input_tvm = tvm.nd.array(input_np, ctx=ctx) - output_size = topi.util.get_const_tuple(Output.shape) + output_size = topi.utils.get_const_tuple(Output.shape) output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx) f = tvm.build(schedule, [Input, Output], target) f(input_tvm, output_tvm) diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py index 45e77d19082d..9c4da5c2c849 100644 --- a/tests/python/topi/python/test_topi_group_conv2d.py +++ b/tests/python/topi/python/test_topi_group_conv2d.py @@ -24,7 +24,7 @@ from tvm import topi import tvm.topi.testing from tvm.contrib.pickle_memoize import memoize -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from common import Int8Fallback import tvm.testing diff --git a/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py b/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py index 89e0e04206c7..b6cef2e97662 100644 --- a/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py +++ b/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py @@ -25,7 +25,7 @@ import tvm.testing import tvm.topi.testing from tvm.contrib.pickle_memoize import memoize -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple import pytest diff --git a/tests/python/topi/python/test_topi_lrn.py b/tests/python/topi/python/test_topi_lrn.py index 7e3300c39dc7..278926479977 100644 --- a/tests/python/topi/python/test_topi_lrn.py +++ b/tests/python/topi/python/test_topi_lrn.py @@ -19,7 +19,7 @@ import tvm from tvm import te from tvm import topi -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple import tvm.topi.testing import tvm.testing diff --git a/tests/python/topi/python/test_topi_math.py b/tests/python/topi/python/test_topi_math.py index 149ed82b150c..6e119e7aef6e 100644 --- a/tests/python/topi/python/test_topi_math.py +++ b/tests/python/topi/python/test_topi_math.py @@ -22,13 +22,13 @@ from tvm import topi import tvm.testing import tvm.topi.testing -from tvm.topi import util +from tvm.topi import utils def test_util(): x = tvm.tir.const(100, "int32") - assert util.get_const_int(x) == 100 - assert util.get_const_tuple((x, x)) == (100, 100) + assert utils.get_const_int(x) == 100 + 
assert utils.get_const_tuple((x, x)) == (100, 100) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_matmul.py b/tests/python/topi/python/test_topi_matmul.py index f6933d7247f4..26ba6f8142b4 100644 --- a/tests/python/topi/python/test_topi_matmul.py +++ b/tests/python/topi/python/test_topi_matmul.py @@ -19,7 +19,7 @@ import tvm.testing from tvm import te from tvm import topi -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple def with_tvm(lam, *args): diff --git a/tests/python/topi/python/test_topi_pooling.py b/tests/python/topi/python/test_topi_pooling.py index 251172f5a185..6f62b8ad969b 100644 --- a/tests/python/topi/python/test_topi_pooling.py +++ b/tests/python/topi/python/test_topi_pooling.py @@ -23,7 +23,7 @@ from tvm import topi import tvm.testing import tvm.topi.testing -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple import tvm.testing _pool_schedule = { diff --git a/tests/python/topi/python/test_topi_relu.py b/tests/python/topi/python/test_topi_relu.py index aa68f235f795..7c45acae0570 100644 --- a/tests/python/topi/python/test_topi_relu.py +++ b/tests/python/topi/python/test_topi_relu.py @@ -21,7 +21,7 @@ from tvm import te from tvm import topi import tvm.topi.testing -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from tvm.contrib.nvcc import have_fp16 import tvm.testing diff --git a/tests/python/topi/python/test_topi_reorg.py b/tests/python/topi/python/test_topi_reorg.py index 37a0eb95d8eb..93464d9bef03 100644 --- a/tests/python/topi/python/test_topi_reorg.py +++ b/tests/python/topi/python/test_topi_reorg.py @@ -17,7 +17,7 @@ """Example code to do reorg.""" import numpy as np from tvm import topi -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple import tvm from tvm import te import tvm.topi.testing diff --git a/tests/python/topi/python/test_topi_softmax.py b/tests/python/topi/python/test_topi_softmax.py index df4c4ea3bf0b..66c44f937c5e 100644 --- a/tests/python/topi/python/test_topi_softmax.py +++ b/tests/python/topi/python/test_topi_softmax.py @@ -23,7 +23,7 @@ import tvm.testing import tvm.topi.testing import logging -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple _softmax_schedule = { diff --git a/tests/python/topi/python/test_topi_sparse.py b/tests/python/topi/python/test_topi_sparse.py index 9426eb7499df..62f49e21418f 100644 --- a/tests/python/topi/python/test_topi_sparse.py +++ b/tests/python/topi/python/test_topi_sparse.py @@ -21,7 +21,7 @@ from tvm import topi from tvm import relay import tvm.topi.testing -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple import tvm.contrib.sparse as tvmsp from collections import namedtuple import time diff --git a/tests/python/topi/python/test_topi_upsampling.py b/tests/python/topi/python/test_topi_upsampling.py index 85042c032d03..0408220bfd65 100644 --- a/tests/python/topi/python/test_topi_upsampling.py +++ b/tests/python/topi/python/test_topi_upsampling.py @@ -22,7 +22,7 @@ import tvm.testing import tvm.topi.testing import math -from tvm.topi.util import nchw_pack_layout +from tvm.topi.utils import nchw_pack_layout def verify_upsampling( diff --git a/tests/python/topi/python/test_topi_util.py b/tests/python/topi/python/test_topi_util.py index 18182dc27b11..bd7585e56302 100644 --- a/tests/python/topi/python/test_topi_util.py +++ b/tests/python/topi/python/test_topi_util.py @@ -20,7 +20,7 @@ def 
verify_get_shape(src_shape, src_layout, dst_layout, expect_shape): - dst_shape = topi.util.get_shape(src_shape, src_layout, dst_layout) + dst_shape = topi.utils.get_shape(src_shape, src_layout, dst_layout) assert dst_shape == expect_shape, "Shape mismatch: expecting %s but got %s" % ( expect_shape, dst_shape, diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 0d02dd8b3311..24035ba9bba6 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -24,7 +24,7 @@ import tvm.topi.testing from tvm.contrib.pickle_memoize import memoize -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from tvm.topi.vision import ssd, non_max_suppression, get_valid_counts import pytest import tvm.testing diff --git a/tests/python/unittest/test_auto_scheduler_common.py b/tests/python/unittest/test_auto_scheduler_common.py index 764099e7bd07..6a3fe4e82c99 100644 --- a/tests/python/unittest/test_auto_scheduler_common.py +++ b/tests/python/unittest/test_auto_scheduler_common.py @@ -23,7 +23,7 @@ from tvm import te, auto_scheduler from tvm import topi from tvm.topi.nn.winograd_util import winograd_transform_matrices -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple @auto_scheduler.register_workload diff --git a/tests/python/unittest/test_autotvm_record.py b/tests/python/unittest/test_autotvm_record.py index c9d2c49de5f7..51cc9074a4fe 100644 --- a/tests/python/unittest/test_autotvm_record.py +++ b/tests/python/unittest/test_autotvm_record.py @@ -19,7 +19,7 @@ import tvm from tvm import te -from tvm.contrib import util +from tvm.contrib import utils from tvm import autotvm from tvm.autotvm.measure import MeasureInput, MeasureResult, MeasureErrorNo @@ -50,7 +50,7 @@ def test_load_dump(): def test_file_io(): - temp = util.tempdir() + temp = utils.tempdir() file_path = temp.relpath("temp.log") tsk, target = get_sample_task() diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 04018daca478..3b5471d0bb8b 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -29,7 +29,7 @@ import tvm import tvm.relay -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple from tvm.topi.testing import conv2d_nchw_python BUILD = True diff --git a/tests/python/unittest/test_format_si_prefix.py b/tests/python/unittest/test_format_si_prefix.py index b655654d6ae3..4df5c2b8cd13 100644 --- a/tests/python/unittest/test_format_si_prefix.py +++ b/tests/python/unittest/test_format_si_prefix.py @@ -17,7 +17,7 @@ from numpy import isclose import random -from tvm.autotvm import util +from tvm.autotvm import utils SI_PREFIXES = "yzafpn\xb5m kMGTPEZY" @@ -25,16 +25,16 @@ def test_format_si_prefix(): # test float conversion - assert util.format_si_prefix(1024, "k") == 1.024 + assert utils.format_si_prefix(1024, "k") == 1.024 for i, prefix in enumerate(SI_PREFIXES): integer, decimal = random.randint(0, 1000), random.randint(0, 1000) exp = -24 + 3 * i # 0th prefix (yocto) is 10^-24 number = integer * (10 ** exp) + decimal * (10 ** (exp - 3)) expected = integer + decimal / 1000 - assert isclose(util.format_si_prefix(number, prefix), expected) + assert isclose(utils.format_si_prefix(number, prefix), expected) - assert util.format_si_prefix(0, "y") == 0 + assert utils.format_si_prefix(0, "y") == 0 if __name__ == "__main__": diff --git a/tests/python/unittest/test_micro_artifact.py 
b/tests/python/unittest/test_micro_artifact.py index 80d34db6384f..d757f0956b81 100644 --- a/tests/python/unittest/test_micro_artifact.py +++ b/tests/python/unittest/test_micro_artifact.py @@ -22,7 +22,7 @@ import shutil import tvm -from tvm.contrib import util +from tvm.contrib import utils FILE_LIST = ["label1", "label2", "label12", "unlabelled"] @@ -58,7 +58,7 @@ def build_artifact(artifact_path, immobile=False): @tvm.testing.requires_micro def test_basic_functionality(): - temp_dir = util.tempdir() + temp_dir = utils.tempdir() artifact_path = temp_dir.relpath("foo") art = build_artifact(artifact_path) @@ -73,7 +73,7 @@ def test_basic_functionality(): def test_archive(): from tvm.micro import artifact - temp_dir = util.tempdir() + temp_dir = utils.tempdir() art = build_artifact(temp_dir.relpath("foo")) # Create archive @@ -110,7 +110,7 @@ def test_archive(): def test_metadata_only(): from tvm.micro import artifact - temp_dir = util.tempdir() + temp_dir = utils.tempdir() base_dir = temp_dir.relpath("foo") art = build_artifact(base_dir) diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py index 5968f0b0e02b..c43a35924420 100644 --- a/tests/python/unittest/test_runtime_graph.py +++ b/tests/python/unittest/test_runtime_graph.py @@ -20,7 +20,7 @@ import numpy as np import json from tvm import rpc -from tvm.contrib import util, graph_runtime +from tvm.contrib import utils, graph_runtime @tvm.testing.requires_llvm @@ -68,7 +68,7 @@ def check_remote(): mlib = tvm.build(s, [A, B], "llvm", name="myadd") server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) - temp = util.tempdir() + temp = utils.tempdir() ctx = remote.cpu(0) path_dso = temp.relpath("dev_lib.so") mlib.export_library(path_dso) diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py index 25dadbd44acf..8aeaf1a1a23b 100644 --- a/tests/python/unittest/test_runtime_graph_debug.py +++ b/tests/python/unittest/test_runtime_graph_debug.py @@ -21,7 +21,7 @@ from tvm import te import numpy as np from tvm import rpc -from tvm.contrib import util +from tvm.contrib import utils from tvm.contrib.debugger import debug_runtime as graph_runtime @@ -120,7 +120,7 @@ def check_remote(): mlib = tvm.build(s, [A, B], "llvm", name="myadd") server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) - temp = util.tempdir() + temp = utils.tempdir() ctx = remote.cpu(0) path_dso = temp.relpath("dev_lib.so") mlib.export_library(path_dso) diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index 80b330c77004..161f944ea7bb 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -21,7 +21,7 @@ import tvm from tvm import te -from tvm.contrib import graph_runtime, util +from tvm.contrib import graph_runtime, utils from tvm import topi @@ -415,7 +415,7 @@ def check_verify(): np.testing.assert_equal(out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d) def check_load_module(): - temp = util.tempdir() + temp = utils.tempdir() path_lib = temp.relpath("deploy.so") mhost.export_library(path_lib) with open(temp.relpath("deploy.json"), "w") as out_file: diff --git a/tests/python/unittest/test_runtime_measure.py b/tests/python/unittest/test_runtime_measure.py index 77f32a05ea61..0d02f910a44c 100644 --- a/tests/python/unittest/test_runtime_measure.py +++ 
b/tests/python/unittest/test_runtime_measure.py @@ -19,7 +19,7 @@ import tvm from tvm import te -from tvm.contrib.util import tempdir +from tvm.contrib.utils import tempdir def test_min_repeat_ms(): diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index 1d682d2fccdd..56ebb29c7c65 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -126,9 +126,9 @@ def verify_cpu_export(obj_format): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "llvm", params=params) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: @@ -165,9 +165,9 @@ def verify_gpu_export(obj_format): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: @@ -204,9 +204,9 @@ def verify_rpc_cpu_export(obj_format): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "llvm", params=params) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: @@ -248,9 +248,9 @@ def verify_rpc_gpu_export(obj_format): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: @@ -302,9 +302,9 @@ def verify_cpu_remove_package_params(obj_format): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "llvm", params=params) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: @@ -349,9 +349,9 @@ def verify_gpu_remove_package_params(obj_format): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: @@ -396,9 +396,9 @@ def verify_rpc_cpu_remove_package_params(obj_format): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "llvm", params=params) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: @@ -449,9 +449,9 @@ def verify_rpc_gpu_remove_package_params(obj_format): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: diff --git a/tests/python/unittest/test_runtime_module_export.py b/tests/python/unittest/test_runtime_module_export.py index fcdd906032af..88b7af984073 100644 --- a/tests/python/unittest/test_runtime_module_export.py +++ 
b/tests/python/unittest/test_runtime_module_export.py @@ -21,9 +21,9 @@ import tvm.testing -from tvm.contrib import util +from tvm.contrib import utils -header_file_dir_path = util.tempdir() +header_file_dir_path = utils.tempdir() def gen_engine_header(): @@ -80,9 +80,9 @@ def verify_gpu_mod_export(obj_format): synthetic_llvm_mod, "llvm", params=synthetic_llvm_params ) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: @@ -112,9 +112,9 @@ def verify_multi_dso_mod_export(obj_format): B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm", name="myadd") - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() if obj_format == ".so": file_name = "deploy_lib.so" else: @@ -153,9 +153,9 @@ def verify_json_import_dso(obj_format): + "mul 6 inputs: 5 3 shape: 10 10" ) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() subgraph_path = temp.relpath("subgraph.examplejson") with open(subgraph_path, "w") as f: f.write(subgraph_json) @@ -204,9 +204,9 @@ def verify_multi_c_mod_export(): s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "c", name="myadd") engine_module = generate_engine_module() - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() file_name = "deploy_lib.so" path_lib = temp.relpath(file_name) synthetic_cpu_lib.import_module(f) diff --git a/tests/python/unittest/test_runtime_module_load.py b/tests/python/unittest/test_runtime_module_load.py index 81aa2ba5cc95..7befed3bbcdd 100644 --- a/tests/python/unittest/test_runtime_module_load.py +++ b/tests/python/unittest/test_runtime_module_load.py @@ -16,7 +16,7 @@ # under the License. 
import tvm from tvm import te -from tvm.contrib import cc, util +from tvm.contrib import cc, utils import ctypes import os import sys @@ -47,7 +47,7 @@ def test_dso_module_load(): if not tvm.testing.device_enabled("llvm"): return dtype = "int64" - temp = util.tempdir() + temp = utils.tempdir() def save_object(names): n = te.size_var("n") @@ -105,7 +105,7 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return - temp = util.tempdir() + temp = utils.tempdir() name = "myadd_%s" % device if sys.platform == "darwin" or sys.platform.startswith("linux"): f = tvm.build(s, [A, B], device, "llvm -system-lib", name=name) @@ -133,7 +133,7 @@ def check_stackvm(device): if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return - temp = util.tempdir() + temp = utils.tempdir() name = "myadd_%s" % device f = tvm.build(s, [A, B], device, "stackvm", name=name) path_dso = temp.relpath("dev_lib.stackvm") @@ -163,7 +163,7 @@ def check_llvm(): if not tvm.testing.device_enabled("llvm"): print("Skip because llvm is not enabled") return - temp = util.tempdir() + temp = utils.tempdir() fadd1 = tvm.build(s, [A, B], "llvm", name="myadd1") fadd2 = tvm.build(s, [A, B], "llvm", name="myadd2") path1 = temp.relpath("myadd1.o") @@ -188,7 +188,7 @@ def check_system_lib(): if not tvm.testing.device_enabled("llvm"): print("Skip because llvm is not enabled") return - temp = util.tempdir() + temp = utils.tempdir() fadd1 = tvm.build(s, [A, B], "llvm -system-lib", name="myadd1") fadd2 = tvm.build(s, [A, B], "llvm -system-lib", name="myadd2") path1 = temp.relpath("myadd1.o") diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index d25eff23ae76..e0fe8b5fe26f 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -26,7 +26,7 @@ import pytest import numpy as np from tvm import rpc -from tvm.contrib import util, cc +from tvm.contrib import utils, cc from tvm.rpc.tracker import Tracker @@ -46,7 +46,7 @@ def verify_rpc(remote, target, shape, dtype): ctx = remote.cpu(0) a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), ctx=ctx) b = tvm.nd.array(np.zeros(shape).astype(A.dtype), ctx=ctx) - temp = util.tempdir() + temp = utils.tempdir() path_dso = temp.relpath("dev_lib.o") f.save(path_dso) remote.upload(path_dso) @@ -164,7 +164,7 @@ def check_minrpc(): if tvm.get_global_func("rpc.CreatePipeClient", allow_missing=True) is None: return # Test minrpc server. 
- temp = util.tempdir() + temp = utils.tempdir() minrpc_exec = temp.relpath("minrpc") tvm.rpc.with_minrpc(cc.create_executable)(minrpc_exec, []) check(rpc.PopenSession(minrpc_exec)) @@ -212,7 +212,7 @@ def test_rpc_remote_module(): ) def check_remote(remote): - temp = util.tempdir() + temp = utils.tempdir() ctx = remote.cpu(0) f = tvm.build(s, [A, B], "llvm", name="myadd") path_dso = temp.relpath("dev_lib.so") @@ -243,7 +243,7 @@ def check_minrpc(): if tvm.get_global_func("rpc.CreatePipeClient", allow_missing=True) is None: return # export to minrpc - temp = util.tempdir() + temp = utils.tempdir() f = tvm.build(s, [A, B], "llvm --system-lib", name="myadd") path_minrpc = temp.relpath("dev_lib.minrpc") f.export_library(path_minrpc, rpc.with_minrpc(cc.create_executable)) @@ -278,7 +278,7 @@ def check_remote_link_cl(remote): if not tvm.testing.device_enabled("opencl"): print("Skip because opencl is not enabled") return - temp = util.tempdir() + temp = utils.tempdir() ctx = remote.cl(0) s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) diff --git a/tests/python/unittest/test_target_codegen_blob.py b/tests/python/unittest/test_target_codegen_blob.py index 8189d1384e92..dc42381cf82d 100644 --- a/tests/python/unittest/test_target_codegen_blob.py +++ b/tests/python/unittest/test_target_codegen_blob.py @@ -49,9 +49,9 @@ def verify(data): with tvm.transform.PassContext(opt_level=3): synthetic_gpu_lib = relay.build_module.build(synthetic_mod, "cuda", params=synthetic_params) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() path_lib = temp.relpath("deploy_lib.so") synthetic_gpu_lib.export_library(path_lib) @@ -82,9 +82,9 @@ def test_cuda_lib(): s[B].bind(bx, te.thread_axis("blockIdx.x")) s[B].bind(tx, te.thread_axis("threadIdx.x")) - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() fn_add = tvm.build(s, [A, B], target="cuda", target_host="llvm", name="add") path_lib = temp.relpath("deploy_lib.so") fn_add.export_library(path_lib) diff --git a/tests/python/unittest/test_target_codegen_c_host.py b/tests/python/unittest/test_target_codegen_c_host.py index a2ed6345e651..3178d6dad0e4 100644 --- a/tests/python/unittest/test_target_codegen_c_host.py +++ b/tests/python/unittest/test_target_codegen_c_host.py @@ -18,7 +18,7 @@ import tvm.testing from tvm import te import numpy as np -from tvm.contrib import util +from tvm.contrib import utils def test_add(): @@ -31,7 +31,7 @@ def test_add(): def check_c(): mhost = tvm.build(s, [A, B, C], "c", name="fadd") - temp = util.tempdir() + temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) @@ -76,7 +76,7 @@ def check_c(): f1 = tvm.lower(s, [A, B, C], name="fadd_pipeline") mhost = tvm.build(f1, target="c") - temp = util.tempdir() + temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) @@ -104,7 +104,7 @@ def test_reinterpret(): def check_c(): mhost = tvm.build(s, [A, B], "c", name="reinterpret") - temp = util.tempdir() + temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) diff --git a/tests/python/unittest/test_target_codegen_cross_llvm.py b/tests/python/unittest/test_target_codegen_cross_llvm.py index c0ab65100d52..a55530a090e4 100644 --- a/tests/python/unittest/test_target_codegen_cross_llvm.py +++ 
b/tests/python/unittest/test_target_codegen_cross_llvm.py @@ -21,7 +21,7 @@ import os import struct from tvm import rpc -from tvm.contrib import util, cc +from tvm.contrib import utils, cc import numpy as np @@ -46,7 +46,7 @@ def verify_elf(path, e_machine): assert struct.unpack(endian + "h", arr[0x12:0x14])[0] == e_machine def build_i386(): - temp = util.tempdir() + temp = utils.tempdir() target = "llvm -mtriple=i386-pc-linux-gnu" f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") @@ -58,7 +58,7 @@ def build_arm(): if not tvm.runtime.enabled(target): print("Skip because %s is not enabled.." % target) return - temp = util.tempdir() + temp = utils.tempdir() f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") f.save(path) diff --git a/tests/python/unittest/test_target_codegen_device.py b/tests/python/unittest/test_target_codegen_device.py index 3cdcb2d7345b..3b764c6709a8 100644 --- a/tests/python/unittest/test_target_codegen_device.py +++ b/tests/python/unittest/test_target_codegen_device.py @@ -16,7 +16,7 @@ # under the License. import tvm from tvm import te -from tvm.contrib import util +from tvm.contrib import utils import numpy as np import tvm.testing diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index 19773e59a777..3599493a74cb 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -18,7 +18,7 @@ import tvm.testing from tvm import te from tvm import topi -from tvm.contrib import util, clang +from tvm.contrib import utils, clang import numpy as np import ctypes import math @@ -594,7 +594,7 @@ def check_llvm_object(): f2 = tvm.lower(s, [A, B, C], name="fadd1") f1 = tvm.lower(s, [A, B, C], name="fadd2") m = tvm.build([f1, f2], "llvm") - temp = util.tempdir() + temp = utils.tempdir() o_path = temp.relpath("temp.o") m.save(o_path) import shutil diff --git a/tests/python/unittest/test_te_autodiff.py b/tests/python/unittest/test_te_autodiff.py index 7b591dcd87f1..6031182091fe 100644 --- a/tests/python/unittest/test_te_autodiff.py +++ b/tests/python/unittest/test_te_autodiff.py @@ -19,7 +19,7 @@ from tvm import te from tvm.testing import assert_allclose from tvm import topi -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple import pytest import numpy as np diff --git a/tests/python/unittest/test_te_hybrid_script.py b/tests/python/unittest/test_te_hybrid_script.py index 3afdb66134e3..06d409933f1f 100644 --- a/tests/python/unittest/test_te_hybrid_script.py +++ b/tests/python/unittest/test_te_hybrid_script.py @@ -17,7 +17,7 @@ import tvm, inspect, sys, traceback, numpy, pytest, types, os from tvm import te -from tvm.contrib import util +from tvm.contrib import utils from tvm.te.hybrid import script from tvm.te.hybrid.runtime import HYBRID_GLOBALS @@ -147,7 +147,7 @@ def test_outer_product(): assert mul.b.producer.name == "b" func, ins, outs = run_and_check(outer_product, [n, m, a, b], {n: 99, m: 101}) - temp = util.tempdir() + temp = utils.tempdir() path = temp.relpath("%s.py" % func.name) func.save(path) func_ = te.hybrid.HybridModule() diff --git a/tests/python/unittest/test_te_tensor_overload.py b/tests/python/unittest/test_te_tensor_overload.py index b44a85c26f4f..715771747d53 100644 --- a/tests/python/unittest/test_te_tensor_overload.py +++ b/tests/python/unittest/test_te_tensor_overload.py @@ -19,7 +19,7 @@ from tvm import te from tvm import topi import tvm.topi.testing -from tvm.topi.util 
import get_const_tuple +from tvm.topi.utils import get_const_tuple import tvm.testing diff --git a/tests/python/unittest/test_tir_data_layout.py b/tests/python/unittest/test_tir_data_layout.py index 22c24fafc54f..5c2eb8febd9b 100644 --- a/tests/python/unittest/test_tir_data_layout.py +++ b/tests/python/unittest/test_tir_data_layout.py @@ -18,7 +18,7 @@ import tvm from tvm import te -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple def test_layout(): diff --git a/tests/python/unittest/test_tir_intrin.py b/tests/python/unittest/test_tir_intrin.py index 3d54175b8731..76390dace757 100644 --- a/tests/python/unittest/test_tir_intrin.py +++ b/tests/python/unittest/test_tir_intrin.py @@ -18,7 +18,7 @@ import tvm.testing from tvm import te from tvm import topi -from tvm.contrib import util, clang +from tvm.contrib import utils, clang import numpy as np import ctypes import math diff --git a/tests/scripts/task_golang.sh b/tests/scripts/task_golang.sh index 7a93b47c2913..5b01eeca164c 100755 --- a/tests/scripts/task_golang.sh +++ b/tests/scripts/task_golang.sh @@ -28,5 +28,7 @@ export PYTHONPATH="$tvm_root/python" export TVM_BIND_THREADS=0 export OMP_NUM_THREADS=1 +make -C golang clean + # Golang tests make -C golang tests diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 31fda54a9a7e..7514ee708292 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -71,7 +71,7 @@ from tvm import relay import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib.util import tempdir +from tvm.contrib.utils import tempdir import tvm.contrib.graph_runtime as runtime ################################################################# @@ -416,4 +416,4 @@ def tune_and_evaluate(tuning_opt): # import logging # logging.getLogger('autotvm').setLevel(logging.DEBUG) # -# Finally, always feel free to ask our community for help on https://discuss.tvm.ai +# Finally, always feel free to ask our community for help on https://discuss.tvm.apache.org diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index e86430767b31..f9b89211a066 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -69,7 +69,7 @@ from tvm import relay import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib.util import tempdir +from tvm.contrib.utils import tempdir import tvm.contrib.graph_runtime as runtime ################################################################# @@ -317,7 +317,7 @@ def tune_and_evaluate(tuning_opt): # import logging # logging.getLogger('autotvm').setLevel(logging.DEBUG) # -# Finally, always feel free to ask our community for help on https://discuss.tvm.ai +# Finally, always feel free to ask our community for help on https://discuss.tvm.apache.org ################################################################# diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 9a112e134f4f..b7fbf89e59aa 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -70,7 +70,7 @@ from tvm import relay import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib.util import tempdir +from tvm.contrib.utils import tempdir import tvm.contrib.graph_runtime as runtime 
################################################################# @@ -421,4 +421,4 @@ def tune_and_evaluate(tuning_opt): # import logging # logging.getLogger('autotvm').setLevel(logging.DEBUG) # -# Finally, always feel free to ask our community for help on https://discuss.tvm.ai +# Finally, always feel free to ask our community for help on https://discuss.tvm.apache.org diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index 851810a83958..35f989c7a5ca 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -34,7 +34,7 @@ from tvm import te import tvm.relay as relay from tvm import rpc -from tvm.contrib import util, ndk, graph_runtime as runtime +from tvm.contrib import utils, ndk, graph_runtime as runtime from tvm.contrib.download import download_testdata @@ -282,7 +282,7 @@ def transform_image(image): # change the parameters but keep the result of model as the same. # Save the library at local temporary directory. -tmp = util.tempdir() +tmp = utils.tempdir() lib_fname = tmp.relpath("net.so") fcompile = ndk.create_shared if not local_demo else None lib.export_library(lib_fname, fcompile) diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/tutorials/frontend/deploy_model_on_rasp.py index 8b49a213ef36..36879910a1b9 100644 --- a/tutorials/frontend/deploy_model_on_rasp.py +++ b/tutorials/frontend/deploy_model_on_rasp.py @@ -30,7 +30,7 @@ from tvm import te import tvm.relay as relay from tvm import rpc -from tvm.contrib import util, graph_runtime as runtime +from tvm.contrib import utils, graph_runtime as runtime from tvm.contrib.download import download_testdata ###################################################################### @@ -193,7 +193,7 @@ def transform_image(image): # change the parameters but keep the result of model as the same. # Save the library at local temporary directory. -tmp = util.tempdir() +tmp = utils.tempdir() lib_fname = tmp.relpath("net.tar") lib.export_library(lib_fname) diff --git a/tutorials/frontend/deploy_prequantized.py b/tutorials/frontend/deploy_prequantized.py index e9f1a4c7c326..beba332a8a26 100644 --- a/tutorials/frontend/deploy_prequantized.py +++ b/tutorials/frontend/deploy_prequantized.py @@ -22,7 +22,7 @@ This is a tutorial on loading models quantized by deep learning frameworks into TVM. Pre-quantized model import is one of the quantization support we have in TVM. More details on the quantization story in TVM can be found -`here `_. +`here `_. Here, we demonstrate how to load and run models quantized by PyTorch, MXNet, and TFLite. Once loaded, we can run compiled, quantized models on any hardware TVM supports. diff --git a/tutorials/frontend/deploy_quantized.py b/tutorials/frontend/deploy_quantized.py index 093bd732eb3c..e75f6e92a6f1 100644 --- a/tutorials/frontend/deploy_quantized.py +++ b/tutorials/frontend/deploy_quantized.py @@ -22,7 +22,7 @@ This article is an introductory tutorial of automatic quantization with TVM. Automatic quantization is one of the quantization modes in TVM. More details on the quantization story in TVM can be found -`here `_. +`here `_. In this tutorial, we will import a GluonCV pre-trained model on ImageNet to Relay, quantize the Relay model and then perform the inference. 
""" diff --git a/tutorials/frontend/from_onnx.py b/tutorials/frontend/from_onnx.py index 141defe65488..1557ea551d28 100644 --- a/tutorials/frontend/from_onnx.py +++ b/tutorials/frontend/from_onnx.py @@ -111,4 +111,4 @@ # retains that dynamism upon import, and the compiler attemps to convert the model # into a static shapes at compile time. If this fails, there may still be dynamic # operations in the model. Not all TVM kernels currently support dynamic shapes, -# please file an issue on discuss.tvm.ai if you hit an error with dynamic kernels. +# please file an issue on discuss.tvm.apache.org if you hit an error with dynamic kernels. diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/tutorials/get_started/cross_compilation_and_rpc.py index 572ebb897e3c..69284c0521a3 100644 --- a/tutorials/get_started/cross_compilation_and_rpc.py +++ b/tutorials/get_started/cross_compilation_and_rpc.py @@ -98,7 +98,7 @@ import tvm from tvm import te from tvm import rpc -from tvm.contrib import util +from tvm.contrib import utils n = tvm.runtime.convert(1024) A = te.placeholder((n,), name="A") @@ -120,7 +120,7 @@ func = tvm.build(s, [A, B], target=target, name="add_one") # save the lib at a local temp folder -temp = util.tempdir() +temp = utils.tempdir() path = temp.relpath("lib.tar") func.export_library(path) diff --git a/tutorials/get_started/relay_quick_start.py b/tutorials/get_started/relay_quick_start.py index cece7ab04316..6da62f5ced4b 100644 --- a/tutorials/get_started/relay_quick_start.py +++ b/tutorials/get_started/relay_quick_start.py @@ -129,9 +129,9 @@ #################################################### # save the graph, lib and params into separate files -from tvm.contrib import util +from tvm.contrib import utils -temp = util.tempdir() +temp = utils.tempdir() path_lib = temp.relpath("deploy_lib.tar") lib.export_library(path_lib) print(temp.listdir()) diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py index 0ca2243c7c3b..7f1bb6a3d1e5 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/tutorials/get_started/tensor_expr_get_started.py @@ -210,9 +210,9 @@ # - cc.create_shared calls a compiler (gcc) to create a shared library # from tvm.contrib import cc -from tvm.contrib import util +from tvm.contrib import utils -temp = util.tempdir() +temp = utils.tempdir() fadd.save(temp.relpath("myadd.o")) if tgt == "cuda": fadd.imported_modules[0].save(temp.relpath("myadd.ptx")) diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py index 601adb8dce46..e91cfe43ab46 100644 --- a/tutorials/language/tensorize.py +++ b/tutorials/language/tensorize.py @@ -160,9 +160,9 @@ def gemv_impl(): return 0; } """ - from tvm.contrib import util, clang + from tvm.contrib import utils, clang - temp = util.tempdir() + temp = utils.tempdir() ll_path = temp.relpath("temp.ll") # Create LLVM ir from c source code ll_code = clang.create_llvm(cc_code, output=ll_path) @@ -182,7 +182,7 @@ def gemv_impl(): # func = tvm.build(s, [A, B, C], target="llvm", name="gemv") -from tvm.topi.util import get_const_tuple +from tvm.topi.utils import get_const_tuple dtype = A.dtype ctx = tvm.context("cpu", 0) @@ -228,9 +228,9 @@ def gemv_impl(): return 0; } """ - from tvm.contrib import util, clang + from tvm.contrib import utils, clang - temp = util.tempdir() + temp = utils.tempdir() ll_path = temp.relpath("temp.ll") # Create LLVM ir from c source code ll_code = clang.create_llvm(cc_code, output=ll_path) diff --git 
a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py index 6fd2de15288d..293f95cf75c6 100644 --- a/tutorials/micro/micro_tflite.py +++ b/tutorials/micro/micro_tflite.py @@ -98,7 +98,7 @@ import tvm import tvm.micro as micro from tvm.contrib.download import download_testdata -from tvm.contrib import graph_runtime, util +from tvm.contrib import graph_runtime, utils from tvm import relay # %% diff --git a/version.py b/version.py index 461b81a5571b..f3af4202fe6b 100644 --- a/version.py +++ b/version.py @@ -85,7 +85,7 @@ def git_describe_version(): msg = py_str(out) if msg.find("not a git repository") != -1: return __version__, __version__ - logging.warning("git describe error: %", msg) + logging.warning("git describe: %s, use %s", msg, __version__) return __version__, __version__ describe = py_str(out).strip() arr_info = describe.split("-") diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py index d39f982823f9..d143c4db6884 100644 --- a/vta/python/vta/__init__.py +++ b/vta/python/vta/__init__.py @@ -20,8 +20,6 @@ Besides the compiler toolchain, it also includes utility functions to configure the hardware environment and access remote device through RPC. """ -from __future__ import absolute_import as _abs - import sys from .bitstream import get_bitstream_path, download_bitstream diff --git a/vta/python/vta/testing/__init__.py b/vta/python/vta/testing/__init__.py index fbc50b063fb5..8d294c2f4d22 100644 --- a/vta/python/vta/testing/__init__.py +++ b/vta/python/vta/testing/__init__.py @@ -17,4 +17,4 @@ """Testing utilities, this namespace is not imported by default.""" -from .util import run +from .utils import run diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/utils.py similarity index 100% rename from vta/python/vta/testing/util.py rename to vta/python/vta/testing/utils.py diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 6f62aff469d4..b9ebe55703c5 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -24,4 +24,4 @@ from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed from .vta_dense import dense_packed, schedule_dense_packed -from . import util +from . 
import utils diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index 52bd13da6c23..50b3729a1e44 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -21,7 +21,7 @@ import tvm from tvm import te -from tvm.topi import util +from tvm.topi import utils from tvm.relay.op.op import register_compute, register_injective_schedule from tvm.relay.op.op import register_pattern, OpPattern @@ -55,7 +55,7 @@ def bitpack(data, bits, pack_type="int8", name="bitpack"): lanes = data_width // bits # Data must be in multiples of the data_width - assert util.get_const_int(shape_vec[-1]) % lanes == 0, "Not a multiple of word size" + assert utils.get_const_int(shape_vec[-1]) % lanes == 0, "Not a multiple of word size" shape_vec[-1] = shape_vec[-1] // lanes oshape = tuple(shape_vec) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 27105576f218..f3b808a6d1a0 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -26,7 +26,7 @@ from tvm.relay.op import strategy as _strategy from tvm.relay.op.op import OpPattern, OpStrategy -from .util import is_packed_layout +from .utils import is_packed_layout from .vta_conv2d import conv2d_packed, schedule_conv2d_packed from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed @@ -69,7 +69,7 @@ def conv2d_strategy_vta(attrs, inputs, out_type, target): """conv2d vta strategy""" strategy = OpStrategy() kernel = inputs[1] - dilation = topi.util.get_const_tuple(attrs.dilation) + dilation = topi.utils.get_const_tuple(attrs.dilation) groups = attrs.groups layout = attrs.data_layout @@ -102,7 +102,7 @@ def conv2d_strategy_vta(attrs, inputs, out_type, target): @_strategy.conv2d_transpose_strategy.register("vta") def conv2d_transpose_strategy_vta(attrs, inputs, out_type, target): """conv2d_transpose vta strategy""" - dilation = topi.util.get_const_tuple(attrs.dilation) + dilation = topi.utils.get_const_tuple(attrs.dilation) layout = attrs.data_layout assert dilation == (1, 1), "support for dilation limited to (1, 1)" diff --git a/vta/python/vta/top/util.py b/vta/python/vta/top/utils.py similarity index 100% rename from vta/python/vta/top/util.py rename to vta/python/vta/top/utils.py diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index e1555654b1e2..0b9cb719189f 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -23,7 +23,7 @@ from tvm import autotvm from tvm import topi -from .util import is_packed_layout +from .utils import is_packed_layout from ..environment import get_env @@ -40,12 +40,12 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty pad_data = data assert len(data.shape) == 6 assert len(kernel.shape) == 6 - oheight = topi.util.get_const_int((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) - owidth = topi.util.get_const_int((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) + oheight = topi.utils.get_const_int((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) + owidth = topi.utils.get_const_int((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4]) - ishape = topi.util.get_const_tuple(data.shape) - kshape = topi.util.get_const_tuple(kernel.shape) + ishape = topi.utils.get_const_tuple(data.shape) + kshape = topi.utils.get_const_tuple(kernel.shape) d_i = 
te.reduce_axis((0, kshape[2]), name="d_i") d_j = te.reduce_axis((0, kshape[3]), name="d_j") k_o = te.reduce_axis((0, ishape[1]), name="k_o") @@ -64,7 +64,7 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty cfg.add_flop( 2 - * np.prod(topi.util.get_const_tuple(oshape)) + * np.prod(topi.utils.get_const_tuple(oshape)) * kshape[2] * kshape[3] * ishape[1] diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py index c020747221c5..5a44104baa57 100644 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -22,8 +22,8 @@ from tvm import te from tvm import autotvm from tvm import topi -from tvm.topi.util import get_const_tuple -from tvm.topi.nn.util import get_pad_tuple +from tvm.topi.utils import get_const_tuple +from tvm.topi.nn.utils import get_pad_tuple from ..environment import get_env @@ -75,7 +75,7 @@ def conv2d_transpose_packed(cfg, data, kernel, strides, padding, out_dtype, outp cfg.add_flop( 2 - * np.prod(topi.util.get_const_tuple(oshape)) + * np.prod(topi.utils.get_const_tuple(oshape)) * kshape[2] * kshape[3] * ishape[1] diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 4a618f198324..5e06cf9f5624 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -44,8 +44,8 @@ def dense_packed(cfg, data, weight, bias=None, out_dtype=None): raise topi.InvalidShapeError() # Derive shapes - ishape = topi.util.get_const_tuple(data.shape) - wshape = topi.util.get_const_tuple(weight.shape) + ishape = topi.utils.get_const_tuple(data.shape) + wshape = topi.utils.get_const_tuple(weight.shape) oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) # Reduction axes (input channel) @@ -64,7 +64,7 @@ def dense_packed(cfg, data, weight, bias=None, out_dtype=None): tag="dense_pack", ) - cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * ishape[1] * ishape[3]) + cfg.add_flop(2 * np.prod(topi.utils.get_const_tuple(oshape)) * ishape[1] * ishape[3]) return res diff --git a/vta/python/vta/top/vta_group_conv2d.py b/vta/python/vta/top/vta_group_conv2d.py index b2661b38f15c..deb4ea779214 100644 --- a/vta/python/vta/top/vta_group_conv2d.py +++ b/vta/python/vta/top/vta_group_conv2d.py @@ -41,12 +41,12 @@ def group_conv2d_packed(cfg, data, kernel, strides, padding, dilation, group, ou assert kernel.dtype == "int8", kernel.dtype assert out_dtype == "int32", out_dtype - oheight = topi.util.get_const_int((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) - owidth = topi.util.get_const_int((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) + oheight = topi.utils.get_const_int((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) + owidth = topi.utils.get_const_int((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4]) - ishape = topi.util.get_const_tuple(data.shape) - kshape = topi.util.get_const_tuple(kernel.shape) + ishape = topi.utils.get_const_tuple(data.shape) + kshape = topi.utils.get_const_tuple(kernel.shape) assert group * kshape[1] == ishape[1] assert kshape[0] % group == 0 d_i = te.reduce_axis((0, kshape[2]), name="d_i") @@ -74,7 +74,7 @@ def group_conv2d_packed(cfg, data, kernel, strides, padding, dilation, group, ou cfg.add_flop( 2 - * np.prod(topi.util.get_const_tuple(oshape)) + * np.prod(topi.utils.get_const_tuple(oshape)) * kshape[2] * kshape[3] * ishape[1] diff --git 
a/vta/python/vta/transform.py b/vta/python/vta/transform.py index ed64ba3dc26c..a485d2cfb7b8 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -18,7 +18,7 @@ # pylint: disable=len-as-condition, no-else-return, unused-argument, invalid-name import tvm from tvm import te -from tvm.topi import util +from tvm.topi import utils from .environment import get_env @@ -346,7 +346,7 @@ def _check_compact(buf): ndim = len(buf.shape) size = tvm.tir.const(1, buf.shape[0].dtype) for i in reversed(range(ndim)): - if not util.equal_const_int(size - buf.strides[i], 0): + if not utils.equal_const_int(size - buf.strides[i], 0): raise RuntimeError( "Cannot prove compact: shape=%s, strides=%s" % (buf.shape, buf.strides) ) @@ -357,10 +357,10 @@ def _fold_buffer_dim(buf, scope, elem_block): x_size = 1 base = 0 for i in range(1, ndim + 1): - if not util.equal_const_int(buf.strides[ndim - i] - x_size, 0): + if not utils.equal_const_int(buf.strides[ndim - i] - x_size, 0): raise RuntimeError("scope %s needs to have block=%d" % (scope, elem_block)) x_size = x_size * buf.shape[ndim - i] - if util.equal_const_int(x_size - elem_block, 0): + if utils.equal_const_int(x_size - elem_block, 0): base = i + 1 break if base == 0: @@ -370,7 +370,7 @@ def _fold_buffer_dim(buf, scope, elem_block): shape = [elem_block] strides = [1] - if base < ndim + 1 and not util.equal_const_int(buf.strides[ndim - base], elem_block): + if base < ndim + 1 and not utils.equal_const_int(buf.strides[ndim - base], elem_block): shape.append(1) strides.append(elem_block) @@ -379,14 +379,14 @@ def _fold_buffer_dim(buf, scope, elem_block): x_size = 1 x_stride = buf.strides[ndim - base] next_base = base - if not util.equal_const_int(idxm(x_stride, elem_block), 0): + if not utils.equal_const_int(idxm(x_stride, elem_block), 0): raise RuntimeError( "scope %s need to have block=%d, shape=%s, strides=%s" % (scope, elem_block, buf.shape, buf.strides) ) for i in range(base, ndim + 1): k = ndim - i - if not util.equal_const_int(x_size * x_stride - buf.strides[k], 0): + if not utils.equal_const_int(x_size * x_stride - buf.strides[k], 0): break x_size = x_size * buf.shape[k] next_base = i + 1 @@ -404,7 +404,7 @@ def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold): if buf.dtype != dtype: raise RuntimeError("Expect buffer type to be %s instead of %s" % (dtype, buf.dtype)) shape, strides = buf.shape, buf.strides - if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0): + if not utils.equal_const_int(idxm(buf.elem_offset, elem_block), 0): raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block)) if allow_fold: shape, strides = _fold_buffer_dim(buf, scope, elem_block) @@ -425,10 +425,10 @@ def raise_error(): ndim = len(shape) # Check if the inner-tensor is already flat - flat = util.equal_const_int(shape[-1], elem_block) + flat = utils.equal_const_int(shape[-1], elem_block) if flat: - if not util.equal_const_int(strides[-1], 1): + if not utils.equal_const_int(strides[-1], 1): raise_error() if ndim == 1: @@ -436,7 +436,7 @@ def raise_error(): x_stride = 1 y_size = 1 return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - if not util.equal_const_int(strides[-2] - elem_block, 0): + if not utils.equal_const_int(strides[-2] - elem_block, 0): raise_error() if ndim == 2: @@ -444,7 +444,7 @@ def raise_error(): x_stride = shape[-2] y_size = 1 return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - if not util.equal_const_int(idxm(strides[-3], elem_block), 0): + if not 
utils.equal_const_int(idxm(strides[-3], elem_block), 0): raise_error() if ndim == 3: @@ -454,11 +454,11 @@ def raise_error(): return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) else: - if not util.equal_const_int(strides[-1], 1): + if not utils.equal_const_int(strides[-1], 1): raise_error() - if not util.equal_const_int(strides[-2] - shape[-1], 0): + if not utils.equal_const_int(strides[-2] - shape[-1], 0): raise_error() - if not util.equal_const_int(shape[-1] * shape[-2], elem_block): + if not utils.equal_const_int(shape[-1] * shape[-2], elem_block): raise_error() if ndim == 2: @@ -466,7 +466,7 @@ def raise_error(): x_stride = 1 y_size = 1 return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - if not util.equal_const_int(strides[-3], elem_block): + if not utils.equal_const_int(strides[-3], elem_block): raise_error() if ndim == 3: @@ -474,7 +474,7 @@ def raise_error(): x_stride = shape[-3] y_size = 1 return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - if not util.equal_const_int(idxm(strides[-4], elem_block), 0): + if not utils.equal_const_int(idxm(strides[-4], elem_block), 0): raise_error() if ndim == 4: @@ -556,9 +556,9 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): y_pad_after = pad_after[1] x_pad_after = pad_after[2] for dim in range(3, ndim): - if not util.equal_const_int(pad_before[dim], 0): + if not utils.equal_const_int(pad_before[dim], 0): raise ValueError("Do not support pad on the innermost block") - if not util.equal_const_int(pad_after[dim], 0): + if not utils.equal_const_int(pad_after[dim], 0): raise ValueError("Do not support pad on the innermost block") else: y_pad_before = pad_before[0] @@ -566,9 +566,9 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): y_pad_after = pad_after[0] x_pad_after = pad_after[1] for dim in range(2, ndim): - if not util.equal_const_int(pad_before[dim], 0): + if not utils.equal_const_int(pad_before[dim], 0): raise ValueError("Do not support pad on the innermost block") - if not util.equal_const_int(pad_after[dim], 0): + if not utils.equal_const_int(pad_after[dim], 0): raise ValueError("Do not support pad on the innermost block") allow_fold = False else: diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 6d6490448e78..04f430ef8624 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -28,7 +28,7 @@ from tvm import rpc, autotvm, relay from tvm.autotvm.measure.measure_methods import request_remote from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib import graph_runtime, util, download +from tvm.contrib import graph_runtime, utils, download from tvm.contrib.debugger import debug_runtime import vta from vta.testing import simulator @@ -318,7 +318,7 @@ def tune_tasks( ) # Export library - temp = util.tempdir() + temp = utils.tempdir() lib.save(temp.relpath("graphlib.o")) remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py index 432d6f7041ba..3ce2d9c9e4a9 100644 --- a/vta/tests/python/integration/test_benchmark_gemm.py +++ b/vta/tests/python/integration/test_benchmark_gemm.py @@ -18,7 +18,7 @@ import tvm.testing from tvm import te import numpy as np -from tvm.contrib import util +from tvm.contrib import utils import vta.testing from vta.testing import simulator @@ -61,7 +61,7 @@ def run_gemm_packed(env, remote, batch_size, channel, block): def 
verify(s, check_correctness=True): mod = vta.build(s, [data, weight, res], "ext_dev", env.target_host, name="gemm") - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("gemm.o")) remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 004cc6b9c7d3..cad560c208b6 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -28,7 +28,7 @@ from tvm import te from tvm import relay from tvm import autotvm -from tvm.contrib import util +from tvm.contrib import utils from tvm.contrib.pickle_memoize import memoize from tvm import topi import tvm.topi.testing @@ -224,13 +224,13 @@ def get_ref_data(): mod = tvm.build( s, [data, kernel, bias, res], target=target, target_host=env.target_host, name="conv2d" ) - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") ctx = remote.context(str(target)) - res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) + res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) bias_arr = tvm.nd.array(bias_np, ctx) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py index 23c4a5c78d90..f750225ed8f7 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py @@ -28,7 +28,7 @@ from tvm import te from tvm import relay from tvm import autotvm -from tvm.contrib import util +from tvm.contrib import utils from tvm.contrib.pickle_memoize import memoize from tvm import topi import tvm.topi.testing @@ -220,13 +220,13 @@ def get_ref_data(): target_host=env.target_host, name="conv2d_transpose", ) - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("conv2d_transpose.o")) remote.upload(temp.relpath("conv2d_transpose.o")) f = remote.load_module("conv2d_transpose.o") ctx = remote.context(str(target)) - res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) + res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) res_arr = tvm.nd.array(res_np, ctx) diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index 37cfac16f5d7..0b604108a35f 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -26,7 +26,7 @@ import tvm from tvm import te from tvm import autotvm -from tvm.contrib import util +from tvm.contrib import utils from tvm.contrib.pickle_memoize import memoize from tvm import topi import tvm.topi.testing @@ -131,13 +131,13 @@ def get_ref_data(): mod = tvm.build( s, [data, kernel, res], target=target, target_host=env.target_host, name="dense" ) - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("dense.o")) remote.upload(temp.relpath("dense.o")) f = remote.load_module("dense.o") ctx = remote.context(str(target)) - res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) + res_np = 
np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) res_arr = tvm.nd.array(res_np, ctx) diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py index 08d5e4b52555..da6ba5b8fb94 100644 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py @@ -28,7 +28,7 @@ from tvm import te from tvm import relay from tvm import autotvm -from tvm.contrib import util +from tvm.contrib import utils from tvm import topi import tvm.topi.testing import vta @@ -218,13 +218,13 @@ def get_ref_data(): mod = tvm.build( s, [data, kernel, bias, res], target=target, target_host=env.target_host, name="conv2d" ) - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") ctx = remote.context(str(target)) - res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) + res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) data_arr = tvm.nd.array(data_np, ctx) kernel_arr = tvm.nd.array(kernel_np, ctx) bias_arr = tvm.nd.array(bias_np, ctx) diff --git a/vta/tests/python/unittest/test_vta_insn.py b/vta/tests/python/unittest/test_vta_insn.py index fb0acf1d065f..b83510f4a9dc 100644 --- a/vta/tests/python/unittest/test_vta_insn.py +++ b/vta/tests/python/unittest/test_vta_insn.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import topi -from tvm.contrib import util +from tvm.contrib import utils import vta import vta.testing @@ -54,7 +54,7 @@ def _run(env, remote): if not remote: return - temp = util.tempdir() + temp = utils.tempdir() m.save(temp.relpath("load_act.o")) remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") @@ -125,7 +125,7 @@ def check_padded_load(pad_before, pad_after, test_name=None): if not remote: return - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("padded_load.o")) remote.upload(temp.relpath("padded_load.o")) f = remote.load_module("padded_load.o") @@ -209,7 +209,7 @@ def _run(env, remote): def verify(s, name=None): mod = vta.build(s, [x, w, y], "ext_dev", env.target_host) - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("gemm.o")) remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") @@ -371,7 +371,7 @@ def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): mod = vta.build(s, [a, res], "ext_dev", env.target_host) else: mod = vta.build(s, [a, b, res], "ext_dev", env.target_host) - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("load_act.o")) remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") @@ -454,7 +454,7 @@ def _run(env, remote): mod = vta.build(s, [a, res], "ext_dev", env.target_host) if not remote: return - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("load_act.o")) remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") @@ -516,7 +516,7 @@ def _run(env, remote): mod = vta.build(s, [a, res], "ext_dev", env.target_host) if not remote: return - temp = util.tempdir() + temp = utils.tempdir() mod.save(temp.relpath("load_act.o")) remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") diff --git a/vta/tutorials/autotvm/tune_relay_vta.py 
b/vta/tutorials/autotvm/tune_relay_vta.py index cb36040f2da4..7f0442402c57 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -62,7 +62,7 @@ import tvm from tvm import te from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, util, download +from tvm.contrib import graph_runtime, utils, download from tvm.autotvm.measure.measure_methods import request_remote from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner @@ -424,7 +424,7 @@ def tune_and_evaluate(tuning_opt): # Export library print("Upload...") - temp = util.tempdir() + temp = utils.tempdir() lib.save(temp.relpath("graphlib.o")) remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") @@ -507,4 +507,4 @@ def tune_and_evaluate(tuning_opt): # import logging # logging.getLogger('autotvm').setLevel(logging.DEBUG) # -# Finally, always feel free to ask our community for help on https://discuss.tvm.ai +# Finally, always feel free to ask our community for help on https://discuss.tvm.apache.org diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 582eb03adc56..1bf4161a3340 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -52,7 +52,7 @@ import tvm from tvm import te from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, util, download +from tvm.contrib import graph_runtime, utils, download from tvm.contrib.debugger import debug_runtime from tvm.relay import transform @@ -204,7 +204,7 @@ print(model + " inference graph built in {0:.2f}s!".format(build_time)) # Send the inference library over to the remote RPC server - temp = util.tempdir() + temp = utils.tempdir() lib.export_library(temp.relpath("graphlib.tar")) remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar") diff --git a/vta/tutorials/frontend/legacy/deploy_detection.py b/vta/tutorials/frontend/legacy/deploy_detection.py index f2c42c1fc1d8..7a4aba93146b 100644 --- a/vta/tutorials/frontend/legacy/deploy_detection.py +++ b/vta/tutorials/frontend/legacy/deploy_detection.py @@ -241,7 +241,7 @@ print(MODEL_NAME + " inference graph built in {0:.2f}s!".format(build_time)) # Send the inference library over to the remote RPC server - temp = util.tempdir() + temp = utils.tempdir() lib.export_library(temp.relpath("graphlib.tar")) remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar") diff --git a/vta/tutorials/matrix_multiply.py b/vta/tutorials/matrix_multiply.py index 71a8f672dbb4..60ea9ca275c1 100644 --- a/vta/tutorials/matrix_multiply.py +++ b/vta/tutorials/matrix_multiply.py @@ -40,7 +40,7 @@ import vta import numpy as np from tvm import rpc -from tvm.contrib import util +from tvm.contrib import utils from vta.testing import simulator # Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file @@ -389,7 +389,7 @@ my_gemm = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_gemm") # Write the compiled module into an object file. 
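Every hunk in this patch applies the same mechanical rename, tvm.contrib.util to tvm.contrib.utils, to one recurring idiom: save a built module into a temporary directory, upload it over RPC, and load it back on the device. For reference, a minimal sketch of that idiom after the rename (a sketch only; it assumes `mod` is an already built runtime module and `remote` an open RPC session):

from tvm.contrib import utils

temp = utils.tempdir()                 # host-side temporary directory
mod.save(temp.relpath("gemm.o"))       # write the compiled object file
remote.upload(temp.relpath("gemm.o"))  # ship it to the RPC device
f = remote.load_module("gemm.o")       # load it back as a callable module
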
-temp = util.tempdir() +temp = utils.tempdir() my_gemm.save(temp.relpath("gemm.o")) # Send the executable over RPC diff --git a/vta/tutorials/optimize/convolution_opt.py b/vta/tutorials/optimize/convolution_opt.py index 2888f34855f6..e991ffe18f79 100644 --- a/vta/tutorials/optimize/convolution_opt.py +++ b/vta/tutorials/optimize/convolution_opt.py @@ -45,7 +45,7 @@ import numpy as np from tvm import rpc -from tvm.contrib import util +from tvm.contrib import utils from vta.testing import simulator # Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file @@ -370,7 +370,7 @@ # Compile the TVM module my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv") -temp = util.tempdir() +temp = utils.tempdir() my_conv.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") diff --git a/vta/tutorials/optimize/matrix_multiply_opt.py b/vta/tutorials/optimize/matrix_multiply_opt.py index 8797c3edeffd..44552db21688 100644 --- a/vta/tutorials/optimize/matrix_multiply_opt.py +++ b/vta/tutorials/optimize/matrix_multiply_opt.py @@ -43,7 +43,7 @@ import vta import numpy as np from tvm import rpc -from tvm.contrib import util +from tvm.contrib import utils from vta.testing import simulator # Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file @@ -311,7 +311,7 @@ # Compile the TVM module my_gemm = vta.build(s, [data, weight, res], "ext_dev", env.target_host, name="my_gemm") -temp = util.tempdir() +temp = utils.tempdir() my_gemm.save(temp.relpath("gemm.o")) remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") diff --git a/vta/tutorials/vta_get_started.py b/vta/tutorials/vta_get_started.py index 8f37b2d10179..16d613581b57 100644 --- a/vta/tutorials/vta_get_started.py +++ b/vta/tutorials/vta_get_started.py @@ -67,7 +67,7 @@ # We'll need the TVM RPC module and the VTA simulator module from tvm import rpc -from tvm.contrib import util +from tvm.contrib import utils from vta.testing import simulator # We read the Pynq RPC host IP address and port number from the OS environment @@ -320,7 +320,7 @@ # execution. # Write the compiled module into an object file. 
-temp = util.tempdir() +temp = utils.tempdir() my_vadd.save(temp.relpath("vadd.o")) # Send the executable over RPC diff --git a/web/tests/python/webgpu_rpc_test.py b/web/tests/python/webgpu_rpc_test.py index 5de8cf8c72bf..5efc85cf5e32 100644 --- a/web/tests/python/webgpu_rpc_test.py +++ b/web/tests/python/webgpu_rpc_test.py @@ -23,7 +23,7 @@ import tvm from tvm import te from tvm import rpc -from tvm.contrib import util, emcc +from tvm.contrib import utils, emcc import numpy as np proxy_host = "localhost" @@ -50,7 +50,7 @@ def test_rpc(): s[B].bind(xo, te.thread_axis("blockIdx.x")) fadd = tvm.build(s, [A, B], target_device, target_host=target_host, name="addone") - temp = util.tempdir() + temp = utils.tempdir() wasm_path = temp.relpath("addone_gpu.wasm") fadd.export_library(wasm_path, emcc.create_tvmjs_wasm) diff --git a/web/tests/python/websock_rpc_test.py b/web/tests/python/websock_rpc_test.py index 6729964e11d2..48603e86b7f7 100644 --- a/web/tests/python/websock_rpc_test.py +++ b/web/tests/python/websock_rpc_test.py @@ -23,7 +23,7 @@ import tvm from tvm import te from tvm import rpc -from tvm.contrib import util, emcc +from tvm.contrib import utils, emcc import numpy as np proxy_host = "localhost" @@ -43,7 +43,7 @@ def test_rpc(): s = te.create_schedule(B.op) fadd = tvm.build(s, [A, B], target, name="addone") - temp = util.tempdir() + temp = utils.tempdir() wasm_path = temp.relpath("addone.wasm") fadd.export_library(wasm_path, emcc.create_tvmjs_wasm) From 1903b1d4514783a7c3ed936526e00c950f1ce647 Mon Sep 17 00:00:00 2001 From: alter-xp <72853260+alter-xp@users.noreply.github.com> Date: Fri, 30 Oct 2020 03:00:38 +0800 Subject: [PATCH 083/258] TF frontend: add expm1 op (#6783) * TF frontend: add expm1 op * TF frontend: add description for expm1 * TF frontend: use overload operator - instead of subtract * TF frontend: Limits the range of input data in the Expm1 test Co-authored-by: xup --- python/tvm/relay/frontend/tensorflow.py | 10 ++++++++++ tests/python/frontend/tensorflow/test_forward.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 218c6e57f995..1f5786f911cb 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -788,6 +788,15 @@ def _impl(inputs, attr, params, mod): return _impl +def _expm1(): + # op description: https://www.tensorflow.org/api_docs/python/tf/math/expm1 + def _impl(inputs, attr, params, mod): + exp_out = get_relay_op("exp")(inputs[0]) + return exp_out - tvm.relay.const(1.0) + + return _impl + + def _resize(method): def _impl(inputs, attr, params, mod): if attr["_output_shapes"][0] is not None: @@ -2297,6 +2306,7 @@ def _impl(inputs, attr, params, mod): "EuclideanNorm": _euclidean_norm(), "Exp": AttrCvt("exp"), "ExpandDims": _expand_dims(), + "Expm1": _expm1(), "Fill": _fill(), "Floor": AttrCvt("floor"), "FloorDiv": _floordiv(), diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index fc50d771c71d..a5c15c751b50 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -3504,6 +3504,22 @@ def test_forward_atan2(): compare_tf_with_tvm([np_data_1, np_data_2], ["in_data_1:0", "in_data_2:0"], "atan2:0") +def test_forward_expm1(): + """test operator expm1 """ + + def _test_forward_expm1(shape): + tf.disable_eager_execution() + np_data = np.random.uniform(1, 10, size=shape).astype(np.float32) + 
tf.reset_default_graph() + in_data = tf.placeholder(tf.float32, shape, name="in_data") + tf.expm1(in_data, name="expm1") + compare_tf_with_tvm([np_data], ["in_data:0"], "expm1:0") + + _test_forward_expm1([1, 100]) + _test_forward_expm1([1, 10, 10]) + _test_forward_expm1([2, 5, 2, 5]) + + def test_forward_negative(): """test tf operator Neg """ np_data = np.random.uniform(-100, 255, size=(224, 224, 3)).astype(np.float32) From 3e3963b449f7cb3ce526f486045ee1a139fec6ba Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Thu, 29 Oct 2020 20:55:11 +0000 Subject: [PATCH 084/258] [TVMC] use common function to obtain target from --target value on 'tvmc compile' (#6788) - This is solving a TODO item on tvmc --- python/tvm/driver/tvmc/common.py | 1 - python/tvm/driver/tvmc/compiler.py | 11 +---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py index a625a99f0e7e..9db22f3f3390 100644 --- a/python/tvm/driver/tvmc/common.py +++ b/python/tvm/driver/tvmc/common.py @@ -76,7 +76,6 @@ def convert_graph_layout(mod, desired_layout): ) -# TODO In a separate PR, eliminate the duplicated code here and in compiler.py (@leandron) def target_from_cli(target): """ Create a tvm.target.Target instance from a diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index eeb5d07fe051..e1a4a7481f6a 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -177,16 +177,7 @@ def compile_model( if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) - # Handle the case in which target is a path to a JSON file. - if os.path.exists(target): - with open(target) as target_file: - logger.info("using target input from file: %s", target) - target = "".join(target_file.readlines()) - - # TODO(@leandron) We don't have an API to collect a list of supported - # targets yet - logger.debug("creating target from input: %s", target) - tvm_target = tvm.target.Target(target) + tvm_target = common.target_from_cli(target) target_host = target_host or "" if tuning_records and os.path.exists(tuning_records): From 9bd17d3a3f94c9cb20a35fdeb3c302511fa76509 Mon Sep 17 00:00:00 2001 From: presburger Date: Fri, 30 Oct 2020 05:01:08 +0800 Subject: [PATCH 085/258] fix a bug in convertSSA. 
(#6785) --- src/tir/transforms/ir_utils.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc index 838194203b5b..033a2e093a2a 100644 --- a/src/tir/transforms/ir_utils.cc +++ b/src/tir/transforms/ir_utils.cc @@ -89,7 +89,7 @@ Stmt MergeNest(const std::vector>& nest, Stmt body) { class IRConvertSSA final : public StmtExprMutator { public: PrimExpr VisitExpr_(const VarNode* op) final { - if (scope_.count(op)) { + if (scope_.count(op) && !scope_[op].empty()) { return scope_[op].back(); } else { return GetRef(op); From b8ac76004973b7e6f4d9129735326a0363aa6c80 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 29 Oct 2020 14:14:57 -0700 Subject: [PATCH 086/258] [QNN] Optimize requantize for power of 2 and fix dequantize for per-channel quantized input (#6675) * [QNN] Optimize requantize for power of 2 and bug in dequantize * Comments * Docs * Comments * Ethos --- src/relay/qnn/op/dequantize.cc | 4 +- src/target/intrin_rule.cc | 93 +++++++++++++------- tests/python/relay/test_op_qnn_dequantize.py | 18 ++++ tests/python/relay/test_op_qnn_requantize.py | 43 +++++++++ 4 files changed, 125 insertions(+), 33 deletions(-) diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc index 2e7a28624e26..2fe075c7e64b 100644 --- a/src/relay/qnn/op/dequantize.cc +++ b/src/relay/qnn/op/dequantize.cc @@ -96,8 +96,8 @@ Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale, expanded_input_zero_point = ExpandBiasToMatchAxis(input_zero_point, n_dim, {axis}); } - auto shift = Subtract(Cast(input_tensor, DataType::Int(32)), input_zero_point); - auto scaled_output = Multiply(Cast(shift, DataType::Float(32)), input_scale); + auto shift = Subtract(Cast(input_tensor, DataType::Int(32)), expanded_input_zero_point); + auto scaled_output = Multiply(Cast(shift, DataType::Float(32)), expanded_input_scale); return scaled_output; } diff --git a/src/target/intrin_rule.cc b/src/target/intrin_rule.cc index 0808d237fc28..f8f4d0ef5414 100644 --- a/src/target/intrin_rule.cc +++ b/src/target/intrin_rule.cc @@ -128,37 +128,68 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.q_multiply_shift") PrimExpr q = call->args[2]; PrimExpr s = call->args[3]; - // Only int32 types are supported (any number of lanes is allowed) - ICHECK(y.dtype().code() == DLDataTypeCode::kDLInt && y.dtype().bits() == 32); - ICHECK(s.dtype().code() == DLDataTypeCode::kDLInt && s.dtype().bits() == 32); - - DataType hp_dtype = DataType::Int(64, x.dtype().lanes()); - DataType lp_dtype = DataType::Int(32, x.dtype().lanes()); - - // 1) Calculating the integer multiplier and integer shift - PrimExpr zero = make_const(s.dtype(), 0); - PrimExpr left_shift = tir::Select(s > zero, s, zero); - PrimExpr right_shift = tir::Select(s > zero, zero, -s); - - // 2) Cast and Multiply the integer multiplier - PrimExpr one = make_const(hp_dtype, 1); - x = cast(hp_dtype, x); - y = cast(hp_dtype, y); - x = tir::Select(left_shift != zero, x << left_shift, x); - - // 3) Perform the multiplication in higher precision. - x = x * y; - - // 4) Find the rounding scalar - PrimExpr total_right_shift = right_shift + q; - PrimExpr pos_rounding_value = (one << (total_right_shift - 1)); - x = x + pos_rounding_value; - - // 5) Simply right shift the result to get the final output. - x = x >> total_right_shift; - - // 6) The fixed point multiplication keeps the value in int32 range. Casting back to int32. 
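The dequantize fix earlier in this patch only changes behavior for per-channel quantization: the lowering previously multiplied by the scalar inputs even after computing the expanded, per-axis scale and zero point. A rough numpy equivalent of the corrected lowering, using the values from the new test_channelwise_axis_0 case below (axis=0, so scale and zero point broadcast along the first dimension):

import numpy as np

data = np.array([[0, 1, 2, 3, 4], [243, 247, 249, 250, 251]], dtype="uint8")
scale = np.array([0.5, 0.25], dtype="float32").reshape(2, 1)    # expanded along axis 0
zero_point = np.array([127, 123], dtype="int32").reshape(2, 1)

# shift = Subtract(Cast(input, int32), expanded_input_zero_point)
# out   = Multiply(Cast(shift, float32), expanded_input_scale)
out = (data.astype("int32") - zero_point).astype("float32") * scale
# rows: [-63.5, -63, -62.5, -62, -61.5] and [30, 31, 31.5, 31.75, 32]
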
- *rv = cast(lp_dtype, x); + // Lambda function to extract the int value from PrimExpr + auto get_int_value = [](const PrimExpr node) { + if (auto int_node = node.as()) { + return int_node->value; + } + auto broadcast_node = node.as(); + CHECK(broadcast_node != nullptr); + auto int_node = broadcast_node->value.as(); + CHECK(int_node != nullptr); + return int_node->value; + }; + // Power of 2 is determined by the fixed_point_multiplier == 1 << 30. In case of power of 2, + // fixed point multiplier will represent a float value of 0.5. In fixed point, this is + // represented by 1 << 30. + if (get_int_value(y) == (1 << 30)) { + PrimExpr exp = s - 1; + int exp_val = get_int_value(s) - 1; + if (exp_val > 0) { + // power of 2 is greater than 0, apply left shift. + *rv = x << exp; + } else { + // power of 2 is less than 0, round and then apply right shift. + DataType lp_dtype = DataType::Int(32, x.dtype().lanes()); + PrimExpr one = make_const(lp_dtype, 1); + exp = -exp; + PrimExpr rounding_factor = one << (exp - 1); + PrimExpr rounded_t = x + rounding_factor; + *rv = rounded_t >> exp; + } + } else { + // Only int32 types are supported (any number of lanes is allowed) + ICHECK(y.dtype().code() == DLDataTypeCode::kDLInt && y.dtype().bits() == 32); + ICHECK(s.dtype().code() == DLDataTypeCode::kDLInt && s.dtype().bits() == 32); + + DataType hp_dtype = DataType::Int(64, x.dtype().lanes()); + DataType lp_dtype = DataType::Int(32, x.dtype().lanes()); + + // 1) Calculating the integer multiplier and integer shift + PrimExpr zero = make_const(s.dtype(), 0); + PrimExpr left_shift = tir::Select(s > zero, s, zero); + PrimExpr right_shift = tir::Select(s > zero, zero, -s); + + // 2) Cast and Multiply the integer multiplier + PrimExpr one = make_const(hp_dtype, 1); + x = cast(hp_dtype, x); + y = cast(hp_dtype, y); + x = tir::Select(left_shift != zero, x << left_shift, x); + + // 3) Perform the multiplication in higher precision. + x = x * y; + + // 4) Find the rounding scalar + PrimExpr total_right_shift = right_shift + q; + PrimExpr pos_rounding_value = (one << (total_right_shift - 1)); + x = x + pos_rounding_value; + + // 5) Simply right shift the result to get the final output. + x = x >> total_right_shift; + + // 6) The fixed point multiplication keeps the value in int32 range. Casting back to int32. 
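In plain terms: a fixed-point multiplier of 1 << 30 encodes the float value 0.5, so combined with the shift s the whole multiply-and-shift collapses to a rounded shift by 2**(s - 1). A small self-contained check of that equivalence (a sketch, not TVM code; only the s <= 0 path with q = 31 is modelled):

# Generic fixed-point path: multiply wide, add the rounding bit, shift right.
def generic(x, multiplier, s, q=31):
    total_right_shift = q - s                 # s <= 0, so -s is the extra right shift
    return (x * multiplier + (1 << (total_right_shift - 1))) >> total_right_shift

# Power-of-2 shortcut from the patch: round, then shift right by -(s - 1).
def power_of_two(x, s):
    exp = -(s - 1)
    return (x + (1 << (exp - 1))) >> exp

for x in range(-1024, 1025):
    assert generic(x, 1 << 30, s=-1) == power_of_two(x, s=-1)
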
+ *rv = cast(lp_dtype, x); + } }); } // namespace intrin diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py index 6598e2bb2062..e1416622c236 100644 --- a/tests/python/relay/test_op_qnn_dequantize.py +++ b/tests/python/relay/test_op_qnn_dequantize.py @@ -101,8 +101,26 @@ def test_channelwise_axis_1(): ) +def test_channelwise_axis_0(): + data = np.array([0, 1, 2, 3, 4, 243, 247, 249, 250, 251]).astype("uint8").reshape((2, 5)) + output = ( + np.array([-63.5, -63, -62.5, -62, -61.5, 30, 31, 31.5, 31.75, 32]) + .astype("float32") + .reshape((2, 5)) + ) + quant_args = { + "in_zero_point": np.array([127, 123]).astype("int32"), + "in_scale": np.array([0.5, 0.25]).astype("float32"), + } + + dequantize_test_driver( + in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=0 + ) + + if __name__ == "__main__": test_uint8_to_float32() test_int8_to_float32() test_int32_to_float32() test_channelwise_axis_1() + test_channelwise_axis_0() diff --git a/tests/python/relay/test_op_qnn_requantize.py b/tests/python/relay/test_op_qnn_requantize.py index f152a4ebf840..f40a08711451 100644 --- a/tests/python/relay/test_op_qnn_requantize.py +++ b/tests/python/relay/test_op_qnn_requantize.py @@ -204,6 +204,48 @@ def test_upscale(): verify(mod, (golden_data, golden_output)) +def test_non_power_of_two(): + for rounding in roundings: + mod = get_mod( + data_shape=(32,), + data_dtype="int32", + out_dtype="int8", + input_scale=1, + output_scale=3, + rounding=rounding, + ) + + # Try positive values + golden_data = np.multiply(np.arange(0, 32, 1).astype("int32"), 3) + golden_output = np.arange(0, 32, 1) + verify(mod, (golden_data, golden_output)) + + # Try negative values + golden_data = np.multiply(np.arange(0, -32, -1).astype("int32"), 3) + golden_output = np.arange(0, -32, -1) + verify(mod, (golden_data, golden_output)) + + # Try a different scale + mod = get_mod( + data_shape=(32,), + data_dtype="int32", + out_dtype="int8", + input_scale=3, + output_scale=1, + rounding=rounding, + ) + + # Try positive values + golden_data = np.arange(0, 32, 1).astype("int32") + golden_output = np.multiply(golden_data, 3) + verify(mod, (golden_data, golden_output)) + + # Try negative values + golden_data = np.arange(0, -32, -1).astype("int32") + golden_output = np.multiply(golden_data, 3) + verify(mod, (golden_data, golden_output)) + + def test_saturation(): for rounding in roundings: mod = get_mod( @@ -397,6 +439,7 @@ def test_per_channel_different_scale(): test_same_scale() test_downscale() test_upscale() + test_non_power_of_two() test_saturation() test_zero_point() test_per_channel_same_scale() From 33c3688c5cf36d6550ce4e1cbec3c15bf33b0dc9 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 29 Oct 2020 14:24:03 -0700 Subject: [PATCH 087/258] [FIX,AUTOSCHEDULER] Fix auto_scheduler to run with multiprocessing's spawn start method (#6671) * Fix multiprocessing with spawn issues * address reviewer feedback * Fix tutorials * formatting * undo autotvm work * Undo tutorial changes * Add spawn tests * fix test --- python/tvm/auto_scheduler/measure.py | 545 ++++++++++-------- python/tvm/auto_scheduler/measure_record.py | 39 +- python/tvm/auto_scheduler/utils.py | 42 +- .../tvm/auto_scheduler/workload_registry.py | 35 ++ python/tvm/testing.py | 17 + src/auto_scheduler/measure_record.cc | 75 ++- .../unittest/test_auto_scheduler_measure.py | 21 +- .../test_auto_scheduler_search_policy.py | 19 +- .../test_auto_scheduler_task_scheduler.py | 14 + 
.../unittest/test_autotvm_dispatch_context.py | 11 +- tests/python/unittest/test_runtime_rpc.py | 143 ++--- 11 files changed, 574 insertions(+), 387 deletions(-) mode change 100755 => 100644 src/auto_scheduler/measure_record.cc diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 8a8b92201d15..9f592550cda8 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -51,19 +51,51 @@ from .loop_state import StateObject from .utils import ( get_const_tuple, - NoDaemonPool, call_func_with_timeout, request_remote, check_remote, ) +from .compute_dag import ComputeDAG +from .search_task import SearchTask +from .workload_registry import workload_func_name, get_workload_func # The maximum length of error message MAX_ERROR_MSG_LEN = 512 -# We use fork and a global variable to copy arguments between processes. -# This can avoid expensive serialization of TVM IR when using multiprocessing.Pool -GLOBAL_BUILD_ARGUMENTS = None -GLOBAL_RUN_ARGUMENTS = None + +def recover_measure_input(inp, rebuild_state=False): + """ + Recover a deserialized MeasureInput by rebuilding the missing fields. + 1. Rebuid the compute_dag in inp.task + 2. (Optional) Rebuild the stages in inp.state + + Parameters + ---------- + inp: MeasureInput + The deserialized MeasureInput + rebuild_state: bool = False + Whether rebuild the stages in MeasureInput.State + + Returns + ------- + new_input: MeasureInput + The fully recovered MeasureInput with all fields rebuilt. + """ + task = inp.task + new_task = SearchTask( + ComputeDAG(task.workload_key), + task.workload_key, + task.target, + task.target_host, + task.hardware_params, + ) + + if rebuild_state: + new_state = new_task.compute_dag.infer_bound_from_state(inp.state) + else: + new_state = inp.state + + return MeasureInput(new_task, new_state) @tvm._ffi.register_object("auto_scheduler.MeasureCallback") @@ -87,6 +119,31 @@ def __init__(self, task, state): state = state if isinstance(state, StateObject) else state.state_object self.__init_handle_by_constructor__(_ffi_api.MeasureInput, task, state) + def serialize(self): + """Custom serialization to workaround MeasureInput not exposing all its + members to the TVM ffi interface. + + Note that we do not implement __getstate__ as it does not seem to work + with initialization of the workload registry (maybe because of + initialization order?). + """ + serialize = tvm.get_global_func("auto_scheduler.SerializeMeasureInput", True) + assert serialize + # We serialize the workload function so that it can be used on the deserialized side. 
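In use, the pair gives a plain, picklable round trip (a usage sketch, assuming `inp` is an existing MeasureInput):

packed = inp.serialize()                      # dict: JSON string, workload name, workload function
# ... `packed` can now cross a spawned-process boundary via pickle ...
restored = MeasureInput.deserialize(packed)   # re-registers the workload, then rebuilds the input
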
+ return { + "measureinput": serialize(self), + "name": workload_func_name(self.task.workload_key), + "func": get_workload_func(self.task), + } + + @staticmethod + def deserialize(state): + deserialize = tvm.get_global_func("auto_scheduler.DeserializeMeasureInput", True) + assert deserialize + tvm.auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY[state["name"]] = state["func"] + x = deserialize(state["measureinput"]) + return recover_measure_input(x) + @tvm._ffi.register_object("auto_scheduler.BuildResult") class BuildResult(Object): @@ -486,29 +543,63 @@ def make_error_msg(): return error_msg -def local_build_worker(index): +def _timed_func(inp_serialized, build_func, verbose): + tic = time.time() + inp = MeasureInput.deserialize(inp_serialized) + task = inp.task + + error_no = MeasureErrorNo.NO_ERROR + error_msg = None + args = [] + + try: + sch, args = task.compute_dag.apply_steps_from_state(inp.state, layout_rewrite=True) + # pylint: disable=broad-except + except Exception: + error_no = MeasureErrorNo.INSTANTIATION_ERROR + error_msg = make_error_msg() + + if error_no == 0: + dirname = tempfile.mkdtemp() + filename = os.path.join(dirname, "tmp_func." + build_func.output_format) + + try: + # TODO(merrymercy): Port the unroll pass. + with transform.PassContext(): + func = build_module.build( + sch, args, target=task.target, target_host=task.target_host + ) + func.export_library(filename, build_func) + # pylint: disable=broad-except + except Exception: + error_no = MeasureErrorNo.COMPILE_HOST + error_msg = make_error_msg() + else: + filename = "" + + if verbose >= 1: + if error_no == MeasureErrorNo.NO_ERROR: + print(".", end="") + else: + print(".E", end="") # Build error + return filename, args, error_no, error_msg, time.time() - tic + + +def local_build_worker(args): """ Build function of LocalBuilder to be ran in the Builder thread pool. Parameters ---------- - index : int - The MeasureInput index to be processed by the current Builder thread. + args: Tuple[MeasureInput, str, int, int] + inputs, build-func, time, verbose args passed to local_builder_build Returns ------- res : BuildResult The build result of this Builder thread. """ - global GLOBAL_BUILD_ARGUMENTS - - # We use fork and a global variable to copy arguments between processes. - # This can avoid expensive serialization of TVM IR when using multiprocessing.Pool - if not GLOBAL_BUILD_ARGUMENTS: - raise ValueError("GLOBAL_BUILD_ARGUMENTS not found") - measure_inputs, build_func, timeout, verbose = GLOBAL_BUILD_ARGUMENTS - assert isinstance(build_func, str) - + inp, build_func, timeout, verbose = args if build_func == "default": build_func = tar.tar elif build_func == "ndk": @@ -516,48 +607,7 @@ def local_build_worker(index): else: raise ValueError("Invalid build_func" + build_func) - def timed_func(): - tic = time.time() - inp = measure_inputs[index] - task = inp.task - - error_no = MeasureErrorNo.NO_ERROR - error_msg = None - args = [] - - try: - sch, args = task.compute_dag.apply_steps_from_state(inp.state, layout_rewrite=True) - # pylint: disable=broad-except - except Exception: - error_no = MeasureErrorNo.INSTANTIATION_ERROR - error_msg = make_error_msg() - - if error_no == 0: - dirname = tempfile.mkdtemp() - filename = os.path.join(dirname, "tmp_func." + build_func.output_format) - - try: - # TODO(merrymercy): Port the unroll pass. 
- with transform.PassContext(): - func = build_module.build( - sch, args, target=task.target, target_host=task.target_host - ) - func.export_library(filename, build_func) - # pylint: disable=broad-except - except Exception: - error_no = MeasureErrorNo.COMPILE_HOST - error_msg = make_error_msg() - else: - filename = "" - - if verbose >= 1: - if error_no == MeasureErrorNo.NO_ERROR: - print(".", end="") - else: - print(".E", end="") # Build error - return filename, args, error_no, error_msg, time.time() - tic - - res = call_func_with_timeout(timeout, timed_func) + res = call_func_with_timeout(timeout, _timed_func, args=(inp, build_func, verbose)) if isinstance(res, TimeoutError): if verbose >= 1: print(".T", end="") # Build timeout @@ -590,14 +640,20 @@ def local_builder_build(inputs, timeout, n_parallel, build_func="default", verbo res : List[BuildResult] The build results of these MeasureInputs. """ - # We use fork and a global variable to copy arguments between processes. - # This can avoid expensive serialization of TVM IR when using multiprocessing.Pool - global GLOBAL_BUILD_ARGUMENTS - - GLOBAL_BUILD_ARGUMENTS = (inputs, build_func, timeout, verbose) - - pool = NoDaemonPool(n_parallel) - tuple_res = pool.map(local_build_worker, range(len(inputs))) + # This pool is not doing computationally intensive work, so we can use threads + pool = multiprocessing.pool.ThreadPool(n_parallel) + tuple_res = pool.map( + local_build_worker, + [ + ( + i.serialize(), + build_func, + timeout, + verbose, + ) + for i in inputs + ], + ) pool.terminate() pool.join() del pool @@ -609,6 +665,70 @@ def local_builder_build(inputs, timeout, n_parallel, build_func="default", verbo return results +def _timed_eval_func( + inp_serialized, + build_res, + number, + repeat, + min_repeat_ms, + cooldown_interval, + enable_cpu_cache_flush, + verbose, +): + inp = MeasureInput.deserialize(inp_serialized) + tic = time.time() + error_no = 0 + error_msg = None + try: + func = module.load_module(build_res.filename) + ctx = ndarray.context(str(inp.task.target), 0) + # Limitation: + # We can not get PackFunction directly in the remote mode as it is wrapped + # under the std::function. We could lift the restriction later once we fold + # the PackedFunc as an object. Currently, we pass function name to work + # around it. 
+ f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" + time_f = func.time_evaluator( + func.entry_name, + ctx, + number=number, + repeat=repeat, + min_repeat_ms=min_repeat_ms, + f_preproc=f_prepare, + ) + # pylint: disable=broad-except + except Exception: + costs = (max_float,) + error_no = MeasureErrorNo.COMPILE_DEVICE + error_msg = make_error_msg() + + if error_no == 0: + try: + args = [ndarray.empty(get_const_tuple(x.shape), x.dtype, ctx) for x in build_res.args] + random_fill = tvm.get_global_func("tvm.contrib.random.random_fill", True) + assert random_fill, "Please make sure USE_RANDOM is ON in the config.cmake" + for arg in args: + random_fill(arg) + ctx.sync() + costs = time_f(*args).results + # pylint: disable=broad-except + except Exception: + costs = (max_float,) + error_no = MeasureErrorNo.RUNTIME_DEVICE + error_msg = make_error_msg() + + shutil.rmtree(os.path.dirname(build_res.filename)) + toc = time.time() + time.sleep(cooldown_interval) + + if verbose >= 1: + if error_no == MeasureErrorNo.NO_ERROR: + print("*", end="") + else: + print("*E", end="") # Run error + return costs, error_no, error_msg, toc - tic + build_res.time_cost, toc + + @tvm._ffi.register_func("auto_scheduler.local_runner.run") def local_run( inputs, @@ -667,61 +787,6 @@ def local_run( """ max_float = 1e10 # We use 1e10 instead of sys.float_info.max for better readability in log - def timed_func(inp, build_res): - tic = time.time() - error_no = 0 - error_msg = None - try: - func = module.load_module(build_res.filename) - ctx = ndarray.context(str(inp.task.target), 0) - # Limitation: - # We can not get PackFunction directly in the remote mode as it is wrapped - # under the std::function. We could lift the restriction later once we fold - # the PackedFunc as an object. Currently, we pass function name to work - # around it. 
- f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" - time_f = func.time_evaluator( - func.entry_name, - ctx, - number=number, - repeat=repeat, - min_repeat_ms=min_repeat_ms, - f_preproc=f_prepare, - ) - # pylint: disable=broad-except - except Exception: - costs = (max_float,) - error_no = MeasureErrorNo.COMPILE_DEVICE - error_msg = make_error_msg() - - if error_no == 0: - try: - args = [ - ndarray.empty(get_const_tuple(x.shape), x.dtype, ctx) for x in build_res.args - ] - random_fill = tvm.get_global_func("tvm.contrib.random.random_fill", True) - assert random_fill, "Please make sure USE_RANDOM is ON in the config.cmake" - for arg in args: - random_fill(arg) - ctx.sync() - costs = time_f(*args).results - # pylint: disable=broad-except - except Exception: - costs = (max_float,) - error_no = MeasureErrorNo.RUNTIME_DEVICE - error_msg = make_error_msg() - - shutil.rmtree(os.path.dirname(build_res.filename)) - toc = time.time() - time.sleep(cooldown_interval) - - if verbose >= 1: - if error_no == MeasureErrorNo.NO_ERROR: - print("*", end="") - else: - print("*E", end="") # Run error - return costs, error_no, error_msg, toc - tic + build_res.time_cost, toc - measure_results = [] assert len(inputs) == len(build_results), "Measure input size should be equal to build results" for inp, build_res in zip(inputs, build_results): @@ -734,7 +799,20 @@ def timed_func(inp, build_res): time.time(), ) else: - res = call_func_with_timeout(timeout, timed_func, args=(inp, build_res)) + res = call_func_with_timeout( + timeout, + _timed_eval_func, + args=( + inp.serialize(), + build_res, + number, + repeat, + min_repeat_ms, + cooldown_interval, + enable_cpu_cache_flush, + verbose, + ), + ) if isinstance(res, TimeoutError): if verbose >= 1: print("*T", end="") # Run timeout @@ -753,40 +831,104 @@ def timed_func(inp, build_res): return measure_results -def rpc_run_worker(index): +def _timed_rpc_run( + inp_serialized, + build_res, + key, + host, + port, + priority, + timeout, + number, + repeat, + min_repeat_ms, + cooldown_interval, + enable_cpu_cache_flush, + verbose, +): + inp = MeasureInput.deserialize(inp_serialized) + tic = time.time() + error_no = 0 + error_msg = None + try: + # upload built module + remote = request_remote(key, host, port, priority, timeout) + remote.upload(build_res.filename) + func = remote.load_module(os.path.split(build_res.filename)[1]) + ctx = remote.context(str(inp.task.target), 0) + # Limitation: + # We can not get PackFunction directly in the remote mode as it is wrapped + # under the std::function. We could lift the restriction later once we fold + # the PackedFunc as an object. Currently, we pass function name to work + # around it. 
+ f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" + time_f = func.time_evaluator( + func.entry_name, + ctx, + number=number, + repeat=repeat, + min_repeat_ms=min_repeat_ms, + f_preproc=f_prepare, + ) + # pylint: disable=broad-except + except Exception: + costs = (max_float,) + error_no = MeasureErrorNo.COMPILE_DEVICE + error_msg = make_error_msg() + + if error_no == 0: + try: + args = [ndarray.empty(get_const_tuple(x.shape), x.dtype, ctx) for x in build_res.args] + try: + random_fill = remote.get_function("tvm.contrib.random.random_fill") + except AttributeError: + raise AttributeError( + "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" + ) + for arg in args: + random_fill(arg) + ctx.sync() + + costs = time_f(*args).results + # clean up remote files + remote.remove(build_res.filename) + remote.remove(os.path.splitext(build_res.filename)[0] + ".so") + remote.remove("") + # pylint: disable=broad-except + except Exception: + costs = (max_float,) + error_no = MeasureErrorNo.RUNTIME_DEVICE + error_msg = make_error_msg() + + shutil.rmtree(os.path.dirname(build_res.filename)) + toc = time.time() + + time.sleep(cooldown_interval) + if verbose >= 1: + if error_no == MeasureErrorNo.NO_ERROR: + print("*", end="") + else: + print("*E", end="") # Run error + + return costs, error_no, error_msg, toc - tic + build_res.time_cost, toc + + +def _rpc_run_worker(args): """Function to be ran in the RPCRunner thread pool. Parameters ---------- - index : int - The MeasureInput and BuildResult index to be processed by the current Runner thread. + args : Tuple[MeasureInput, BuildResult, ...] + Single input and build result plus the rest of the arguments to `rpc_runner_run`. Returns ------- res : MeasureResult The measure result of this Runner thread. """ - global GLOBAL_RUN_ARGUMENTS - ( - inputs, - build_results, - key, - host, - port, - priority, - timeout, - number, - repeat, - min_repeat_ms, - cooldown_interval, - enable_cpu_cache_flush, - verbose, - ) = GLOBAL_RUN_ARGUMENTS - max_float = 1e10 # We use 1e10 instead of sys.float_info.max for better readability in log - inp = inputs[index] - build_res = build_results[index] + _, build_res, _, _, _, _, timeout, _, _, _, _, _, verbose = args if build_res.error_no != MeasureErrorNo.NO_ERROR: return ( (max_float,), @@ -796,76 +938,7 @@ def rpc_run_worker(index): time.time(), ) - def timed_func(): - tic = time.time() - error_no = 0 - error_msg = None - try: - # upload built module - remote = request_remote(key, host, port, priority, timeout) - remote.upload(build_res.filename) - func = remote.load_module(os.path.split(build_res.filename)[1]) - ctx = remote.context(str(inp.task.target), 0) - # Limitation: - # We can not get PackFunction directly in the remote mode as it is wrapped - # under the std::function. We could lift the restriction later once we fold - # the PackedFunc as an object. Currently, we pass function name to work - # around it. 
- f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" - time_f = func.time_evaluator( - func.entry_name, - ctx, - number=number, - repeat=repeat, - min_repeat_ms=min_repeat_ms, - f_preproc=f_prepare, - ) - # pylint: disable=broad-except - except Exception: - costs = (max_float,) - error_no = MeasureErrorNo.COMPILE_DEVICE - error_msg = make_error_msg() - - if error_no == 0: - try: - args = [ - ndarray.empty(get_const_tuple(x.shape), x.dtype, ctx) for x in build_res.args - ] - try: - random_fill = remote.get_function("tvm.contrib.random.random_fill") - except AttributeError: - raise AttributeError( - "Please make sure USE_RANDOM is ON in the config.cmake " - "on the remote devices" - ) - for arg in args: - random_fill(arg) - ctx.sync() - - costs = time_f(*args).results - # clean up remote files - remote.remove(build_res.filename) - remote.remove(os.path.splitext(build_res.filename)[0] + ".so") - remote.remove("") - # pylint: disable=broad-except - except Exception: - costs = (max_float,) - error_no = MeasureErrorNo.RUNTIME_DEVICE - error_msg = make_error_msg() - - shutil.rmtree(os.path.dirname(build_res.filename)) - toc = time.time() - - time.sleep(cooldown_interval) - if verbose >= 1: - if error_no == MeasureErrorNo.NO_ERROR: - print("*", end="") - else: - print("*E", end="") # Run error - - return costs, error_no, error_msg, toc - tic + build_res.time_cost, toc - - res = call_func_with_timeout(timeout, timed_func) + res = call_func_with_timeout(timeout, _timed_rpc_run, args=args) if isinstance(res, TimeoutError): if verbose >= 1: @@ -950,26 +1023,30 @@ def rpc_runner_run( res : List[MeasureResult] The measure results of these MeasureInputs. """ - global GLOBAL_RUN_ARGUMENTS - GLOBAL_RUN_ARGUMENTS = ( - inputs, - build_results, - key, - host, - port, - priority, - timeout, - number, - repeat, - min_repeat_ms, - cooldown_interval, - enable_cpu_cache_flush, - verbose, - ) - assert len(inputs) == len(build_results), "Measure input size should be equal to build results" - pool = NoDaemonPool(n_parallel) - tuple_res = pool.map(rpc_run_worker, range(len(build_results))) + # This pool is not doing computationally intensive work, so we can use threads + pool = multiprocessing.pool.ThreadPool(n_parallel) + tuple_res = pool.map( + _rpc_run_worker, + [ + ( + inp.serialize(), + build_res, + key, + host, + port, + priority, + timeout, + number, + repeat, + min_repeat_ms, + cooldown_interval, + enable_cpu_cache_flush, + verbose, + ) + for inp, build_res in zip(inputs, build_results) + ], + ) pool.terminate() pool.join() del pool diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py index 1d0d7650a0f6..f0d930e3257e 100644 --- a/python/tvm/auto_scheduler/measure_record.py +++ b/python/tvm/auto_scheduler/measure_record.py @@ -21,9 +21,7 @@ import tvm._ffi from tvm.runtime import Object -from .compute_dag import ComputeDAG -from .measure import MeasureErrorNo, MeasureInput, MeasureCallback -from .search_task import SearchTask +from .measure import MeasureErrorNo, MeasureCallback from . import _ffi_api @@ -175,38 +173,3 @@ def load_best(filename, workload_key=None, target=None): best_res = res return best_inp, best_res - - -def recover_measure_input(inp, rebuild_state=False): - """ - Recover a deserialized MeasureInput by rebuilding the missing fields. - 1. Rebuid the compute_dag in inp.task - 2. 
(Optional) Rebuild the stages in inp.state - - Parameters - ---------- - inp: MeasureInput - The deserialized MeasureInput - rebuild_state: bool = False - Whether rebuild the stages in MeasureInput.State - - Returns - ------- - new_input: MeasureInput - The fully recovered MeasureInput with all fields rebuilt. - """ - task = inp.task - new_task = SearchTask( - ComputeDAG(task.workload_key), - task.workload_key, - task.target, - task.target_host, - task.hardware_params, - ) - - if rebuild_state: - new_state = new_task.compute_dag.infer_bound_from_state(inp.state) - else: - new_state = inp.state - - return MeasureInput(new_task, new_state) diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py index 75fec9c891e8..2d0ec3efd75d 100644 --- a/python/tvm/auto_scheduler/utils.py +++ b/python/tvm/auto_scheduler/utils.py @@ -129,32 +129,6 @@ def deserialize_args(args): return ret -class NoDaemonProcess(multiprocessing.Process): - @property - def daemon(self): - return False - - @daemon.setter - def daemon(self, value): - pass - - -class NoDaemonContext(type(multiprocessing.get_context())): - Process = NoDaemonProcess - - -class NoDaemonPool(multiprocessing.pool.Pool): - """A no daemon pool version of multiprocessing.Pool. - This allows us to start new processes inside the worker function""" - - def __init__(self, *args, **kwargs): - kwargs["context"] = NoDaemonContext() - super().__init__(*args, **kwargs) - - def __reduce__(self): - pass - - def kill_child_processes(parent_pid, sig=signal.SIGTERM): """kill all child processes recursively""" try: @@ -169,17 +143,19 @@ def kill_child_processes(parent_pid, sig=signal.SIGTERM): return +def _func_wrapper(que, func, args, kwargs): + """Call function and return the result over the queue.""" + if kwargs: + que.put(func(*args, **kwargs)) + else: + que.put(func(*args)) + + def call_func_with_timeout(timeout, func, args=(), kwargs=None): """Call a function with timeout""" - def func_wrapper(que): - if kwargs: - que.put(func(*args, **kwargs)) - else: - que.put(func(*args)) - que = multiprocessing.Queue(2) - process = multiprocessing.Process(target=func_wrapper, args=(que,)) + process = multiprocessing.Process(target=_func_wrapper, args=(que, func, args, kwargs)) process.start() process.join(timeout) diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py index 1d9ee6da4f7a..c2d7f90771e3 100644 --- a/python/tvm/auto_scheduler/workload_registry.py +++ b/python/tvm/auto_scheduler/workload_registry.py @@ -175,6 +175,41 @@ def workload_key_to_tensors(workload_key): return lookup(*args) +def get_workload_func(task): + """Get the workload function for a given task + + Parameters + ---------- + task : SearchTask + Task to get workload of. + + Returns + ------- + workload : callable + The registered workload function. + """ + name = workload_func_name(task.workload_key) + lookup = WORKLOAD_FUNC_REGISTRY[name] + assert callable(lookup) + return lookup + + +def workload_func_name(workload_key): + """Decode a workload key to the registered function name. + + Parameters + ---------- + workload_key : str + The input workload key. + + Returns + ------- + name : str + The function name of this workload key. + """ + return decode_workload_key_to_func_args(workload_key)[0] + + def save_workload_func_registry(filename): """Dump workload function registry to a pickle binary file. 
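The constraint driving this whole patch is that the spawn start method pickles everything it hands to a child process, and functions defined inside another function do not pickle; that is why the nested timed_func closures above were hoisted to module-level _timed_* workers. A minimal illustration of the constraint, independent of TVM:

import multiprocessing

def _module_level_worker(que, x):
    que.put(x * x)  # picklable because it is defined at module scope

def main():
    ctx = multiprocessing.get_context("spawn")
    que = ctx.Queue(2)
    p = ctx.Process(target=_module_level_worker, args=(que, 7))
    p.start()
    p.join()
    assert que.get() == 49
    # Had _module_level_worker been nested inside main(), spawn would fail
    # with "AttributeError: Can't pickle local object ...".

if __name__ == "__main__":
    main()
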
diff --git a/python/tvm/testing.py b/python/tvm/testing.py index 51fa2d0d7def..e5b17f3d7b53 100644 --- a/python/tvm/testing.py +++ b/python/tvm/testing.py @@ -634,6 +634,23 @@ def requires_micro(*args): return _compose(args, _requires_micro) +def requires_rpc(*args): + """Mark a test as requiring rpc to run. + + Parameters + ---------- + f : function + Function to mark + """ + _requires_rpc = [ + pytest.mark.skipif( + tvm.support.libinfo().get("USE_RPC", "OFF") != "ON", + reason="RPC support not enabled. Set USE_RPC=ON in config.cmake to enable.", + ) + ] + return _compose(args, _requires_rpc) + + def _target_to_requirement(target): # mapping from target to decorator if target.startswith("cuda"): diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc old mode 100755 new mode 100644 index 66f521e17e80..1bc2c78a99f0 --- a/src/auto_scheduler/measure_record.cc +++ b/src/auto_scheduler/measure_record.cc @@ -107,6 +107,54 @@ struct Handler<::tvm::auto_scheduler::StateNode> { } }; +template <> +struct Handler<::tvm::auto_scheduler::HardwareParamsNode> { + inline static void Write(dmlc::JSONWriter* writer, + const ::tvm::auto_scheduler::HardwareParamsNode& data) { + writer->BeginArray(false); + writer->WriteArrayItem(data.num_cores); + writer->WriteArrayItem(data.vector_unit_bytes); + writer->WriteArrayItem(data.cache_line_bytes); + writer->WriteArrayItem(data.max_shared_memory_per_block); + writer->WriteArrayItem(data.max_registers_per_block); + writer->WriteArrayItem(data.max_threads_per_block); + writer->WriteArrayItem(data.max_vthread_extent); + writer->WriteArrayItem(data.warp_size); + writer->EndArray(); + } + inline static void Read(dmlc::JSONReader* reader, + ::tvm::auto_scheduler::HardwareParamsNode* data) { + bool s; + reader->BeginArray(); + s = reader->NextArrayItem(); + CHECK(s); + reader->Read(&data->num_cores); + s = reader->NextArrayItem(); + CHECK(s); + reader->Read(&data->vector_unit_bytes); + s = reader->NextArrayItem(); + CHECK(s); + reader->Read(&data->cache_line_bytes); + s = reader->NextArrayItem(); + CHECK(s); + reader->Read(&data->max_shared_memory_per_block); + s = reader->NextArrayItem(); + CHECK(s); + reader->Read(&data->max_registers_per_block); + s = reader->NextArrayItem(); + CHECK(s); + reader->Read(&data->max_threads_per_block); + s = reader->NextArrayItem(); + CHECK(s); + reader->Read(&data->max_vthread_extent); + s = reader->NextArrayItem(); + CHECK(s); + reader->Read(&data->warp_size); + s = reader->NextArrayItem(); + CHECK(!s); + } +}; + template <> struct Handler<::tvm::auto_scheduler::SearchTaskNode> { inline static void Write(dmlc::JSONWriter* writer, @@ -114,11 +162,13 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { writer->BeginArray(false); writer->WriteArrayItem(std::string(data.workload_key)); writer->WriteArrayItem(data.target->str()); + writer->WriteArrayItem(*data.hardware_params.get()); writer->EndArray(); } inline static void Read(dmlc::JSONReader* reader, ::tvm::auto_scheduler::SearchTaskNode* data) { bool s; std::string str_value; + auto hardware_params_node = ::tvm::make_object<::tvm::auto_scheduler::HardwareParamsNode>(); reader->BeginArray(); s = reader->NextArrayItem(); ICHECK(s); @@ -129,7 +179,12 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { reader->Read(&str_value); data->target = ::tvm::Target(str_value); s = reader->NextArrayItem(); - ICHECK(!s); + if (s) { + reader->Read(hardware_params_node.get()); + s = reader->NextArrayItem(); + data->hardware_params = 
::tvm::auto_scheduler::HardwareParams(hardware_params_node); + ICHECK(!s); + } } }; @@ -216,7 +271,7 @@ namespace auto_scheduler { TVM_REGISTER_OBJECT_TYPE(RecordToFileNode); TVM_REGISTER_OBJECT_TYPE(RecordReaderNode); -const std::string AUTO_SCHEDULER_LOG_VERSION = "v0.2"; // NOLINT(*) +const std::string AUTO_SCHEDULER_LOG_VERSION = "v0.3"; // NOLINT(*) RecordToFile::RecordToFile(String filename) { auto node = make_object(); @@ -340,5 +395,21 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SaveRecords") std::ofstream ofs(filename, std::ofstream::app); WriteMeasureRecords(&ofs, in, res); }); + +TVM_REGISTER_GLOBAL("auto_scheduler.SerializeMeasureInput") + .set_body_typed([](const MeasureInput& input) { + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.Write(*input.get()); + return os.str(); + }); + +TVM_REGISTER_GLOBAL("auto_scheduler.DeserializeMeasureInput").set_body_typed([](String json) { + std::istringstream ss(json); + dmlc::JSONReader reader(&ss); + auto inp = make_object(); + reader.Read(inp.get()); + return ObjectRef(inp); +}); } // namespace auto_scheduler } // namespace tvm diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py index 4369d203b476..80ce98d0b1c1 100644 --- a/tests/python/unittest/test_auto_scheduler_measure.py +++ b/tests/python/unittest/test_auto_scheduler_measure.py @@ -17,6 +17,7 @@ """ Test measurement and log serialization. """ +import multiprocessing import tvm from tvm import topi from tvm import te, auto_scheduler @@ -182,12 +183,10 @@ def test_recover_measure_input(): raw_inp = inputs[0] - correct_inp = auto_scheduler.measure_record.recover_measure_input(raw_inp) + correct_inp = auto_scheduler.measure.recover_measure_input(raw_inp) assert str(correct_inp.task.compute_dag) == str(inp.task.compute_dag) - correct_inp = auto_scheduler.measure_record.recover_measure_input( - raw_inp, rebuild_state=True - ) + correct_inp = auto_scheduler.measure.recover_measure_input(raw_inp, rebuild_state=True) assert str(correct_inp.state) == str(inp.state) @@ -232,6 +231,19 @@ def test_measure_local_builder_rpc_runner(): del measure_ctx +def measure_local_builder_rpc_runner_spawn(): + assert multiprocessing.get_start_method(False) == "spawn" + test_measure_local_builder_rpc_runner() + + +@tvm.testing.requires_llvm +def test_measure_local_builder_rpc_runner_spawn(): + ctx = multiprocessing.get_context("spawn") + p = ctx.Process(target=measure_local_builder_rpc_runner_spawn) + p.start() + p.join() + + if __name__ == "__main__": test_record_split_reorder_fuse_annotation() test_record_compute_at_root_inline_cache_read_write() @@ -239,4 +251,5 @@ def test_measure_local_builder_rpc_runner(): test_record_pragma_storage_align_rfactor() test_recover_measure_input() test_measure_local_builder_runner() + test_measure_local_builder_runner_spawn() test_measure_local_builder_rpc_runner() diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py index 07cf4c8141a0..5329f3d50685 100644 --- a/tests/python/unittest/test_auto_scheduler_search_policy.py +++ b/tests/python/unittest/test_auto_scheduler_search_policy.py @@ -18,6 +18,7 @@ """Test search policy""" import random +import multiprocessing import numpy as np import tempfile @@ -26,6 +27,7 @@ from tvm import auto_scheduler from test_auto_scheduler_common import matmul_auto_scheduler_test, PropagatingThread +import multiprocessing def search_common( @@ -122,6 +124,19 @@ def 
test_sketch_search_policy_basic(): t.join() +def sketch_search_policy_basic_spawn(): + assert multiprocessing.get_start_method(False) == "spawn" + test_sketch_search_policy_basic() + + +@tvm.testing.requires_llvm +def test_sketch_search_policy_basic_spawn(): + ctx = multiprocessing.get_context("spawn") + p = ctx.Process(target=sketch_search_policy_basic_spawn) + p.start() + p.join() + + @tvm.testing.requires_llvm def test_sketch_search_policy_xgbmodel(): # wrap the search in a new thread to avoid the conflict @@ -156,9 +171,8 @@ def test_sketch_search_policy_cuda_rpc_runner(): t.join() +@tvm.testing.requires_cuda def test_sketch_search_policy_cuda_xgbmodel_rpc_runner(): - if not tvm.runtime.enabled("cuda"): - return measure_ctx = auto_scheduler.LocalRPCMeasureContext() # wrap the search in a new thread to avoid the conflict # between python's multiprocessing and tvm's thread pool @@ -179,6 +193,7 @@ def test_sketch_search_policy_cuda_xgbmodel_rpc_runner(): if __name__ == "__main__": test_workload_registry_search_basic() test_sketch_search_policy_basic() + test_sketch_search_policy_basic_spawn() test_sketch_search_policy_xgbmodel() test_sketch_search_policy_cuda_rpc_runner() test_sketch_search_policy_cuda_xgbmodel_rpc_runner() diff --git a/tests/python/unittest/test_auto_scheduler_task_scheduler.py b/tests/python/unittest/test_auto_scheduler_task_scheduler.py index 72b998a5a38a..7851d922013d 100644 --- a/tests/python/unittest/test_auto_scheduler_task_scheduler.py +++ b/tests/python/unittest/test_auto_scheduler_task_scheduler.py @@ -18,6 +18,7 @@ import tempfile +import multiprocessing import numpy as np from tvm import auto_scheduler @@ -68,6 +69,18 @@ def objective_func(costs): task_scheduler.tune(tune_option, search_policy="sketch.random") +def task_scheduler_round_robin_spawn(): + assert multiprocessing.get_start_method(False) == "spawn" + test_task_scheduler_round_robin() + + +def test_task_scheduler_round_robin_spawn(): + ctx = multiprocessing.get_context("spawn") + p = ctx.Process(target=task_scheduler_round_robin_spawn) + p.start() + p.join() + + def test_task_scheduler_gradient(): tasks = [] for n in [2, 4]: @@ -109,4 +122,5 @@ def objective_func(costs): if __name__ == "__main__": test_task_scheduler_round_robin() + test_task_scheduler_round_robin_spawn() test_task_scheduler_gradient() diff --git a/tests/python/unittest/test_autotvm_dispatch_context.py b/tests/python/unittest/test_autotvm_dispatch_context.py index 4064ede3cc06..6ca062047fd7 100644 --- a/tests/python/unittest/test_autotvm_dispatch_context.py +++ b/tests/python/unittest/test_autotvm_dispatch_context.py @@ -21,12 +21,13 @@ from tvm import autotvm -def test_fallback(): - @autotvm.template("testing/dispatch_fallback") - def simple_template(a, b): - cfg = autotvm.get_config() - assert cfg.is_fallback +@autotvm.template("testing/dispatch_fallback") +def simple_template(a, b): + cfg = autotvm.get_config() + assert cfg.is_fallback + +def test_fallback(): simple_template(2, 3) diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index e0fe8b5fe26f..efcc25d84edc 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -30,6 +30,7 @@ from tvm.rpc.tracker import Tracker +@tvm.testing.requires_rpc def test_bigendian_rpc(): """Test big endian rpc when there is a PowerPC RPC server available""" host = os.environ.get("TVM_POWERPC_TEST_HOST", None) @@ -61,22 +62,23 @@ def verify_rpc(remote, target, shape, dtype): verify_rpc(remote, target, 
(10,), dtype) -def test_rpc_simple(): - if not tvm.runtime.enabled("rpc"): - return +@tvm.register_func("rpc.test.addone") +def addone(x): + return x + 1 + + +@tvm.register_func("rpc.test.strcat") +def strcat(name, x): + return "%s:%d" % (name, x) - @tvm.register_func("rpc.test.addone") - def addone(x): - return x + 1 - @tvm.register_func("rpc.test.strcat") - def strcat(name, x): - return "%s:%d" % (name, x) +@tvm.register_func("rpc.test.except") +def remotethrow(name): + raise ValueError("%s" % name) - @tvm.register_func("rpc.test.except") - def remotethrow(name): - raise ValueError("%s" % name) +@tvm.testing.requires_rpc +def test_rpc_simple(): server = rpc.Server("localhost", key="x1") client = rpc.connect(server.host, server.port, key="x1") f1 = client.get_function("rpc.test.addone") @@ -90,14 +92,13 @@ def remotethrow(name): assert f2("abc", 11) == "abc:11" -def test_rpc_runtime_string(): - if not tvm.runtime.enabled("rpc"): - return +@tvm.register_func("rpc.test.runtime_str_concat") +def strcat(x, y): + return x + y - @tvm.register_func("rpc.test.runtime_str_concat") - def strcat(x, y): - return x + y +@tvm.testing.requires_rpc +def test_rpc_runtime_string(): server = rpc.Server("localhost", key="x1") client = rpc.connect(server.host, server.port, key="x1") func = client.get_function("rpc.test.runtime_str_concat") @@ -106,14 +107,15 @@ def strcat(x, y): assert str(func(x, y)) == "abcdef" -def test_rpc_array(): - if not tvm.runtime.enabled("rpc"): - return - x = np.random.randint(0, 10, size=(3, 4)) +@tvm.register_func("rpc.test.remote_array_func") +def remote_array_func(y): + x = np.ones((3, 4)) + np.testing.assert_equal(y.asnumpy(), x) - @tvm.register_func("rpc.test.remote_array_func") - def remote_array_func(y): - np.testing.assert_equal(y.asnumpy(), x) + +@tvm.testing.requires_rpc +def test_rpc_array(): + x = np.ones((3, 4)) server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) @@ -124,6 +126,7 @@ def remote_array_func(y): fremote(r_cpu) +@tvm.testing.requires_rpc def test_rpc_large_array(): # testcase of large array creation server = rpc.Server("localhost") @@ -137,6 +140,7 @@ def test_rpc_large_array(): np.testing.assert_equal(b.asnumpy(), b_np) +@tvm.testing.requires_rpc def test_rpc_echo(): def check(remote): fecho = remote.get_function("testing.echo") @@ -180,9 +184,8 @@ def check_minrpc(): check_minrpc() +@tvm.testing.requires_rpc def test_rpc_file_exchange(): - if not tvm.runtime.enabled("rpc"): - return server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) blob = bytearray(np.random.randint(0, 10, size=(10))) @@ -191,10 +194,9 @@ def test_rpc_file_exchange(): assert rev == blob +@tvm.testing.requires_rpc @tvm.testing.requires_llvm def test_rpc_remote_module(): - if not tvm.runtime.enabled("rpc"): - return # graph n = tvm.runtime.convert(102) A = te.placeholder((n,), name="A") @@ -317,11 +319,13 @@ def check_remote_link_cl(remote): check_minrpc() -def test_rpc_return_func(): - @tvm.register_func("rpc.test.remote_func") - def addone(x): - return lambda y: x + y +@tvm.register_func("rpc.test.remote_func") +def addone(x): + return lambda y: x + y + +@tvm.testing.requires_rpc +def test_rpc_return_func(): server = rpc.Server("localhost", key="x1") client = rpc.connect(server.host, server.port, key="x1") f1 = client.get_function("rpc.test.remote_func") @@ -329,6 +333,7 @@ def addone(x): assert fadd(12) == 22 +@tvm.testing.requires_rpc def test_rpc_session_constructor_args(): # start server server0 = rpc.Server("localhost", 
key="x0") @@ -365,21 +370,23 @@ def check_error_handling(): check_error_handling() -def test_rpc_return_ndarray(): +@tvm.register_func("rpc.test.remote_return_nd") +def my_module(name): # Use closure to check the ref counter correctness nd = tvm.nd.array(np.zeros(10).astype("float32")) - @tvm.register_func("rpc.test.remote_return_nd") - def my_module(name): - if name == "get_arr": - return lambda: nd - elif name == "ref_count": - return lambda: tvm.testing.object_use_count(nd) - elif name == "get_elem": - return lambda idx: nd.asnumpy()[idx] - elif name == "get_arr_elem": - return lambda arr, idx: arr.asnumpy()[idx] + if name == "get_arr": + return lambda: nd + elif name == "ref_count": + return lambda: tvm.testing.object_use_count(nd) + elif name == "get_elem": + return lambda idx: nd.asnumpy()[idx] + elif name == "get_arr_elem": + return lambda arr, idx: arr.asnumpy()[idx] + +@tvm.testing.requires_rpc +def test_rpc_return_ndarray(): # start server server = rpc.Server("localhost", key="x1") client = rpc.connect(server.host, server.port, key="x1") @@ -392,26 +399,19 @@ def my_module(name): # array test def run_arr_test(): arr = get_arr() - assert ref_count() == 2 - arr2 = get_arr() - assert ref_count() == 3 - assert arr.context == client.cpu(0) - arr.copyfrom(np.ones(10).astype(arr.dtype)) - assert arr2.asnumpy()[0] == 1.0 - assert get_elem(0) == 1.0 - assert get_arr_elem(arr2, 0) == 1.0 - - assert ref_count() == 1 + assert get_elem(0) == 0.0 + assert get_arr_elem(arr, 0) == 0.0 + run_arr_test() - # check recycle correctness - assert ref_count() == 1 -def test_local_func(): - @tvm.register_func("rpc.test.remote_func2") - def addone(x): - return lambda y: x + y +@tvm.register_func("rpc.test.remote_func2") +def addone(x): + return lambda y: x + y + +@tvm.testing.requires_rpc +def test_local_func(): client = rpc.LocalSession() f1 = client.get_function("rpc.test.remote_func2") fadd = f1(10) @@ -423,6 +423,7 @@ def addone(x): assert rev == blob +@tvm.testing.requires_rpc def test_rpc_tracker_register(): # test registration tracker = Tracker("localhost", port=9000, port_end=10000) @@ -459,6 +460,15 @@ def test_rpc_tracker_register(): tracker.terminate() +def _target(host, port, device_key, timeout): + client = rpc.connect_tracker(host, port) + remote = client.request(device_key, session_timeout=timeout) + while True: + pass + remote.cpu() + + +@tvm.testing.requires_rpc def test_rpc_tracker_request(): # test concurrent request tracker = Tracker("localhost", port=9000, port_end=10000) @@ -472,16 +482,11 @@ def test_rpc_tracker_request(): ) client = rpc.connect_tracker(tracker.host, tracker.port) - def target(host, port, device_key, timeout): - client = rpc.connect_tracker(host, port) - remote = client.request(device_key, session_timeout=timeout) - while True: - pass - remote.cpu() - - proc1 = multiprocessing.Process(target=target, args=(tracker.host, tracker.port, device_key, 4)) + proc1 = multiprocessing.Process( + target=_target, args=(tracker.host, tracker.port, device_key, 4) + ) proc2 = multiprocessing.Process( - target=target, args=(tracker.host, tracker.port, device_key, 200) + target=_target, args=(tracker.host, tracker.port, device_key, 200) ) proc1.start() time.sleep(0.5) From f5993ec0113544307e4b5c3557cfba5945d4431e Mon Sep 17 00:00:00 2001 From: ANSHUMAN TRIPATHY Date: Fri, 30 Oct 2020 04:43:56 +0530 Subject: [PATCH 088/258] [CI] Keras version upgraded from 2.3.1 to 2.4.3 (#6793) --- docker/install/ubuntu_install_tensorflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh index 4e766d4d5a5b..21c88d6f8ece 100755 --- a/docker/install/ubuntu_install_tensorflow.sh +++ b/docker/install/ubuntu_install_tensorflow.sh @@ -20,4 +20,4 @@ set -e set -u set -o pipefail -pip3 install tensorflow==2.3.1 keras==2.3.1 h5py +pip3 install tensorflow==2.3.1 keras==2.4.3 h5py From 5dd2d5ab57f6b8f24a5d9afa7f8b20599a2157d2 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 30 Oct 2020 06:57:58 -0700 Subject: [PATCH 089/258] [TVMSCRIPT] Add synr dependency in preparation for tvmscript diagnostics overhaul. (#6795) --- docker/install/ubuntu_install_python_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index c8d9856b6de0..d86cbecba213 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -21,4 +21,4 @@ set -u set -o pipefail # install libraries for python package on ubuntu -pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest pytest-xdist pytest-profiling mypy orderedset attrs requests Pillow packaging cloudpickle +pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest pytest-xdist pytest-profiling mypy orderedset attrs requests Pillow packaging cloudpickle synr From 5702f9f6fdc02f0a822a48599706f67189264b27 Mon Sep 17 00:00:00 2001 From: mbaret <55580676+mbaret@users.noreply.github.com> Date: Fri, 30 Oct 2020 15:00:22 +0000 Subject: [PATCH 090/258] [BYOC] Allow custom codegens to register their own constant updater (#6697) * [BYOC] Allow custom codegens to register their own constant updater Currently, all codegens using BYOC must make use of the default ConstantUpdater pass. However, certain codegens, like Ethos-N, don't want to store any constants in metadata module. This provides an interface (via a global) to register a custom constant updating method and assigns a 'null' updater for the Ethos-N codegen. 
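
For illustration, a backend registered under the hypothetical name "mycodegen"
could opt out of constant embedding in the same way from Python; the global's
name pattern and its (expr, symbol) signature match the lookup added in this
patch:

    import tvm

    @tvm._ffi.register_func("relay.ext.mycodegen.constant_updater")
    def constant_updater(expr, symbol):
        # Return no constants so nothing is stored in the metadata module.
        return {}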
Change-Id: Ibd71d3091f992362eeede5d894eedb373b2dbc8f * Fix to use symbol in const name Change-Id: I0ade81af9002d413c5b20a50488018e8cd8d8bad * Remove ; Change-Id: I61967bc4997efb87f87b49dad7e0a660c536ef35 * Remove ccompiler constant updater Change-Id: Iea9ee0f689683512fa114afeadeccb7fc9048e4f * Unregister updater after test Change-Id: I8009940bb2ac949f2c3f0d72c943a5b74afd6954 * Create UpdateConstants utility function Change-Id: I83c8c6f92cfe3be3a7e811e98a4eec17590186ff --- .../backend/contrib/ethosn/codegen_ethosn.h | 3 + src/relay/backend/graph_runtime_codegen.cc | 9 +- src/relay/backend/utils.h | 31 +++++++ src/relay/backend/vm/compiler.cc | 4 +- .../test_ethosn/test_constant_duplication.py | 82 +++++++++++++++++++ tests/python/relay/test_external_codegen.py | 34 ++++++++ 6 files changed, 153 insertions(+), 10 deletions(-) create mode 100644 tests/python/contrib/test_ethosn/test_constant_duplication.py diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h index f3d7f4562533..4b3e1bc05367 100644 --- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h +++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h @@ -338,6 +338,9 @@ runtime::Module CompileEthosn(const ObjectRef& ref) { TVM_REGISTER_GLOBAL("relay.ext.ethos-n").set_body_typed(CompileEthosn); +TVM_REGISTER_GLOBAL("relay.ext.ethos-n.constant_updater") + .set_body_typed([](Expr expr, std::string symbol) { return Map(); }); + } // namespace ethosn } // namespace contrib } // namespace relay diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index 7b71e34b777b..e24d18de931c 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -368,14 +368,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslatorGetAttr(tvm::attr::kGlobalSymbol); - std::string symobl = std::string(name_node.value()); - ConstantUpdater const_visit(symobl, ¶ms_); - const_visit(func); - + UpdateConstants(func, ¶ms_); return GraphAddCallNode(op, ext_func->func_name, ext_func->func_name); } diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 3def6359c615..4426642e8e18 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -63,6 +63,37 @@ struct ConstantUpdater : public ExprVisitor { std::unordered_map* params_; }; +/*! + * \brief A function to update the params with constants found in an external function. + * \param func The function from which to get the constant params. + * \param params The params to update with the constants. + */ +inline void UpdateConstants(Function func, + std::unordered_map* params) { + auto codegen = func->GetAttr(attr::kCompiler); + ICHECK(codegen.defined()) << "No external codegen is set"; + std::string codegen_name = codegen.value(); + const auto name_node = func->GetAttr(tvm::attr::kGlobalSymbol); + std::string symbol = std::string(name_node.value()); + std::string const_update_name = "relay.ext." 
+ codegen_name + ".constant_updater"; + // Get the constant updater for the external codegen + auto pf = tvm::runtime::Registry::Get(const_update_name); + // If the backend hasn't registered a constant updater, use a default one + if (pf == nullptr) { + ConstantUpdater const_visit(symbol, params); + const_visit(func); + } else { + Map constants = (*pf)(func, symbol); + for (const auto& it : constants) { + std::string const_name(it.first); + // Constant names should begin this the compiler name (to avoid conflicts) + ICHECK(const_name.find(codegen_name) == 0) + << "External constant names must start with compiler name"; + (*params)[const_name] = it.second; + } + } +} + /*! * \brief A simple wrapper around ExprFunctor for a single argument case. * The result of visit is memoized. diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 4a7e5eec17bc..f652644afa3c 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -1124,8 +1124,8 @@ void VMCompiler::Codegen() { if (target_str == "ext_dev") { // Collect metadata in functions that are handled by external codegen. ICHECK(mod->ContainGlobalVar(cfunc->func_name)); - backend::ConstantUpdater const_visit(cfunc->func_name, ¶ms_); - const_visit(Downcast(mod->Lookup(cfunc->func_name))); + Function func = Downcast(mod->Lookup(cfunc->func_name)); + backend::UpdateConstants(func, ¶ms_); continue; } else if (funcs.count(target_str) == 0) { funcs.emplace(target_str, mod); diff --git a/tests/python/contrib/test_ethosn/test_constant_duplication.py b/tests/python/contrib/test_ethosn/test_constant_duplication.py new file mode 100644 index 000000000000..a096e57c19a9 --- /dev/null +++ b/tests/python/contrib/test_ethosn/test_constant_duplication.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Test that constants aren't duplicated for Ethos-N""" + +import numpy as np +import tvm +from tvm import relay +from tvm.relay.op.contrib.ethosn import ethosn_available +from . 
import infrastructure as tei + + +def _get_model(): + """Return a model and any parameters it may have""" + shape = (1, 4, 4, 4) + kernel_h = 3 + kernel_w = 3 + out_channels = 8 + + a = relay.var("a", shape=shape, dtype="uint8") + add_const_value = tvm.nd.array(np.random.randint(0, high=10, size=shape, dtype="uint8")) + add_const = relay.const(add_const_value, "uint8") + a = relay.add(a, add_const) + weight_shape = (kernel_h, kernel_w, shape[3], out_channels) + w = tvm.nd.array(np.random.randint(low=0, high=255, size=weight_shape, dtype="uint8")) + weights = relay.const(w, "uint8") + conv = relay.qnn.op.conv2d( + a, + weights, + input_zero_point=relay.const(0, "int32"), + kernel_zero_point=relay.const(0, "int32"), + input_scale=relay.const(0.3, "float32"), + kernel_scale=relay.const(0.4, "float32"), + kernel_size=(kernel_h, kernel_w), + data_layout="NHWC", + kernel_layout="HWIO", + dilation=(1, 1), + strides=(1, 1), + groups=1, + channels=out_channels, + padding=(0, 0, 0, 0), + out_dtype="int32", + ) + b = tvm.nd.array(np.random.randint(0, high=10, size=(out_channels,), dtype="int32")) + biasc = relay.const(b, "int32") + bias = relay.nn.bias_add(conv, biasc, axis=3) + req = relay.qnn.op.requantize( + bias, + relay.const(0.3 * 0.4, "float32"), # input zero scale + relay.const(0, "int32"), # input zero point + relay.const(0.4, "float32"), # output zero scale + relay.const(0, "int32"), # output zero point + out_dtype="uint8", + ) + params = {"w": w, "b": b} + return req, params + + +def test_constant_duplication(): + if not ethosn_available(): + return + + model, params = _get_model() + mod = tei.make_module(model, params) + res = tei.build(mod, params, npu=True, expected_host_ops=1) + for key, value in res.params.items(): + assert key == "p0" + assert value.asnumpy().size == 64 diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index b2c4dd98721c..0d729b7b1b94 100644 --- a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -219,6 +219,39 @@ def test_extern_gcc(): check_result(mod, {"x": x_data, "y": y_data}, (2, 2), (y_data * y_data) - (x_data + x_data)) +def test_extern_gcc_consts(): + @tvm._ffi.register_func("relay.ext.ccompiler.constant_updater") + def constant_updater(expr, symbol): + """A dummy constant updater just to test that a custom one works.""" + return {"ccompiler_0_p0": tvm.nd.array(y0_data)} + + x = relay.var("x", shape=(8, 8)) + y0_data = np.random.uniform(0, 1, (8, 8)).astype("float32") + + x0 = relay.var("x0", shape=(8, 8)) + y0_const = relay.const(y0_data, "float32") + z = x0 + y0_const + f = relay.Function([x0], z) + f = set_external_func_attr(f, "ccompiler", "ccompiler_0") + call = relay.Call(f, [x]) + mod = tvm.IRModule.from_expr(call) + + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + compiler = relay.backend.vm.VMCompiler() + compiler.lower(mod, "llvm") + compiler.codegen() + params = compiler.get_params() + assert len(params) == 1 + assert "ccompiler_0_p0" in params.keys() + + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + _, _, params = relay.build(mod, target="llvm") + assert len(params) == 1 + assert "ccompiler_0_p0" in params.keys() + + tvm._ffi.registry.remove_global_func("relay.ext.ccompiler.constant_updater") + + def test_extern_dnnl(): if not tvm.get_global_func("relay.ext.dnnl", True): print("skip because DNNL codegen is not available") @@ -301,5 +334,6 @@ def test_extern_dnnl_const(): 
test_extern_gcc_single_op() test_extern_gcc_single_op_int() test_extern_gcc() + test_extern_gcc_consts() test_extern_dnnl() test_extern_dnnl_const() From 3a1bb252a90eb415b1af8373e86a5a1a1bfde9b6 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 30 Oct 2020 16:59:14 -0700 Subject: [PATCH 091/258] [AutoScheduler] Relay integration : Task extraction (#6710) * add task extraction * fix evo search * fix tests * fix test * fix docstring * fix docstring * update workload registry * fix warning * fix test * fix fallback * fix lint * fix tests --- python/tvm/auto_scheduler/__init__.py | 6 + python/tvm/auto_scheduler/compute_dag.py | 9 +- .../auto_scheduler/cost_model/xgb_model.py | 22 +- python/tvm/auto_scheduler/dispatcher.py | 275 ++++++++++++++++++ python/tvm/auto_scheduler/env.py | 56 ++++ python/tvm/auto_scheduler/measure.py | 118 ++++---- .../tvm/auto_scheduler/relay_integration.py | 232 +++++++++++++++ python/tvm/auto_scheduler/search_policy.py | 11 +- python/tvm/auto_scheduler/utils.py | 5 +- .../tvm/auto_scheduler/workload_registry.py | 119 ++++---- python/tvm/relay/backend/compile_engine.py | 27 +- python/tvm/relay/op/op.py | 30 +- python/tvm/relay/op/strategy/cuda.py | 13 +- python/tvm/relay/op/strategy/generic.py | 2 +- .../search_policy/sketch_policy.cc | 30 +- .../search_policy/sketch_policy.h | 7 +- src/relay/backend/compile_engine.cc | 24 +- .../test_auto_scheduler_task_extraction.py | 90 ++++++ .../relay/test_auto_scheduler_tuning.py | 62 ++++ .../test_auto_scheduler_cost_model.py | 2 +- ...test_auto_scheduler_evolutionary_search.py | 4 +- 21 files changed, 990 insertions(+), 154 deletions(-) create mode 100644 python/tvm/auto_scheduler/dispatcher.py create mode 100644 python/tvm/auto_scheduler/env.py create mode 100644 python/tvm/auto_scheduler/relay_integration.py create mode 100644 tests/python/relay/test_auto_scheduler_task_extraction.py create mode 100644 tests/python/relay/test_auto_scheduler_tuning.py diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py index 99d96e893ba0..46d606c628d9 100644 --- a/python/tvm/auto_scheduler/__init__.py +++ b/python/tvm/auto_scheduler/__init__.py @@ -18,10 +18,13 @@ """ Namespace for TVM Auto-scheduler. """ from . import compute_dag +from . import dispatcher +from . import env from . import feature from . import loop_state from . import measure from . import measure_record +from . import relay_integration from . import search_policy from . import search_task from . 
import task_scheduler @@ -32,6 +35,8 @@ from .auto_schedule import TuningOptions, HardwareParams, create_task, auto_schedule from .compute_dag import ComputeDAG from .cost_model import RandomModel, XGBModel +from .dispatcher import DispatchContext, ApplyHistoryBest +from .env import enable_relay_integration, is_relay_integration_enabled from .measure import ( MeasureInput, MeasureResult, @@ -41,6 +46,7 @@ LocalRPCMeasureContext, ) from .measure_record import RecordToFile, RecordReader, load_best, load_records, save_records +from .relay_integration import extract_tasks from .search_task import SearchTask from .search_policy import EmptyPolicy, SketchPolicy, PreloadMeasuredStates from .task_scheduler import TaskScheduler diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py index 4b1b264c30d8..2fc0d7d0bf8c 100755 --- a/python/tvm/auto_scheduler/compute_dag.py +++ b/python/tvm/auto_scheduler/compute_dag.py @@ -151,7 +151,14 @@ def infer_bound_from_state(self, state): updated_state.stage_id_map[k] = v return updated_state - def __hash__(self): + def hash_key(self): + """Return the hash key of this compute DAG. + + Returns + ------- + key: str + The hash key of this compute DAG + """ # TODO(merrymercy): Implement this more carefully and move this to c++ as a member function # of ComputeDAG str_key = "" diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py index b8953c1db63b..b9afd98be21d 100644 --- a/python/tvm/auto_scheduler/cost_model/xgb_model.py +++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py @@ -23,10 +23,11 @@ import time import numpy as np -import xgboost as xgb -from xgboost.core import EarlyStopException -from xgboost.callback import _fmt_metric -from xgboost.training import aggcv + +try: + import xgboost as xgb +except ImportError: + xgb = None from tvm.autotvm.tuner.metric import max_curve from .cost_model import PythonBasedModel @@ -92,6 +93,14 @@ class XGBModel(PythonBasedModel): """ def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None): + + if xgb is None: + raise ImportError( + "XGBoost is required for XGBModel. " + "Please install its python package first. " + "Help: (https://xgboost.readthedocs.io/en/latest/) " + ) + self.xgb_params = { "max_depth": 10, "gamma": 0.001, @@ -505,6 +514,11 @@ def custom_callback( skip_every=2, ): """Callback function for xgboost to support multiple custom evaluation functions""" + # pylint: disable=import-outside-toplevel + from xgboost.core import EarlyStopException + from xgboost.callback import _fmt_metric + from xgboost.training import aggcv + state = {} metric_shortname = metric.split("-")[1] diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py new file mode 100644 index 000000000000..7c0c6ef64322 --- /dev/null +++ b/python/tvm/auto_scheduler/dispatcher.py @@ -0,0 +1,275 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +The global context that dispatches best schedules to workloads. + +In auto-scheduler, a state (loop_state.py::StateObject) saves the +schedule configuration by its transform_steps, so a state is used +as a schedule configuration here. +""" +# pylint: disable=invalid-name + +import logging +import pathlib + +import numpy as np + +from tvm.tir.expr import FloatImm +from .measure_record import load_records + +logger = logging.getLogger("auto_scheduler") + + +class DispatchContext(object): + """ + Base class of dispatch context. + """ + + current = None + + def __init__(self): + self._old_ctx = DispatchContext.current + + def query(self, target, workload_key): + """ + Query the context to get the specific config for a workload. + If cannot find the result inside this context, this function will query it + from the upper contexts. + + Parameters + ---------- + target: Target + The current target + workload_key : str + The workload key + + Returns + ------- + state : StateObject + The state that stores schedule configuration for the workload + """ + ret = self._query_inside(target, workload_key) + if ret is None: + ret = self._old_ctx.query(target, workload_key) + return ret + + def update(self, target, workload_key, state): + """ + Update the config for a workload + + Parameters + ---------- + target: Target + The current target + workload_key : str + The current workload_key. + state : StateObject + The state that stores schedule configuration for the workload + """ + raise NotImplementedError() + + def _query_inside(self, target, workload_key): + """ + Query the context to get the specific config for a workload. + This function only query config inside this context. + + Parameters + ---------- + target: Target + The current target + workload_key : str + The current workload_key. + + Returns + ------- + state : StateObject + The schedule configuration for the workload + """ + raise NotImplementedError() + + def __enter__(self): + self._old_ctx = DispatchContext.current + DispatchContext.current = self + return self + + def __exit__(self, ptype, value, trace): + DispatchContext.current = self._old_ctx + + +class ApplyHistoryBest(DispatchContext): + """ + Apply the history best config + + Parameters + ---------- + records : str or iterator of (auto_scheduler.measure.MeasureInput,\ + auto_scheduler.measure.MeasureResult) + Collection of tuning records. + If is str, then it should be the filename of a records log file. + Each row of this file is an encoded record pair. Otherwise, it is an iterator. + n_lines: Optional[int] + if it is not None, only load the first `n_lines` lines of log + """ + + def __init__(self, records, n_lines=None): + super(ApplyHistoryBest, self).__init__() + + self.best_by_targetkey = {} + self.best_by_model = {} + self._best_user_defined = {} + + self.load(records, n_lines) + + def load(self, records, n_lines=None): + """Load records to this dispatch context + + Parameters + ---------- + records : str or iterator of (auto_scheduler.measure.MeasureInput,\ + auto_scheduler.measure.MeasureResult) + Collection of tuning records. 
+            If it is a str, then it should be the filename of a records log file.
+            Each row of this file is an encoded record pair. Otherwise, it is an iterator.
+        n_lines: Optional[int]
+            if it is not None, only load the first `n_lines` lines of log
+        """
+        if isinstance(records, pathlib.Path):
+            records = str(records)
+
+        if isinstance(records, str):
+            records = load_records(records)
+
+        if not records:
+            return
+
+        best_by_targetkey = self.best_by_targetkey
+        best_by_model = self.best_by_model
+
+        counter = 0
+        for inp, res in records:
+            if n_lines is not None and counter >= n_lines:
+                break
+            counter += 1
+            if res.error_no != 0:
+                continue
+
+            # use target keys in tvm target system as key to build best map
+            for k in inp.task.target.keys:
+                key = (k, inp.task.workload_key)
+                if key not in best_by_targetkey:
+                    best_by_targetkey[key] = (inp, res)
+                else:
+                    _, other_res = best_by_targetkey[key]
+                    other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)]
+                    costs = [x.value for x in res.costs if isinstance(x, FloatImm)]
+                    if np.mean(other_costs) > np.mean(costs):
+                        best_by_targetkey[key] = (inp, res)
+
+            # use model as key to build best map
+            key = (inp.task.target.model, inp.task.workload_key)
+            if key not in best_by_model:
+                if inp.task.target.model != "unknown":
+                    best_by_model[key] = (inp, res)
+            else:
+                _, other_res = best_by_model[key]
+                other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)]
+                costs = [x.value for x in res.costs if isinstance(x, FloatImm)]
+                if np.mean(other_costs) > np.mean(costs):
+                    best_by_model[key] = (inp, res)
+
+        logger.debug("Finish loading %d records", counter)
+
+    def _query_inside(self, target, workload_key):
+        if target is None:
+            raise RuntimeError(
+                "Need a target context to find the history best. "
+                "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
+                " above the dispatcher call. The same applies to other targets."
+            )
+
+        # first try matching by model
+        key = (target.model, workload_key)
+        if key in self._best_user_defined:
+            return self._best_user_defined[key]
+        if key in self.best_by_model:
+            return self.best_by_model[key][0].state
+
+        # then try matching by target key
+        for k in target.keys:
+            key = (k, workload_key)
+            if key in self._best_user_defined:
+                return self._best_user_defined[key]
+            if key in self.best_by_targetkey:
+                return self.best_by_targetkey[key][0].state
+
+        return None
+
+    def update(self, target, workload_key, state):
+        model = target.model
+        key = (model, workload_key)
+        self._best_user_defined[key] = state
+
+        for k in target.keys:
+            key = (k, workload_key)
+            self._best_user_defined[key] = state
+
+
+class FallbackContext(DispatchContext):
+    """
+    A fallback dispatch context.
+    This is used as the root context.
+    """
+
+    def __init__(self):
+        super(FallbackContext, self).__init__()
+        self.memory = {}
+        self.silent = False
+
+        # a set to prevent printing duplicated messages
+        self.messages = set()
+
+    def query(self, target, workload_key):
+        key = (str(target), workload_key)
+        if key in self.memory:
+            return self.memory[key]
+
+        if not self.silent:
+            msg = (
+                "Cannot find tuned schedule for target=%s, workload_key=%s. "
+                "A fallback schedule is used, "
+                "which may bring great performance regression." % (target, workload_key)
+            )
+            if msg not in self.messages:
+                self.messages.add(msg)
+                logger.warning(msg)
+
+        state = None
+
+        # cache this config to avoid duplicated warning messages
+        self.memory[key] = state
+        return state
+
+    def _query_inside(self, target, workload_key):
+        _ = target = workload_key
+        raise RuntimeError("This function should never be called")
+
+    def update(self, target, workload_key, state):
+        key = (str(target), workload_key)
+        self.memory[key] = state
+
+
+DispatchContext.current = FallbackContext()
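
The dispatch context above is what compilation queries for tuned schedules. A
minimal end-to-end sketch, assuming a tuning log "log.json" and a Relay module
`mod` with `params` already at hand, and using the relay-integration switch
added in env.py below:

    import tvm
    from tvm import auto_scheduler, relay

    auto_scheduler.enable_relay_integration()

    with auto_scheduler.ApplyHistoryBest("log.json"):
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target="llvm", params=params)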
diff --git a/python/tvm/auto_scheduler/env.py b/python/tvm/auto_scheduler/env.py
new file mode 100644
index 000000000000..95c7ccf971a2
--- /dev/null
+++ b/python/tvm/auto_scheduler/env.py
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The scope to store global environmental variables of the auto-scheduler"""
+
+
+class AutoSchedulerGlobalScope(object):
+    """The global scope to store environmental variables of the auto-scheduler"""
+
+    def __init__(self):
+        self.enable_relay_integration = False
+
+
+GLOBAL_SCOPE = AutoSchedulerGlobalScope()
+
+
+def is_relay_integration_enabled():
+    """Return whether the relay integration is enabled
+
+    Returns
+    -------
+    enabled: bool
+        Whether the relay integration is enabled
+    """
+    return GLOBAL_SCOPE.enable_relay_integration
+
+
+def enable_relay_integration(new_value=True):
+    """Set whether the relay integration is enabled
+
+    Parameters
+    ----------
+    new_value: bool = True
+        The new setting of the relay integration
+
+    Returns
+    -------
+    old_value: bool
+        The old setting.
+    """
+    old_value = GLOBAL_SCOPE.enable_relay_integration
+    GLOBAL_SCOPE.enable_relay_integration = new_value
+    return old_value
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index 9f592550cda8..0121ddf37d03 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -57,45 +57,17 @@
 )
 from .compute_dag import ComputeDAG
 from .search_task import SearchTask
-from .workload_registry import workload_func_name, get_workload_func
+from .workload_registry import (
+    serialize_workload_registry_entry,
+    deserialize_workload_registry_entry,
+)
 
 # The maximum length of error message
 MAX_ERROR_MSG_LEN = 512
 
-
-def recover_measure_input(inp, rebuild_state=False):
-    """
-    Recover a deserialized MeasureInput by rebuilding the missing fields.
-    1. Rebuid the compute_dag in inp.task
-    2. (Optional) Rebuild the stages in inp.state
-
-    Parameters
-    ----------
-    inp: MeasureInput
-        The deserialized MeasureInput
-    rebuild_state: bool = False
-        Whether rebuild the stages in MeasureInput.State
-
-    Returns
-    -------
-    new_input: MeasureInput
-        The fully recovered MeasureInput with all fields rebuilt.
- """ - task = inp.task - new_task = SearchTask( - ComputeDAG(task.workload_key), - task.workload_key, - task.target, - task.target_host, - task.hardware_params, - ) - - if rebuild_state: - new_state = new_task.compute_dag.infer_bound_from_state(inp.state) - else: - new_state = inp.state - - return MeasureInput(new_task, new_state) +# The time cost for measurements with errors +# We use 1e10 instead of sys.float_info.max for better readability in log +MAX_FLOAT = 1e10 @tvm._ffi.register_object("auto_scheduler.MeasureCallback") @@ -127,22 +99,16 @@ def serialize(self): with initialization of the workload registry (maybe because of initialization order?). """ - serialize = tvm.get_global_func("auto_scheduler.SerializeMeasureInput", True) - assert serialize - # We serialize the workload function so that it can be used on the deserialized side. - return { - "measureinput": serialize(self), - "name": workload_func_name(self.task.workload_key), - "func": get_workload_func(self.task), - } + return [ + _ffi_api.SerializeMeasureInput(self), + serialize_workload_registry_entry(self.task.workload_key), + ] @staticmethod - def deserialize(state): - deserialize = tvm.get_global_func("auto_scheduler.DeserializeMeasureInput", True) - assert deserialize - tvm.auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY[state["name"]] = state["func"] - x = deserialize(state["measureinput"]) - return recover_measure_input(x) + def deserialize(data): + inp = _ffi_api.DeserializeMeasureInput(data[0]) + deserialize_workload_registry_entry(data[1]) + return recover_measure_input(inp) @tvm._ffi.register_object("auto_scheduler.BuildResult") @@ -198,6 +164,41 @@ def __init__(self, costs, error_no, error_msg, all_cost, timestamp): ) +def recover_measure_input(inp, rebuild_state=False): + """ + Recover a deserialized MeasureInput by rebuilding the missing fields. + 1. Rebuid the compute_dag in inp.task + 2. (Optional) Rebuild the stages in inp.state + + Parameters + ---------- + inp: MeasureInput + The deserialized MeasureInput + rebuild_state: bool = False + Whether rebuild the stages in MeasureInput.State + + Returns + ------- + new_input: MeasureInput + The fully recovered MeasureInput with all fields rebuilt. + """ + task = inp.task + new_task = SearchTask( + ComputeDAG(task.workload_key), + task.workload_key, + task.target, + task.target_host, + task.hardware_params, + ) + + if rebuild_state: + new_state = new_task.compute_dag.infer_bound_from_state(inp.state) + else: + new_state = inp.state + + return MeasureInput(new_task, new_state) + + @tvm._ffi.register_object("auto_scheduler.ProgramBuilder") class ProgramBuilder(Object): """ The base class of ProgramBuilders. """ @@ -698,7 +699,7 @@ def _timed_eval_func( ) # pylint: disable=broad-except except Exception: - costs = (max_float,) + costs = (MAX_FLOAT,) error_no = MeasureErrorNo.COMPILE_DEVICE error_msg = make_error_msg() @@ -713,7 +714,7 @@ def _timed_eval_func( costs = time_f(*args).results # pylint: disable=broad-except except Exception: - costs = (max_float,) + costs = (MAX_FLOAT,) error_no = MeasureErrorNo.RUNTIME_DEVICE error_msg = make_error_msg() @@ -785,14 +786,13 @@ def local_run( res : List[MeasureResult] The measure results of these MeasureInputs. 
""" - max_float = 1e10 # We use 1e10 instead of sys.float_info.max for better readability in log measure_results = [] assert len(inputs) == len(build_results), "Measure input size should be equal to build results" for inp, build_res in zip(inputs, build_results): if build_res.error_no != 0: res = ( - (max_float,), + (MAX_FLOAT,), build_res.error_no, build_res.error_msg, build_res.time_cost, @@ -817,7 +817,7 @@ def local_run( if verbose >= 1: print("*T", end="") # Run timeout res = ( - (max_float,), + (MAX_FLOAT,), MeasureErrorNo.RUN_TIMEOUT, None, build_res.time_cost + timeout, @@ -872,7 +872,7 @@ def _timed_rpc_run( ) # pylint: disable=broad-except except Exception: - costs = (max_float,) + costs = (MAX_FLOAT,) error_no = MeasureErrorNo.COMPILE_DEVICE error_msg = make_error_msg() @@ -896,7 +896,7 @@ def _timed_rpc_run( remote.remove("") # pylint: disable=broad-except except Exception: - costs = (max_float,) + costs = (MAX_FLOAT,) error_no = MeasureErrorNo.RUNTIME_DEVICE error_msg = make_error_msg() @@ -926,12 +926,10 @@ def _rpc_run_worker(args): res : MeasureResult The measure result of this Runner thread. """ - max_float = 1e10 # We use 1e10 instead of sys.float_info.max for better readability in log - _, build_res, _, _, _, _, timeout, _, _, _, _, _, verbose = args if build_res.error_no != MeasureErrorNo.NO_ERROR: return ( - (max_float,), + (MAX_FLOAT,), build_res.error_no, build_res.error_msg, build_res.time_cost, @@ -944,7 +942,7 @@ def _rpc_run_worker(args): if verbose >= 1: print("*T", end="") # Run timeout res = ( - (max_float,), + (MAX_FLOAT,), MeasureErrorNo.RUN_TIMEOUT, None, build_res.time_cost + timeout, diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py new file mode 100644 index 000000000000..24a4c44ba432 --- /dev/null +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -0,0 +1,232 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-variable,invalid-name + +""" +Integrate auto_scheduler into relay. It implements the following items: +1. Extract search tasks from a relay program +2. 
Provide auto-scheduling for all TOPI compute functions
+"""
+
+import threading
+
+import tvm
+from tvm import te, transform
+from tvm.te.tensor import ComputeOp, PlaceholderOp
+from .compute_dag import ComputeDAG
+from .dispatcher import DispatchContext
+from .search_task import SearchTask
+from .workload_registry import register_workload_tensors
+
+
+def call_all_topi_funcs(mod, params, target):
+    """Call all TOPI compute + schedule to extract tasks in a relay program"""
+    # pylint: disable=import-outside-toplevel
+    from tvm import relay
+    from tvm.relay.backend import graph_runtime_codegen
+
+    with transform.PassContext(opt_level=3):
+        opt_mod, _ = relay.optimize(mod, target, params)
+        grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
+        grc.codegen(opt_mod["main"])
+
+
+def extract_tasks(mod, params, target, target_host=None, hardware_params=None):
+    """Extract tuning tasks from a relay program.
+
+    Parameters
+    ----------
+    mod: tvm.IRModule or relay.function.Function
+        The module or function to tune
+    params: dict of str to numpy array
+        The associated parameters of the program
+    target: Union[tvm.target.Target, str]
+        The compilation target
+    target_host: Optional[Union[tvm.target.Target, str]]
+        The host compilation target
+    hardware_params : Optional[HardwareParams]
+        Hardware parameters used for the search tasks
+
+    Returns
+    -------
+    tasks: List[SearchTask]
+        The tasks in this network
+    weights: List[int]
+        The weight (i.e. the number of appearances) of extracted tasks
+    """
+    # pylint: disable=import-outside-toplevel
+    from tvm import relay
+
+    if isinstance(target, str):
+        target = tvm.target.Target(target)
+    if isinstance(target_host, str):
+        target_host = tvm.target.Target(target_host)
+
+    # Run the compiler to collect all TOPI calls during compilation.
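+    # In EXTRACT_TASK mode the tracing environment makes auto_schedule_topi
+    # record each workload key together with its compile-engine cache key
+    # instead of actually scheduling the compute.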
+ env = TracingEnvironment(TracingMode.EXTRACT_TASK) + with env: + # Wrap build call in a new thread to avoid the conflict + # between python's multiprocessing and tvm's thread pool + build_thread = threading.Thread(target=call_all_topi_funcs, args=(mod, params, target)) + build_thread.start() + build_thread.join() + + # query the compile engine to get the number of occurrence of all tasks + engine = relay.backend.compile_engine.get() + use_count_dict = {} + for k, v in engine.items(): + use_count_dict[k] = v.use_count + + # create search tasks + tasks = [] + weights = [] + for wkl_key, ccache_key in env.wkl_key_to_ccache_key.items(): + dag = ComputeDAG(wkl_key) + tasks.append(SearchTask(dag, wkl_key, target, target_host, hardware_params)) + weights.append(use_count_dict[ccache_key] + 1) + + # clean the cached lowering results + engine.clear() + + return tasks, weights + + +class TracingMode: + """Two modes for tracing""" + + EXTRACT_TASK = 0 # trace all topi calls to extract tasks + PREPARE_LAYOUT_REWRITE = 1 # trace topi calls to prepare layout rewrite + + +class TracingEnvironment: + """Global environment for tracing all topi function calls""" + + current = None + + def __init__(self, tracing_mode): + self.tracing_mode = tracing_mode + self.relay_disable_build_cache = "false" + self.wkl_key_to_ccache_key = {} + + def __enter__(self): + TracingEnvironment.current = self + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + TracingEnvironment.current = None + + def add_workload_key(self, workload_key, ccache_key): + """Add the workload key of a search task + + Parameters + ---------- + workload_key: str + The workload key of a task + ccache_key: CCacheKey + The corresponding ccache_key of the task + """ + self.wkl_key_to_ccache_key[workload_key] = ccache_key + + +def traverse_to_get_io_tensors(outs): + """Traverse from a list of output tensors to get both input and output tensors + + Parameters + ---------- + outs: List[Tensor] + The output tensors + + Returns + ------- + io_tensors: List[Tensor] + The input and output tensors + has_layout_free: bool + Whether the compute DAG has layout_free placeholders + """ + layout_free_ops = [] + inputs = [] + + visited = set() + + def traverse(t): + if t in visited: + return + if isinstance(t.op, PlaceholderOp): + inputs.append(t) + elif isinstance(t.op, ComputeOp): + if "layout_free_placeholders" in t.op.attrs: + layout_free_ops.append(t.op) + for x in t.op.input_tensors: + traverse(x) + visited.add(t) + + for t in outs: + traverse(t) + + has_layout_free = len(layout_free_ops) > 0 + return inputs + list(outs), has_layout_free + + +# The suffix of implementations that use the auto-scheduler in the OpStrategy. +auto_schedule_impl_suffix = ".auto_scheduler" + + +def auto_schedule_topi(outs): + """Use auto-scheduler to schedule any topi compute function. + + Note: This is used internally for relay integration. Do + not use this as a general user-facing API. 
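+
+    When no tracing environment is active, this queries the current
+    DispatchContext for a tuned state and falls back to a default
+    te.create_schedule when none is found.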
+ + Parameters + ---------- + outs: List[Tensor] + The output tensors of topi compute functions + + Returns + ------- + sch: te.Schedule + A topi schedule function + """ + # pylint: disable=import-outside-toplevel + from tvm import relay + + io_tensors, has_layout_free = traverse_to_get_io_tensors(outs) + key = register_workload_tensors(io_tensors) + + # only enable layout rewrite for cpu backend + enable_layout_rewrite = "cpu" in tvm.target.Target.current().keys + + env = TracingEnvironment.current + if env is None: # in the final build mode + state = DispatchContext.current.query(tvm.target.Target.current(), key) + if state is None: + return te.create_schedule([x.op for x in outs]) + + dag = ComputeDAG(io_tensors) + schedule, _ = dag.apply_steps_from_state(state) + elif env.tracing_mode == TracingMode.EXTRACT_TASK: # in the task extraction mode + engine = relay.backend.compile_engine.get() + ccache_key = engine.get_current_ccache_key() + env.add_workload_key(key, ccache_key) + schedule = te.create_schedule([x.op for x in outs]) + elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE: + # todo(merrymercy, minminsun): port layout rewrite + raise NotImplementedError + else: + raise ValueError("Invalid tracing mode: " + env.tracing_mode) + + return schedule diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py index 838ced1806aa..ecf6af32cf78 100644 --- a/python/tvm/auto_scheduler/search_policy.py +++ b/python/tvm/auto_scheduler/search_policy.py @@ -148,7 +148,7 @@ class SketchPolicy(SearchPolicy): DEFAULT_PARAMS = { "eps_greedy": 0.05, "retry_search_one_round_on_empty": 10, - "sample_init_population": 50, + "sample_init_min_population": 50, "sample_init_use_measured_ratio": 0.2, "evolutionary_search_population": 2048, "evolutionary_search_num_iters": 10, @@ -210,22 +210,17 @@ def generate_sketches(self, print_for_debug=False): print(s) return sketches - def sample_initial_population(self, pop_size): + def sample_initial_population(self): """Sample initial population. This python interface is mainly used for debugging and testing. The actual search is all done in c++. 
-        Parameters
-        ----------
-        pop_size : int
-            The size of sampled population
-
         Returns
         -------
         states: List[State]
             The sampled states
         """
-        states = _ffi_api.SketchPolicySampleInitialPopulation(self, pop_size)
+        states = _ffi_api.SketchPolicySampleInitialPopulation(self)
         return states
 
     def evolutionary_search(self, init_populations, out_size):
diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py
index 2d0ec3efd75d..0780d39e9042 100644
--- a/python/tvm/auto_scheduler/utils.py
+++ b/python/tvm/auto_scheduler/utils.py
@@ -30,7 +30,7 @@
 try:
     import psutil
 except ImportError:
-    raise ImportError("psutil not found, try `pip install psutil` to fix this")
+    psutil = None
 
 from tvm import rpc
 from tvm.tir import expr
@@ -131,6 +131,9 @@ def deserialize_args(args):
 
 def kill_child_processes(parent_pid, sig=signal.SIGTERM):
     """kill all child processes recursively"""
+    if not psutil:
+        raise ImportError("psutil not found, try `pip install psutil` to fix this")
+
     try:
         parent = psutil.Process(parent_pid)
     except psutil.NoSuchProcess:
diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py
index c2d7f90771e3..8a42c5f9b83a 100644
--- a/python/tvm/auto_scheduler/workload_registry.py
+++ b/python/tvm/auto_scheduler/workload_registry.py
@@ -35,6 +35,16 @@
 import tvm._ffi
 from .utils import serialize_args, deserialize_args, get_func_name
 
+
+# Global workload function and hash key registry
+# It stores two types of workload:
+# 1. User registered tasks. This type of workload is registered
+#    by the decorator "register_workload"
+# 2. Extracted tasks from a relay program. This type of workload is
+#    registered by the function "register_workload_tensors".
+#
+# For 1, the dictionary maps a function name to its function pointer
+# For 2, the dictionary maps a hash key to a list of input/output tensors
 WORKLOAD_FUNC_REGISTRY = {}
 
 
@@ -85,6 +95,27 @@ def register(myf):
     return register
 
 
+def register_workload_tensors(tensors):
+    """Register a workload by providing its input/output tensors.
+
+    Parameters
+    ----------
+    tensors: List[Tensor]
+        The input/output tensors of a compute DAG
+
+    Returns
+    -------
+    key: str
+        The workload key
+    """
+    # pylint: disable=import-outside-toplevel
+    from .compute_dag import ComputeDAG
+
+    key = ComputeDAG(tensors).hash_key()
+    WORKLOAD_FUNC_REGISTRY[key] = tensors
+    return json.dumps((key,))
+
+
 def make_workload_key(func, args):
     """Make a workload key by function and arguments.
 
@@ -125,32 +156,6 @@ def make_workload_key(func, args):
     return json.dumps((func_name,) + args)
 
 
-def decode_workload_key_to_func_args(workload_key):
-    """Decode a workload key to the registered function name and its corresponding args.
-
-    Parameters
-    ----------
-    workload_key : str
-        The input workload key.
-
-    Returns
-    -------
-    name : str
-        The function name of this workload key.
-    args : List[Tensor]
-        The args of the generation function.
-    """
-    global WORKLOAD_FUNC_REGISTRY
-
-    workload = json.loads(workload_key)
-    if not workload[0] in WORKLOAD_FUNC_REGISTRY:
-        raise ValueError(
-            "%s is not registered. " % workload[0]
-            + "Please register it with @auto_scheduler.register_workload"
-        )
-    return workload[0], deserialize_args(workload[1:])
-
-
 @tvm._ffi.register_func("auto_scheduler.workload_key_to_tensors")
 def workload_key_to_tensors(workload_key):
     """Get the input/output tensors from the workload key.
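
Taken together, a workload key can now resolve either through a registered
function or directly to a list of tensors. A minimal sketch of the
function-based path (the `vector_add` workload is illustrative only):

    from tvm import te
    from tvm.auto_scheduler.workload_registry import (
        make_workload_key,
        register_workload,
        workload_key_to_tensors,
    )

    @register_workload
    def vector_add(n):
        A = te.placeholder((n,), name="A")
        B = te.placeholder((n,), name="B")
        C = te.compute((n,), lambda i: A[i] + B[i], name="C")
        return [A, B, C]

    key = make_workload_key(vector_add, (1024,))
    tensors = workload_key_to_tensors(key)  # re-runs vector_add(1024)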
@@ -169,45 +174,59 @@ def workload_key_to_tensors(workload_key): """ global WORKLOAD_FUNC_REGISTRY - name, args = decode_workload_key_to_func_args(workload_key) - lookup = WORKLOAD_FUNC_REGISTRY[name] - assert callable(lookup) - return lookup(*args) + workload = json.loads(workload_key) + name = workload[0] + value = WORKLOAD_FUNC_REGISTRY[name] + # "value" can be either a function or a list of tensors + if callable(value): # if it is a func + args = deserialize_args(workload[1:]) + return value(*args) + # otherwise, it is a list of tensors + return value -def get_workload_func(task): - """Get the workload function for a given task + +def serialize_workload_registry_entry(workload_key): + """ + Serialize a workload registry entry. + + This is used when the start method of multiprocessing is spawn. + We need to serialize the entry and register it in the new processes. Parameters ---------- - task : SearchTask - Task to get workload of. + workload_key : str + The workload key Returns ------- - workload : callable - The registered workload function. + data: Tuple + The serialized pickable data """ - name = workload_func_name(task.workload_key) - lookup = WORKLOAD_FUNC_REGISTRY[name] - assert callable(lookup) - return lookup + global WORKLOAD_FUNC_REGISTRY + + workload = json.loads(workload_key) + name = workload[0] + value = WORKLOAD_FUNC_REGISTRY[name] + + return name, value -def workload_func_name(workload_key): - """Decode a workload key to the registered function name. +def deserialize_workload_registry_entry(data): + """ + Deserialize a workload registry entry. + This should be used along with :code:`serialize_workload_registry_entry` Parameters ---------- - workload_key : str - The input workload key. - - Returns - ------- - name : str - The function name of this workload key. + data: Tuple + The return value of :code:`serialize_workload_registry_entry` """ - return decode_workload_key_to_func_args(workload_key)[0] + global WORKLOAD_FUNC_REGISTRY + + name, value = data + if name not in WORKLOAD_FUNC_REGISTRY: + WORKLOAD_FUNC_REGISTRY[name] = value def save_workload_func_registry(filename): diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 8d4a331f1b86..d874732d6fa0 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -21,11 +21,10 @@ import logging import numpy as np import tvm -from tvm import te +from tvm import te, autotvm, auto_scheduler from tvm.runtime import Object from tvm.support import libinfo -from ...target import Target -from ... import autotvm +from tvm.target import Target from .. import function as _function from .. import ty as _ty from . import _backend @@ -184,8 +183,9 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) The best op implementation and the corresponding output tensors. 
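The registry changes above make WORKLOAD_FUNC_REGISTRY hold two kinds of values and branch on `callable()` at lookup time. A self-contained sketch of that discipline (illustrative names, not TVM's):

```python
import json

REGISTRY = {}


def register_func(func):
    # Type-1 entry: function name -> function pointer.
    REGISTRY[func.__name__] = func
    return func


def register_tensors(key, tensors):
    # Type-2 entry: hash key -> list of input/output tensors.
    REGISTRY[key] = tensors
    return json.dumps((key,))


def lookup(workload_key):
    name = json.loads(workload_key)[0]
    value = REGISTRY[name]
    if callable(value):  # decorator-registered workload: call it
        return value()
    return value         # extracted workload: the stored tensor list


@register_func
def matmul():
    return ["A", "B", "C"]


key = register_tensors("deadbeef", ["X", "Y"])
assert lookup(json.dumps(("matmul",))) == ["A", "B", "C"]
assert lookup(key) == ["X", "Y"]
```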
""" all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) - best_plevel_impl = max(all_impls, key=lambda x: x.plevel) + + # If not use autotvm, always return the implementation with the highest priority if not use_autotvm: logger.info( "Using %s for %s based on highest priority (%d)", @@ -196,6 +196,20 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) outs = best_plevel_impl.compute(attrs, inputs, out_type) return best_plevel_impl, outs + # If auto-scheduler is enabled for Relay, always prefer auto-scheduler + if auto_scheduler.is_relay_integration_enabled(): + auto_scheduler_impls = [] + for impl in all_impls: + if impl.name.endswith(auto_scheduler.relay_integration.auto_schedule_impl_suffix): + auto_scheduler_impls.append(impl) + + if auto_scheduler_impls: + assert len(auto_scheduler_impls) == 1 + impl = auto_scheduler_impls[0] + outs = impl.compute(attrs, inputs, out_type) + return impl, outs + + # Otherwise, try autotvm templates outputs = {} workloads = {} best_autotvm_impl = None @@ -219,6 +233,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) best_autotvm_impl = impl best_cfg = cfg autotvm.GLOBAL_SCOPE.silent = False + if best_autotvm_impl: # The best autotvm implementation definitely doesn't use fallback config logger.info( @@ -228,6 +243,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) best_cfg.cost, ) return best_autotvm_impl, outputs[best_autotvm_impl] + # Use the implementation with highest plevel if workloads[best_plevel_impl] is not None: msg = ( @@ -371,6 +387,9 @@ def items(self): assert len(res) % 2 == 0 return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] + def get_current_ccache_key(self): + return _backend._CompileEngineGetCurrentCCacheKey(self) + def dump(self): """Return a string representation of engine dump. diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 755659a2819f..fa420c4e71a3 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -18,10 +18,10 @@ """The base node types for the Relay language.""" import tvm._ffi import tvm.ir +from tvm.auto_scheduler.relay_integration import auto_schedule_topi, auto_schedule_impl_suffix from tvm.driver import lower, build - -from ...target import get_native_generic_func, GenericFunc -from ...runtime import Object +from tvm.target import get_native_generic_func, GenericFunc +from tvm.runtime import Object from . import _make @@ -144,6 +144,30 @@ def add_implementation(self, compute, schedule, name="default", plevel=10): """ _OpStrategyAddImplementation(self, compute, schedule, name, plevel) + def add_auto_scheduler(self, compute, name, plevel=10): + """Add an implementation using the auto-scheduler. + + Parameters + ---------- + compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type) + -> List[Tensor] + The compute function. + + name : str + The name of implementation. + + plevel : int + The priority level of implementation. 
+        """
+
+        def wrap_schedule(attrs, outs, target):
+            with target:
+                return auto_schedule_topi(outs)
+
+        self.add_implementation(
+            compute, wrap_schedule, name=name + auto_schedule_impl_suffix, plevel=plevel
+        )
+
 
 def _wrap_default_fstrategy(compute, schedule, name):
     def _fstrategy(attrs, inputs, out_type, target):
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 187ea01c47b8..b7ceda304639 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -20,9 +20,9 @@
 import tvm
 from tvm.te import SpecializedCondition
 from tvm.contrib import nvcc
+from tvm._ffi import get_global_func
 from .generic import *
 from .. import op as _op
-from .... import get_global_func
 
 
 @schedule_injective.register(["cuda", "gpu"])
@@ -160,6 +160,11 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
                 wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                 name="conv2d_nhwc.cuda",
             )
+
+            strategy.add_auto_scheduler(
+                wrap_compute_conv2d(topi.nn.conv2d_nhwc), name="conv2d_nhwc"
+            )
+
             N, H, W, _ = get_const_tuple(data.shape)
             KH, KW, CI, CO = get_const_tuple(kernel.shape)
             # Winograd shape related judgment
@@ -575,6 +580,12 @@ def dense_strategy_cuda(attrs, inputs, out_type, target):
             wrap_topi_schedule(topi.cuda.schedule_dense_small_batch),
             name="dense_small_batch.cuda",
         )
+
+        strategy.add_auto_scheduler(
+            wrap_compute_dense(topi.nn.dense),
+            name="dense",
+        )
+
         with SpecializedCondition(b >= 32):
             strategy.add_implementation(
                 wrap_compute_dense(topi.cuda.dense_large_batch),
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 273bee41cf75..276bf67bd463 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -21,8 +21,8 @@
 import re
 from tvm import topi
 from tvm.topi.utils import get_const_int, get_const_float, get_const_tuple, get_float_tuple
+from tvm.target import generic_func, override_native_generic_func
 from .. import op as _op
-from ....target import generic_func, override_native_generic_func
 
 logger = logging.getLogger("strategy")
 
diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
index 5d6d1d28be1c..e4e186bc11d7 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -258,7 +258,7 @@ std::pair<Array<MeasureInput>, Array<MeasureResult>> SketchPolicyNode::ContinueS
 Array<State> SketchPolicyNode::SearchOneRound(int num_random_states, Array<State>* random_states) {
   // Get parameters
-  int population = GetIntParam(params, SketchParamKey::SampleInitPopulation::population);
+  int population = GetIntParam(params, SketchParamKey::EvolutionarySearch::population);
   int num_use_measured = std::min(
       static_cast<int>(measured_states_vector_.size()),
       static_cast<int>(
@@ -272,8 +272,7 @@ Array<State> SketchPolicyNode::SearchOneRound(int num_random_states, Array<Stat
 
   // 2. Sample the init population
-  Array<State> init_population = SampleInitPopulation(
-      sketch_cache_, is_cost_model_reasonable ? population - num_use_measured : population);
+  Array<State> init_population = SampleInitPopulation(sketch_cache_);
 
   // 3. Perform evolutionary search if a cost model is utilized. Otherwise,
   // just return some random states.
@@ -364,28 +363,33 @@ Array<State> SketchPolicyNode::GenerateSketches() {
   return out_states;
 }
 
-Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches, int out_size) {
+Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches) {
+  // Use this population as the parallel degree to do sampling
+  int population = GetIntParam(params, SketchParamKey::EvolutionarySearch::population);
+  // At least we should sample this number of valid programs
+  int min_population = GetIntParam(params, SketchParamKey::SampleInitPopulation::min_population);
+
   int fail_ct = 0;
   Array<State> out_states;
   std::vector<std::mt19937> rand_gens;
-  rand_gens.reserve(out_size);
-  for (int i = 0; i < out_size; i++) {
+  rand_gens.reserve(population);
+  for (int i = 0; i < population; i++) {
     rand_gens.push_back(std::mt19937(rand_gen()));
   }
   auto tic_begin = std::chrono::high_resolution_clock::now();
 
   size_t iter = 1;
-  size_t target_size = out_size;
+  size_t target_size = min_population;
   size_t unchange_cnt = 0;
   while (out_states.size() < target_size) {
-    std::vector<State> temp_states(out_size);
+    std::vector<State> temp_states(population);
 
-    // Initial a batch of states randomly
-    support::parallel_for(0, out_size,
+    // Sample a batch of states randomly
+    support::parallel_for(0, population,
                           [this, &temp_states, &sketches, &rand_gens](int index) {
                             // Randomly choose a sketch
                             State tmp_s = sketches[(rand_gens[index])() % sketches.size()];
-                            // Derivation rule based enumeration
+                            // Apply random annotation rules one by one
                             bool valid = true;
                             for (const auto& rule : init_rules) {
                               if (rule->Apply(this, &tmp_s, &rand_gens[index]) ==
@@ -646,10 +650,10 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicyGenerateSketches")
     .set_body_typed([](SketchPolicy policy) { return policy->GenerateSketches(); });
 
 TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicySampleInitialPopulation")
-    .set_body_typed([](SketchPolicy policy, int pop_size) {
+    .set_body_typed([](SketchPolicy policy) {
      const Array<State>& sketches = policy->GenerateSketches();
 
-      Array<State> init_population = policy->SampleInitPopulation(sketches, pop_size);
+      Array<State> init_population = policy->SampleInitPopulation(sketches);
       return init_population;
     });
 
diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h
index 930fd5ecbc4b..3d135d1bda94 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.h
+++ b/src/auto_scheduler/search_policy/sketch_policy.h
@@ -57,8 +57,8 @@ struct SketchParamKey {
   static constexpr const char* empty_retry_count = "retry_search_one_round_on_empty";
 
   struct SampleInitPopulation {
-    /*! \brief The population size of initial sampling. */
-    static constexpr const char* population = "sample_init_population";
+    /*! \brief The minimal size of valid population in the initial sampling. */
+    static constexpr const char* min_population = "sample_init_min_population";
     /*! \brief The maximum percentage of measured states in the initial sampling. */
     static constexpr const char* use_measured_ratio = "sample_init_use_measured_ratio";
   };
@@ -124,10 +124,9 @@ class SketchPolicyNode : public SearchPolicyNode {
   /*!
    * \brief Sample the init population.
    * \param sketches The initial sketches for the sampled population
-   * \param out_size The number of output states.
    * \return The generated states (the initial population).
    */
-  Array<State> SampleInitPopulation(const Array<State>& sketches, int out_size);
+  Array<State> SampleInitPopulation(const Array<State>& sketches);
 
   /*!
    * \brief Perform evolutionary search.
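With the rename above, callers set a floor of valid samples through `sample_init_min_population` instead of passing a size to `sample_initial_population`. A usage sketch mirroring the unit-test updates later in this patch (assumes this change is applied):

```python
import tvm
from tvm import auto_scheduler, te


@auto_scheduler.register_workload
def matmul(N):
    A = te.placeholder((N, N), name="A")
    B = te.placeholder((N, N), name="B")
    k = te.reduce_axis((0, N), name="k")
    C = te.compute((N, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    return [A, B, C]


task = auto_scheduler.create_task(matmul, (128,), "llvm")
policy = auto_scheduler.SketchPolicy(task, verbose=0)
# The policy now samples batches of `evolutionary_search_population` states
# until at least `sample_init_min_population` valid ones exist; slice to cap.
states = policy.sample_initial_population()[:50]
```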
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index 556687c453ac..767cb6f644de 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -620,6 +620,7 @@ class CompileEngineImpl : public CompileEngineNode {
   }
 
   void Clear() final { cache_.clear(); }
+
   // List all items in the cache.
   Array<ObjectRef> ListItems() {
     std::lock_guard<std::mutex> lock(mutex_);
@@ -630,6 +631,13 @@ class CompileEngineImpl : public CompileEngineNode {
     }
     return items;
   }
+
+  /*!
+   * \brief Get the cache key of the function that is being lowered currently
+   * \return the cache key
+   */
+  CCacheKey GetCurrentCCacheKey() { return cur_ccache_key_; }
+
   /*!
    * \brief Create schedule for target.
    * \param source_func The primitive function to be lowered.
@@ -656,6 +664,8 @@ class CompileEngineImpl : public CompileEngineNode {
       value->use_count = 0;
       cache_[key] = value;
     }
+    cur_ccache_key_ = key;
+
     // No need to lower external functions for now. We will invoke the external
     // codegen tool once and lower all functions together.
     if (key->source_func->GetAttr<String>(attr::kCompiler).defined()) {
@@ -770,6 +780,8 @@ class CompileEngineImpl : public CompileEngineNode {
   std::unordered_map<CCacheKey, CCacheValue> cache_;
   /*! \brief internal compiler cache for shape funcs */
   std::unordered_map<CCacheKey, CCacheValue> shape_func_cache_;
+  /*! \brief the cache key of the function that is being lowered currently */
+  CCacheKey cur_ccache_key_;
 };
 
 /*! \brief The global compile engine */
@@ -811,7 +823,17 @@ TVM_REGISTER_GLOBAL("relay.backend._CompileEngineJIT")
     .set_body_typed([](CompileEngine self, CCacheKey key) { return self->JIT(key); });
 
 TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListItems").set_body_typed([](CompileEngine self) {
-  return static_cast<CompileEngineImpl*>(self.operator->())->ListItems();
+  CompileEngineImpl* ptr = dynamic_cast<CompileEngineImpl*>(self.operator->());
+  ICHECK(ptr != nullptr);
+  return ptr->ListItems();
 });
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGetCurrentCCacheKey")
+    .set_body_typed([](CompileEngine self) {
+      CompileEngineImpl* ptr = dynamic_cast<CompileEngineImpl*>(self.operator->());
+      ICHECK(ptr != nullptr);
+      return ptr->GetCurrentCCacheKey();
+    });
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py b/tests/python/relay/test_auto_scheduler_task_extraction.py
new file mode 100644
index 000000000000..63d4a6f6a404
--- /dev/null
+++ b/tests/python/relay/test_auto_scheduler_task_extraction.py
@@ -0,0 +1,90 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test task extraction for auto-scheduler""" +import tvm.relay.testing +import tvm.testing +from tvm import auto_scheduler, relay + + +def get_network(name, batch_size=1, layout="NHWC"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefer NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + if name == "resnet-18": + mod, params = relay.testing.resnet.get_workload( + num_layers=18, batch_size=batch_size, layout=layout, image_shape=image_shape + ) + elif name == "resnet-50": + mod, params = relay.testing.resnet.get_workload( + num_layers=50, batch_size=batch_size, layout=layout, image_shape=image_shape + ) + elif name == "resnet3d-18": + mod, params = relay.testing.resnet_3d.get_workload( + num_layers=18, batch_size=batch_size, layout=layout, image_shape=image_shape + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, image_shape=image_shape + ) + elif name == "dcgan": + mod, params = relay.testing.dcgan.get_workload(batch_size=batch_size, layout=layout) + elif name == "mlp": + data = relay.var("data", shape=(batch_size, 32)) + fc1 = relay.nn.dense(data, relay.var("fc1_weight"), units=32) + fc1 = relay.nn.bias_add(fc1, relay.var("fc1_bias"), axis=-1) + act1 = relay.nn.relu(fc1) + fc2 = relay.nn.dense(act1, relay.var("fc2_weight"), units=32) + fc2 = relay.nn.bias_add(fc2, relay.var("fc2_bias"), axis=-1) + act2 = relay.nn.relu(fc2) + mlp = act2 + args = relay.analysis.free_vars(act2) + mlp = relay.Function(args, mlp) + mod, params = relay.testing.init.create_workload(mlp) + else: + raise ValueError("Unsupported network: " + name) + + return mod, params + + +@tvm.testing.requires_cuda +def test_task_extraction_cuda(): + auto_scheduler.enable_relay_integration() + + mod, params = get_network("mlp") + target = tvm.target.Target("cuda") + tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + + assert len(tasks) == 1 + assert sum(task_weights) == 2 + + mod, params = get_network("resnet-18") + target = tvm.target.Target("cuda") + tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + + assert len(tasks) == 21 + assert sum(task_weights) == 22 + + +if __name__ == "__main__": + test_task_extraction_cuda() diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py new file mode 100644 index 000000000000..b8f5145de4aa --- /dev/null +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Test end-to-end network tuning with auto-scheduler""" +import tempfile + +import tvm.testing +from tvm import auto_scheduler, relay + +from test_auto_scheduler_task_extraction import get_network + + +@tvm.testing.requires_cuda +def test_tuning_cuda(): + auto_scheduler.enable_relay_integration() + + # Extract tasks + mod, params = get_network("mlp") + target = tvm.target.Target("cuda") + tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + objective = lambda costs: sum(c * w for c, w in zip(costs, task_weights)) + + with tempfile.NamedTemporaryFile() as fp: + log_file = fp.name + + # Tuning + measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=100) + tuner = auto_scheduler.TaskScheduler(tasks, objective) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=2, + num_measures_per_round=1, + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + tuner.tune(tune_option, search_policy="sketch.random") + del measure_ctx + + # Compile with the history best + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + + # Todo(merrymercy): compile without any history to test the fallback mechanism + + auto_scheduler.enable_relay_integration(False) + + +if __name__ == "__main__": + test_tuning_cuda() diff --git a/tests/python/unittest/test_auto_scheduler_cost_model.py b/tests/python/unittest/test_auto_scheduler_cost_model.py index 62acb6b8e387..5ed736a5b8cb 100644 --- a/tests/python/unittest/test_auto_scheduler_cost_model.py +++ b/tests/python/unittest/test_auto_scheduler_cost_model.py @@ -32,7 +32,7 @@ def get_sample_records(number): N = 128 task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), "llvm") policy = auto_scheduler.SketchPolicy(task, verbose=0) - states = policy.sample_initial_population(number) + states = policy.sample_initial_population()[:number] inputs = [auto_scheduler.MeasureInput(task, s) for s in states] results = [ diff --git a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py index 4acfa3908cc6..70bea3afd849 100644 --- a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py +++ b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py @@ -52,7 +52,7 @@ def predict(self, task, states): dag = auto_scheduler.ComputeDAG(workload_key) task = auto_scheduler.SearchTask(dag, workload_key, tvm.target.Target("llvm")) policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0) - states = policy.sample_initial_population(50) + states = policy.sample_initial_population()[:50] bad_states = [] for state in states: @@ -98,7 +98,7 @@ def predict(self, task, states): found = False retry_ct = 0 while retry_ct < 10 and not found: - states = policy.sample_initial_population(100) + states = policy.sample_initial_population()[:100] bad_states = [] for state in states: if not MockCostModel.is_good_state(state): From 2a1bae5c3fe84d2b2e6a0332449ff2c85c745c6c Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 31 Oct 2020 03:02:34 -0700 Subject: [PATCH 092/258] Fix mutate auto unroll (#6807) --- python/tvm/auto_scheduler/measure.py | 6 ++++-- .../search_policy/sketch_policy_rules.cc | 10 +++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 0121ddf37d03..642e8f85e86b 
100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -42,8 +42,6 @@ from tvm.runtime import Object, module, ndarray from tvm.driver import build_module from tvm.ir import transform -from tvm.rpc.tracker import Tracker -from tvm.rpc.server import Server from tvm.autotvm.measure.measure_methods import set_cuda_target_arch from tvm.contrib import tar, ndk @@ -481,6 +479,10 @@ def __init__( cooldown_interval=0.0, enable_cpu_cache_flush=False, ): + # pylint: disable=import-outside-toplevel + from tvm.rpc.tracker import Tracker + from tvm.rpc.server import Server + ctx = tvm.context("cuda", 0) if ctx.exist: cuda_arch = "sm_" + "".join(ctx.compute_version.split(".")) diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index 1b6cc06a4c45..692ace103be3 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -998,10 +998,14 @@ PopulationGenerationRule::ResultKind MutateAutoUnroll::Apply(SketchPolicyNode* p ICHECK(ps); // Mutate its value to a random candidates - auto val = std::to_string(auto_unroll_configs[(*rand_gen)() % auto_unroll_configs.size()]); + int val = auto_unroll_configs[(*rand_gen)() % auto_unroll_configs.size()]; StateNode* pstate = state->CopyOnWrite(); - pstate->transform_steps.Set(step_id, PragmaStep(ps->stage_id, ps->iter_id, - std::string("auto_unroll_max_step") + "$" + val)); + pstate->transform_steps.Set( + step_id, PragmaStep(ps->stage_id, ps->iter_id, + std::string("auto_unroll_max_step") + "$" + std::to_string(val))); + Stage new_stage = pstate->stages[ps->stage_id]; + new_stage.CopyOnWrite()->attrs.auto_unroll_max_step = val; + pstate->stages.Set(ps->stage_id, new_stage); return ResultKind::kValid; } From f73b1281c024b8d5c7aa1cb4d2fe1604643ddd72 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Sat, 31 Oct 2020 13:12:44 +0000 Subject: [PATCH 093/258] [CI] Pin h5py version to < 3.0 to workaround issues with TF/Keras (#6808) * Pin h5py to use the previous major release (2.x) and not new version 3.0, due to incompatibilities with TF and Keras that make TVMC and Frontend tests to fail --- docker/install/ubuntu_install_tensorflow.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh index 21c88d6f8ece..a95702d530a5 100755 --- a/docker/install/ubuntu_install_tensorflow.sh +++ b/docker/install/ubuntu_install_tensorflow.sh @@ -20,4 +20,7 @@ set -e set -u set -o pipefail -pip3 install tensorflow==2.3.1 keras==2.4.3 h5py +# h5py is pinned to minor than 3 due to issues with +# tensorflow: +# https://github.com/tensorflow/tensorflow/issues/44467 +pip3 install tensorflow==2.3.1 keras==2.3.1 "h5py<3.0" From 89d790142b0bd3ebef8056615f4b0ed830358442 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Sat, 31 Oct 2020 06:21:48 -0700 Subject: [PATCH 094/258] Extract channels from weight shape for conv2d. 
(#6805)

---
 src/runtime/contrib/tensorrt/tensorrt_ops.cc | 2 +-
 tests/python/contrib/test_tensorrt.py        | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
index 4c5eeea1e644..a86f107941bc 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
@@ -242,7 +242,7 @@ class Conv2DOpConverter : public TensorRTOpConverter {
     auto str_dilation = params->node.GetAttr<std::vector<std::string>>("dilation");
     auto str_padding = params->node.GetAttr<std::vector<std::string>>("padding");
     int groups = std::stoi(params->node.GetAttr<std::vector<std::string>>("groups")[0]);
-    int channels = std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
+    int channels = weight_shape[0];
     // TRT conv2d op doesn't support asymmetric padding before 5.1, so we
     // workaround by adding a padding layer before the pooling op.
     nvinfer1::DimsHW prepadding, postpadding;
diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py
index 6f615397db58..9faf51f397f3 100644
--- a/tests/python/contrib/test_tensorrt.py
+++ b/tests/python/contrib/test_tensorrt.py
@@ -251,7 +251,6 @@ def get_graph(
         out = relay.nn.conv2d(
             x,
             kernel,
-            channels=k_shape[0],
             kernel_size=k_shape[2:4],
             groups=groups,
             padding=padding,

From e012198c7fee35d7f003c9b9f18dac64cba2dcd1 Mon Sep 17 00:00:00 2001
From: Andrew Reusch
Date: Sat, 31 Oct 2020 06:24:01 -0700
Subject: =?UTF-8?q?[=C2=B5TVM]=20Add=20serial=20transport,?=
 =?UTF-8?q?=20parameterize=20=C2=B5TVM=20Zephyr=20test,=20run=20on=20physi?=
 =?UTF-8?q?cal=20HW=20(#6789)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [BUGFIX] Respect infinite-timed session start timeouts.

 * When debugging, the intended behavior is to set the session start
   timeout to infinite to allow the user to configure the debugger.
 * At present, if a session start retry timeout is defined, the current
   logic will bail after the retry timeout expires.
 * This change makes the session start logic retry forever, once per
   retry timeout.

* Document RPCEndpoint::Create.

* Add stm32f746xx to tvm.target.micro() call; fix parameter name.

 * This API is expected to just be used with positional args, not kwargs,
   so this change isn't expected to cause any breakage.
 * model is more inline with the rest of the file, given TVM Target
   Specification RFC.

* [BUGFIX] If session start fails, exit transport context manager.

 * If an error occurred during session setup, then complex transports
   e.g. DebugWrapperTransport would not de-initialize.

* Align transport writes/reads in TransportLogger

* fix syntax errors which were not exercised in previous PR

* Remove microTVM logic from standard RPC server, add debug shell.

 * microTVM uses the host RPC server as a way to launch a debugger in a
   dedicated, separate terminal window. microTVM needs to be able to launch
   the debugger itself, because its model of the device flash/debug flow
   separates these two things into distinct operations implemented by shell
   commands (for maximum portability across frameworks).
 * microTVM can be configured to launch the debugger (e.g. GDB) in the same
   terminal as is used for flashing, but this is sub-optimal because then
   it hides any logs emitted by the device.
* Using the standard RPC server was hard because GDB expects the user to issue SIGINT to interrupt program flow, but due to the RPC server's necessary use of multiprocessing, multiple signal handlers needed to be SIG_IGN'd, and further, because libtvm.so is intentionally frontend-agnostic, it's difficult to include signal handling directly in that binary (Python expects you to call PyErr_CheckSignals, but we don't require and don't want to require python-dev to compile libtvm.so, and this is the only such case where libtvm.so is expected to block the main thread for a long period of time). * Here we implement a separate microTVM debug shell python script using the non-blocking server implementation. * Add serial transport, parameterize test_zephyr to work on real hardware * add pytest test fixture, missed from previous change. * this test fixture helps to parameterize the test case * address leandron@ comment from #6703 --- python/tvm/exec/microtvm_debug_shell.py | 152 +++++++++++++++ python/tvm/micro/contrib/zephyr.py | 37 ++-- python/tvm/micro/debugger.py | 173 ++++++++++++++++-- python/tvm/micro/session.py | 36 ++-- python/tvm/micro/transport/base.py | 20 +- python/tvm/micro/transport/debug.py | 4 +- python/tvm/micro/transport/file_descriptor.py | 2 +- python/tvm/micro/transport/serial.py | 128 +++++++++++++ python/tvm/rpc/server.py | 12 +- python/tvm/target/target.py | 17 +- src/runtime/micro/micro_session.cc | 13 +- src/runtime/rpc/rpc_endpoint.cc | 8 + tests/micro/qemu/.gitignore | 2 +- tests/micro/qemu/conftest.py | 32 ++++ tests/micro/qemu/test_zephyr.py | 46 ++--- 15 files changed, 580 insertions(+), 102 deletions(-) create mode 100644 python/tvm/exec/microtvm_debug_shell.py create mode 100644 python/tvm/micro/transport/serial.py create mode 100644 tests/micro/qemu/conftest.py diff --git a/python/tvm/exec/microtvm_debug_shell.py b/python/tvm/exec/microtvm_debug_shell.py new file mode 100644 index 000000000000..576c07e7fe9e --- /dev/null +++ b/python/tvm/exec/microtvm_debug_shell.py @@ -0,0 +1,152 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=redefined-outer-name, invalid-name +"""Start an RPC server intended for use as a microTVM debugger. + +microTVM aims to be runtime-agnostic, and to that end, frameworks often define command-line tools +used to launch a debug flow. These tools often manage the process of connecting to an attached +device using a hardware debugger, exposing a GDB server, and launching GDB connected to that +server with a source file attached. It's also true that this debugger can typically not be executed +concurrently with any flash tool, so this integration point is provided to allow TVM to launch and +terminate any debuggers integrated with the larger microTVM compilation/autotuning flow. 
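A hedged sketch of the intended two-terminal workflow; the flasher-side parameter name below is hypothetical (the exact wiring depends on the compiler integration in use):

```python
# Terminal 1: start the debug shell added in this patch.
#   python -m tvm.exec.microtvm_debug_shell --host 0.0.0.0 --port 9090
#
# Terminal 2: connect to it and hand the session to the flasher, so GDB is
# launched in the dedicated window instead of the flashing terminal.
import tvm.rpc

debug_session = tvm.rpc.connect("127.0.0.1", 9090)
flasher_kw = {"debug_rpc_session": debug_session}  # hypothetical parameter name
```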
+ +To use this tool, first launch this script in a separate terminal window. Then, provide the hostport +to your compiler's Flasher instance. +""" + +import argparse +import logging +import socket +import struct + +import tvm.micro.debugger as _ # NOTE: imported to expose global PackedFuncs over RPC. + +from .._ffi.base import py_str +from ..rpc import base +from ..rpc import _ffi_api + + +_LOG = logging.getLogger(__name__) + + +def parse_args(): + """Parse command line arguments to this script.""" + parser = argparse.ArgumentParser(description="microTVM debug-tool runner") + parser.add_argument("--host", default="0.0.0.0", help="hostname to listen on") + parser.add_argument("--port", type=int, default=9090, help="hostname to listen on") + parser.add_argument( + "--impl", + help=( + "If given, name of a module underneath tvm.micro.contrib " + "which contains the Debugger implementation to use. For example, to enable a " + "debugger named BarDebugger in python/tvm/micro/contrib/foo.py, specify either " + "'tvm.micro.contrib.foo' or 'foo' here. To enable a debugger named BazDebugger in " + "a third-party module ext_package.debugger, specify 'ext_package.debugger' here. " + "NOTE: the module cannot be in a sub-package of tvm.micro.contrib." + ), + ) + + return parser.parse_args() + + +class ConnectionClosedError(Exception): + """Raised when the connection is closed.""" + + +def handle_conn(conn, rpc_key): + """Handle a single connection that has just been accept'd().""" + + def send(data): + conn.sendall(data) + return len(data) + + magic = struct.unpack(" [IoTimeoutError %.2f s]", + "%s read {%5.2fs} %4d B -> [IoTimeoutError %.2f s]", self.name, timeout_sec, n, @@ -223,7 +223,7 @@ def read(self, n, timeout_sec): except Exception as err: self.logger.log( self.level, - "%s read {%3.2fs} %4d B -> [err: %s]", + "%s read {%5.2fs} %4d B -> [err: %s]", self.name, timeout_sec, n, @@ -236,7 +236,7 @@ def read(self, n, timeout_sec): if len(hex_lines) > 1: self.logger.log( self.level, - "%s read {%3.2fs} %4d B -> [%d B]:\n%s", + "%s read {%5.2fs} %4d B -> [%3d B]:\n%s", self.name, timeout_sec, n, @@ -246,7 +246,7 @@ def read(self, n, timeout_sec): else: self.logger.log( self.level, - "%s read {%3.2fs} %4d B -> [%d B]: %s", + "%s read {%5.2fs} %4d B -> [%3d B]: %s", self.name, timeout_sec, n, @@ -262,7 +262,7 @@ def write(self, data, timeout_sec): except IoTimeoutError: self.logger.log( self.level, - "%s write <- [%d B]: [IoTimeoutError %.2f s]", + "%s write <- [%3d B]: [IoTimeoutError %.2f s]", self.name, len(data), timeout_sec, @@ -271,7 +271,7 @@ def write(self, data, timeout_sec): except Exception as err: self.logger.log( self.level, - "%s write <- [%d B]: [err: %s]", + "%s write <- [%3d B]: [err: %s]", self.name, len(data), str(err), @@ -283,14 +283,18 @@ def write(self, data, timeout_sec): if len(hex_lines) > 1: self.logger.log( self.level, - "%s write <- [%d B]:\n%s", + "%s write <- [%3d B]:\n%s", self.name, bytes_written, "\n".join(hex_lines), ) else: self.logger.log( - self.level, "%s write <- [%d B]: %s", self.name, bytes_written, hex_lines[0] + self.level, + "%s write <- [%3d B]: %s", + self.name, + bytes_written, + hex_lines[0], ) return bytes_written diff --git a/python/tvm/micro/transport/debug.py b/python/tvm/micro/transport/debug.py index 6fc14f8a7a3d..e897b3d99df8 100644 --- a/python/tvm/micro/transport/debug.py +++ b/python/tvm/micro/transport/debug.py @@ -37,7 +37,9 @@ def timeouts(self): child_timeouts = self.transport.timeouts() return TransportTimeouts( 
session_start_retry_timeout_sec=( - 0 if self.disable_session_start_retry else child_timeouts.session_start_retry + 0 + if self.disable_session_start_retry + else child_timeouts.session_start_retry_timeout_sec ), session_start_timeout_sec=0, session_established_timeout_sec=0, diff --git a/python/tvm/micro/transport/file_descriptor.py b/python/tvm/micro/transport/file_descriptor.py index ce3025ccbf55..3f69c4c26751 100644 --- a/python/tvm/micro/transport/file_descriptor.py +++ b/python/tvm/micro/transport/file_descriptor.py @@ -73,7 +73,7 @@ def _await_ready(self, rlist, wlist, timeout_sec=None, end_time=None): timeout_sec = max(0, end_time - time.monotonic()) rlist, wlist, xlist = select.select(rlist, wlist, rlist + wlist, timeout_sec) if not rlist and not wlist and not xlist: - raise IoTimeoutError() + raise base.IoTimeoutError() return True diff --git a/python/tvm/micro/transport/serial.py b/python/tvm/micro/transport/serial.py new file mode 100644 index 000000000000..3b36f1e0e83f --- /dev/null +++ b/python/tvm/micro/transport/serial.py @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Defines a Transport implementation using pyserial.""" + +import atexit +import time +import serial +import serial.tools.list_ports +from .base import IoTimeoutError, Transport, TransportTimeouts + + +_DEFAULT_SERIAL_TIMEOUTS = TransportTimeouts( + session_start_retry_timeout_sec=5, + session_start_timeout_sec=10.0, + session_established_timeout_sec=30.0, +) + + +class SerialTransport(Transport): + """A Transport implementation using pySerial.""" + + _OPEN_PORTS = [] + + @classmethod + def close_atexit(cls): + """Close all serial ports before exit. + + Some USB-UART kernel drivers are particularly sensitive to being left open (i.e. require + unplugging and replugging of attached hardware or reboot of machine); try very hard to + close all serial ports at exit. 
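A self-contained sketch of the tracked-resource/atexit pattern that `close_atexit` above relies on (illustrative class, not TVM code):

```python
import atexit


class TrackedPort:
    """Tracks every open instance so stragglers can be closed at exit."""

    _OPEN_PORTS = []

    def open(self):
        TrackedPort._OPEN_PORTS.append(self)

    def close(self):
        if self in TrackedPort._OPEN_PORTS:
            TrackedPort._OPEN_PORTS.remove(self)
        # ... release the underlying resource here ...

    @classmethod
    def close_atexit(cls):
        for port in list(cls._OPEN_PORTS):
            try:
                port.close()
            except Exception:  # never let cleanup abort interpreter exit
                pass
        cls._OPEN_PORTS = []


atexit.register(TrackedPort.close_atexit)
```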
+ """ + for port in cls._OPEN_PORTS: + try: + port.close() + except Exception: # pylint: disable=broad-except + _LOG.warn("exception closing port", exc_info=True) + + cls._OPEN_PORTS = [] + + def __init__(self, grep=None, port_path=None, timeouts=None, **kw): + self._port_path = port_path + self._grep = grep + self._timeouts = timeouts if timeouts is not None else _DEFAULT_SERIAL_TIMEOUTS + self._kw = kw + if self._port_path is None and self._grep is None: + raise SerialPortNotFoundError("Must specify one of grep= or port_path=") + + def timeouts(self): + return self._timeouts + + def open(self): + if self._port_path is not None: + port_path = self._port_path + else: + ports = list(serial.tools.list_ports.grep(self._grep, include_links=True)) + if len(ports) != 1: + raise SerialPortNotFoundError( + f"grep expression should find 1 serial port; found {ports!r}" + ) + + port_path = ports[0].device + + self._port = serial.Serial(port_path, timeout=0.1, exclusive=True, **self._kw) + self._port.cancel_read() + self._port.reset_input_buffer() + self._port.reset_output_buffer() + self._OPEN_PORTS.append(self._port) + + def close(self): + if self._port is None: + return + + self._port.close() + self._OPEN_PORTS.remove(self._port) + self._port = None + + def read(self, n, timeout_sec): + end_time = time.monotonic() + timeout_sec + to_return = bytearray() + while True: + timeout_remaining = end_time - time.monotonic() + if timeout_sec != 0 and timeout_remaining < 0: + break + + # Read until *something* can be returned. If nothing is sent within 5 chars' time, stop. + # 5 is an arbitrary number. + self._port.timeout = 1 / self._port.baudrate * 5 + try: + data = self._port.read(n if timeout_sec != 0 else 1) + if not data and to_return: + break + + to_return.extend(data) + except serial.SerialTimeoutException: + if to_return: + break + + if not to_return: + raise IoTimeoutError() + + return to_return + + def write(self, data, timeout_sec): + self._port.write_timeout = timeout_sec + try: + to_return = self._port.write(data) + self._port.flush() + return to_return + except serial.SerialTimeoutException: + raise IoTimeoutError() + + +atexit.register(SerialTransport.close_atexit) diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py index 9489a734eb8b..786154253133 100644 --- a/python/tvm/rpc/server.py +++ b/python/tvm/rpc/server.py @@ -235,12 +235,13 @@ def _accept_conn(listen_sock, tracker_conn, ping_period=2): server_proc = multiprocessing.Process( target=_serve_loop, args=(conn, addr, load_library, work_path) ) - server_proc.deamon = True + server_proc.start() # close from our side. 
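A hedged usage sketch for the SerialTransport defined above; the grep pattern (an ST-Link USB VID:PID) and baud rate are illustrative, and extra keyword arguments are forwarded to `serial.Serial`:

```python
from tvm.micro.transport.serial import SerialTransport

# Either grep= (matched against the USB device list) or port_path= is required.
transport = SerialTransport(grep="0483:374b", baudrate=115200)
transport.open()
try:
    transport.write(b"\x01", timeout_sec=1.0)    # raises IoTimeoutError on stall
    reply = transport.read(64, timeout_sec=1.0)  # returns once bytes stop arriving
finally:
    transport.close()
```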
conn.close() # wait until server process finish or timeout server_proc.join(opts.get("timeout", None)) + if server_proc.is_alive(): logger.info("Timeout in RPC session, kill..") # pylint: disable=import-outside-toplevel @@ -280,7 +281,6 @@ def _connect_proxy_loop(addr, key, load_library): opts = _parse_server_opt(remote_key.split()[1:]) logger.info("connected to %s", str(addr)) process = multiprocessing.Process(target=_serve_loop, args=(sock, addr, load_library)) - process.deamon = True process.start() sock.close() process.join(opts.get("timeout", None)) @@ -362,8 +362,6 @@ def __init__( load_library=None, custom_addr=None, silent=False, - utvm_dev_id=None, - utvm_dev_config_args=None, ): try: if _ffi_api.ServerLoop is None: @@ -397,10 +395,6 @@ def __init__( cmd += ["--custom-addr", custom_addr] if silent: cmd += ["--silent"] - if utvm_dev_id is not None: - assert utvm_dev_config_args is not None - cmd += [f"--utvm-dev-id={utvm_dev_id}"] - cmd += [f"--utvm-dev-config-args={utvm_dev_config_args}"] # prexec_fn is not thread safe and may result in deadlock. # python 3.2 introduced the start_new_session parameter as @@ -437,13 +431,11 @@ def __init__( target=_listen_loop, args=(self.sock, self.port, key, tracker_addr, load_library, self.custom_addr), ) - self.proc.deamon = True self.proc.start() else: self.proc = multiprocessing.Process( target=_connect_proxy_loop, args=((host, port), key, load_library) ) - self.proc.deamon = True self.proc.start() def terminate(self): diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 43426a554549..ba4a1a2f744e 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -220,19 +220,24 @@ def intel_graphics(model="unknown", options=None): return Target(" ".join(["opencl"] + opts)) -def micro(hardware="unknown", options=None): +def micro(model="unknown", options=None): """Returns a microTVM target. Parameters ---------- - hardware : str - Canonically identifies the target device; typicaly one of cortex-mX, or a specific SoC model - when that model has been tested to work with microTVM. + model : str + Canonically identifies the target device. This is typically a CPU or board level name (other + flags such as -mcpu identify the ISA). options : str or list of str Additional options """ - trans_table = {"host": ["-mcpu=native"]} - opts = _merge_opts(trans_table[hardware] + ["-runtime=c", "--system-lib"], options) + trans_table = { + "host": ["-mcpu=native"], + "stm32f746xx": ["-mcpu=cortex-m7"], + } + opts = _merge_opts( + trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], options + ) # NOTE: in the future, the default micro target will be LLVM except when # external dependencies are present. 
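Usage sketch for the renamed parameter: `model` both selects the `-mcpu` entry in the translation table above and is recorded in the resulting target string.

```python
import tvm

host_target = tvm.target.micro("host")          # adds -mcpu=native
board_target = tvm.target.micro("stm32f746xx")  # adds -mcpu=cortex-m7
# Both targets also carry -runtime=c --system-lib and -model=<model>.
print(board_target)
```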
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index ea7682d3de57..662597086d8a 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -159,7 +159,8 @@ class MicroTransportChannel : public RPCChannel {
           end_time - ::std::chrono::steady_clock::now()));
       if (!ReceiveUntil([this]() -> bool { return session_.IsEstablished(); }, time_remaining)) {
-        if (end_time >= session_start_end_time) {
+        if (session_start_timeout_ != ::std::chrono::microseconds::zero() &&
+            end_time >= session_start_end_time) {
           break;
         }
         end_time += session_start_retry_timeout_;
@@ -199,11 +200,13 @@ class MicroTransportChannel : public RPCChannel {
     did_receive_message_ = false;
     if (!ReceiveUntil([this]() -> bool { return did_receive_message_; },
                       session_established_timeout_)) {
-      std::stringstream ss;
-      ss << "MicroSessionTimeoutError: failed to read reply message after timeout "
-         << session_established_timeout_.count() / 1e6 << "s";
+      if (session_established_timeout_ != ::std::chrono::microseconds::zero()) {
+        std::stringstream ss;
+        ss << "MicroSessionTimeoutError: failed to read reply message after timeout "
+           << session_established_timeout_.count() / 1e6 << "s";
 
-      throw std::runtime_error(ss.str());
+        throw std::runtime_error(ss.str());
+      }
     }
   }
 
diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc
index 0f526007f49e..b8c2a3bb0b97 100644
--- a/src/runtime/rpc/rpc_endpoint.cc
+++ b/src/runtime/rpc/rpc_endpoint.cc
@@ -700,6 +700,14 @@ void RPCEndpoint::Init() {
   });
 }
 
+/*!
+ * \brief Create a new RPCEndpoint instance.
+ * \param channel RPCChannel used to communicate
+ * \param name Name of this session, used to identify log messages from this RPCEndpoint instance.
+ * \param remote_key The remote key reported during protocol initialization, or "%toinit" if the
+ *   RPCEndpoint should handle this phase of the protocol for you. Some servers may prefer to
+ *   access parts of the key to modify their behavior.
+ */
 std::shared_ptr<RPCEndpoint> RPCEndpoint::Create(std::unique_ptr<RPCChannel> channel,
                                                  std::string name, std::string remote_key) {
   std::shared_ptr<RPCEndpoint> endpt = std::make_shared<RPCEndpoint>();
diff --git a/tests/micro/qemu/.gitignore b/tests/micro/qemu/.gitignore
index 1066e164f0eb..c920d8f93ff8 100644
--- a/tests/micro/qemu/.gitignore
+++ b/tests/micro/qemu/.gitignore
@@ -1,2 +1,2 @@
-/test_zephyr-workspace
+/test_zephyr*-workspace
 /*.micro-binary
diff --git a/tests/micro/qemu/conftest.py b/tests/micro/qemu/conftest.py
new file mode 100644
index 000000000000..e6cd9f2ffb1a
--- /dev/null
+++ b/tests/micro/qemu/conftest.py
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ + +def pytest_addoption(parser): + parser.addoption( + "--microtvm-platforms", + default="host", + help=( + "Specify a comma-separated list of test models (i.e. as passed to tvm.target.micro()) " + "for microTVM tests." + ), + ) + + +def pytest_generate_tests(metafunc): + if "platform" in metafunc.fixturenames: + metafunc.parametrize("platform", metafunc.config.getoption("microtvm_platforms").split(",")) diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py index a5fff781ddef..2213203d55c1 100644 --- a/tests/micro/qemu/test_zephyr.py +++ b/tests/micro/qemu/test_zephyr.py @@ -21,7 +21,9 @@ import glob import os import subprocess +import sys +import pytest import numpy as np import tvm @@ -36,19 +38,20 @@ DEBUG = False -TARGET = tvm.target.target.micro("host") +TARGET = None -def _make_sess_from_op(op_name, sched, arg_bufs): +def _make_sess_from_op(model, zephyr_board, op_name, sched, arg_bufs): + target = tvm.target.target.micro(model) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): - mod = tvm.build(sched, arg_bufs, TARGET, target_host=TARGET, name=op_name) + mod = tvm.build(sched, arg_bufs, target, target_host=target, name=op_name) - return _make_session(mod) + return _make_session(model, target, zephyr_board, mod) -def _make_session(mod): - prev_build = f"{os.path.splitext(__file__)[0]}-last-build.micro-binary" - test_name = os.path.splitext(os.path.abspath(__file__))[0] +def _make_session(model, target, zephyr_board, mod): + test_name = f"{os.path.splitext(os.path.abspath(__file__))[0]}-{model}" + prev_build = f"{test_name}-last-build.micro-binary" workspace_root = ( f'{test_name}-workspace/{datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")}' ) @@ -60,7 +63,7 @@ def _make_session(mod): project_dir = os.path.join(os.path.dirname(__file__) or ".", "zephyr-runtime") compiler = zephyr.ZephyrCompiler( project_dir=project_dir, - board="qemu_x86", + board="nucleo_f746zg" if "stm32f746" in str(target) else "qemu_x86", zephyr_toolchain_variant="zephyr", ) @@ -101,24 +104,28 @@ def _make_session(mod): return tvm.micro.Session(**session_kw) -def _make_add_sess(): +def _make_add_sess(model, zephyr_board): A = tvm.te.placeholder((2,), dtype="int8") B = tvm.te.placeholder((1,), dtype="int8") C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C") sched = tvm.te.create_schedule(C.op) - return _make_sess_from_op("add", sched, [A, B, C]) + return _make_sess_from_op(model, zephyr_board, "add", sched, [A, B, C]) -def _make_ident_sess(): - A = tvm.te.placeholder((2,), dtype="int8") - B = tvm.te.compute(A.shape, lambda i: A[i], name="B") - sched = tvm.te.create_schedule(B.op) - return _make_sess_from_op("ident", sched, [A, B]) +# The models that should pass this configuration. Maps a short, identifying platform string to +# (model, zephyr_board). +PLATFORMS = { + "host": ("host", "qemu_x86"), + "stm32f746xx": ("stm32f746xx", "nucleo_f746zg"), +} -def test_compile_runtime(): +# The same test code can be executed on both the QEMU simulation and on real hardware. +def test_compile_runtime(platform): """Test compiling the on-device runtime.""" + model, zephyr_board = PLATFORMS[platform] + # NOTE: run test in a nested function so cPython will delete arrays before closing the session. 
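The conftest.py hooks above turn board selection into a command-line concern: every test that declares a `platform` fixture runs once per listed value. An invocation sketch (the stm32f746xx entry requires attached hardware):

```python
import sys

import pytest

# Equivalent to: pytest test_zephyr.py --microtvm-platforms=host,stm32f746xx
sys.exit(
    pytest.main(
        ["tests/micro/qemu/test_zephyr.py", "--microtvm-platforms=host,stm32f746xx"]
    )
)
```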
def test_basic_add(sess): A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) @@ -132,12 +139,9 @@ def test_basic_add(sess): system_lib.get_function("add")(A_data, B_data, C_data) assert (C_data.asnumpy() == np.array([6, 7])).all() - with _make_add_sess() as sess: + with _make_add_sess(model, zephyr_board) as sess: test_basic_add(sess) if __name__ == "__main__": - import logging - - logging.basicConfig(level=logging.DEBUG) - test_compile_runtime() + sys.exit(pytest.main([os.path.dirname(__file__)] + sys.argv[1:])) From fd6daeabf5d53e20a0a3c5935396cafe95652074 Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Sat, 31 Oct 2020 15:32:05 -0700 Subject: [PATCH 096/258] [CI] Add m6g instance (ARM64) to mainline CI (#6804) * [CI] Add m6g instance (ARM64) to CI (#6781) * [CI] Add m6g instance (ARM64) to CI * address comments Co-authored-by: Ubuntu * [CI] fix cpp test (#6796) * Update tests/python/unittest/test_target_codegen_x86.py Co-authored-by: Cody Yu Co-authored-by: Ubuntu Co-authored-by: Tianqi Chen Co-authored-by: Cody Yu --- Jenkinsfile | 25 ++++++++++- docker/Dockerfile.ci_arm | 43 +++++++++++++++++++ .../unittest/test_target_codegen_x86.py | 7 +++ tests/scripts/task_config_build_arm.sh | 36 ++++++++++++++++ 4 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 docker/Dockerfile.ci_arm create mode 100755 tests/scripts/task_config_build_arm.sh diff --git a/Jenkinsfile b/Jenkinsfile index e2abb018d6e2..f6988e5fb2a2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -50,6 +50,7 @@ ci_cpu = "tlcpack/ci-cpu:v0.70" ci_wasm = "tlcpack/ci-wasm:v0.70" ci_i386 = "tlcpack/ci-i386:v0.70" ci_qemu = "tlcpack/ci-qemu:v0.01" +ci_arm = "tlcpack/ci-arm:v0.01" // <--- End of regex-scanned config. // tvm libraries @@ -184,7 +185,7 @@ stage('Build') { sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_fsim.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh" - sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" + // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh" } } @@ -212,6 +213,16 @@ stage('Build') { } } }, + 'BUILD : arm': { + node('ARM') { + ws(per_exec_ws("tvm/build-arm")) { + init_git() + sh "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh" + make(ci_arm, 'build', '-j4') + pack_lib('arm', tvm_multilib) + } + } + }, 'BUILD: QEMU': { node('CPU') { ws(per_exec_ws("tvm/build-qemu")) { @@ -253,6 +264,18 @@ stage('Unit Test') { } } }, + 'python3: arm': { + node('ARM') { + ws(per_exec_ws("tvm/ut-python-arm")) { + init_git() + unpack_lib('arm', tvm_multilib) + timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_unittest.sh" + // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh" + } + } + } + }, 'java: GPU': { node('GPU') { ws(per_exec_ws("tvm/ut-java")) { diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm new file mode 100644 index 000000000000..f5b2c2af0fbf --- /dev/null +++ b/docker/Dockerfile.ci_arm @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# CI docker arm env +# tag: v0.10 + +FROM ubuntu:18.04 + +RUN apt-get update --fix-missing +RUN apt-get install -y ca-certificates gnupg2 + +COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh +RUN bash /install/ubuntu_install_core.sh + +COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh +RUN bash /install/ubuntu_install_llvm.sh + +COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh +RUN bash /install/ubuntu1804_install_python.sh + +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + +COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh +RUN bash /install/ubuntu_install_python_package.sh + +# AutoTVM deps +COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh +RUN bash /install/ubuntu_install_redis.sh diff --git a/tests/python/unittest/test_target_codegen_x86.py b/tests/python/unittest/test_target_codegen_x86.py index ec11d26a9e76..b581f72ec763 100644 --- a/tests/python/unittest/test_target_codegen_x86.py +++ b/tests/python/unittest/test_target_codegen_x86.py @@ -28,6 +28,13 @@ def test_fp16_to_fp32(): ) return + import platform + + machine = platform.machine() + if machine not in ["x86_64", "i386", "AMD64"]: + print("Skipping test because the platform is: {} ".format(machine)) + return + def fp16_to_fp32(target, width, match=None, not_match=None): elements = 64 n = tvm.runtime.convert(elements) diff --git a/tests/scripts/task_config_build_arm.sh b/tests/scripts/task_config_build_arm.sh new file mode 100755 index 000000000000..65f5d6359dac --- /dev/null +++ b/tests/scripts/task_config_build_arm.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u + +mkdir -p build +cd build +cp ../cmake/config.cmake . 
+ +echo set\(USE_SORT ON\) >> config.cmake +echo set\(USE_RPC ON\) >> config.cmake +echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake +echo set\(USE_MICRO ON\) >> config.cmake +echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake +echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_LLVM llvm-config-8\) >> config.cmake +echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake +echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake +echo set\(USE_VTA_TSIM ON\) >> config.cmake +echo set\(USE_VTA_FSIM ON\) >> config.cmake From 15a358bca520ff1675aab32d4cf584052a6889be Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Sun, 1 Nov 2020 13:15:36 +0000 Subject: [PATCH 097/258] [CI] Move back Keras to 2.4.3 (#6810) * I mistakenly moved it back to 2.3.1, now fixing it --- docker/install/ubuntu_install_tensorflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh index a95702d530a5..286a086abd82 100755 --- a/docker/install/ubuntu_install_tensorflow.sh +++ b/docker/install/ubuntu_install_tensorflow.sh @@ -23,4 +23,4 @@ set -o pipefail # h5py is pinned to minor than 3 due to issues with # tensorflow: # https://github.com/tensorflow/tensorflow/issues/44467 -pip3 install tensorflow==2.3.1 keras==2.3.1 "h5py<3.0" +pip3 install tensorflow==2.3.1 keras==2.4.3 "h5py<3.0" From 8f2beac7177e9b1d834ea3b50b1c4acc920f1eca Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 1 Nov 2020 13:33:59 -0500 Subject: [PATCH 098/258] [CI] Update to latest (#6812) - Fix Keras related regression. --- Jenkinsfile | 6 +++--- tutorials/frontend/from_keras.py | 26 +++++++++++++++++++------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index f6988e5fb2a2..c8c2fe342dfa 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,10 +45,10 @@ // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> ci_lint = "tlcpack/ci-lint:v0.62" -ci_gpu = "tlcpack/ci-gpu:v0.70" -ci_cpu = "tlcpack/ci-cpu:v0.70" +ci_gpu = "tlcpack/ci-gpu:v0.71" +ci_cpu = "tlcpack/ci-cpu:v0.71" ci_wasm = "tlcpack/ci-wasm:v0.70" -ci_i386 = "tlcpack/ci-i386:v0.70" +ci_i386 = "tlcpack/ci-i386:v0.71" ci_qemu = "tlcpack/ci-qemu:v0.01" ci_arm = "tlcpack/ci-arm:v0.01" // <--- End of regex-scanned config. diff --git a/tutorials/frontend/from_keras.py b/tutorials/frontend/from_keras.py index 3dcefd59327a..25a1e5c9d1fa 100644 --- a/tutorials/frontend/from_keras.py +++ b/tutorials/frontend/from_keras.py @@ -45,13 +45,25 @@ # Load pretrained keras model # ---------------------------- # We load a pretrained resnet-50 classification model provided by keras. 
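+# Keras 2.4 and newer host the pretrained ResNet-50 weights at a different
+# URL than older releases, so the block below picks the download location
+# based on the installed version.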
-weights_url = "".join( - [ - "https://github.com/fchollet/deep-learning-models/releases/", - "download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5", - ] -) -weights_file = "resnet50_weights.h5" + +if tuple(keras.__version__.split(".")) < ("2", "4", "0"): + weights_url = "".join( + [ + "https://github.com/fchollet/deep-learning-models/releases/", + "download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5", + ] + ) + weights_file = "resnet50_keras_old.h5" +else: + weights_url = "".join( + [ + " https://storage.googleapis.com/tensorflow/keras-applications/", + "resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5", + ] + ) + weights_file = "resnet50_keras_new.h5" + + weights_path = download_testdata(weights_url, weights_file, module="keras") keras_resnet50 = keras.applications.resnet50.ResNet50( include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000 From 1c1560fc0bea22cdc7be1426443f956be4e2dc09 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 1 Nov 2020 17:54:26 -0500 Subject: [PATCH 099/258] [OBJECT] Update types slots for baseexpr and primexpr (#6814) --- include/tvm/ir/expr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h index c982c5cf850b..ffb225c512cd 100644 --- a/include/tvm/ir/expr.h +++ b/include/tvm/ir/expr.h @@ -48,7 +48,7 @@ class BaseExprNode : public Object { static constexpr const char* _type_key = "BaseExpr"; static constexpr const bool _type_has_method_sequal_reduce = true; static constexpr const bool _type_has_method_shash_reduce = true; - static constexpr const uint32_t _type_child_slots = 58; + static constexpr const uint32_t _type_child_slots = 62; TVM_DECLARE_BASE_OBJECT_INFO(BaseExprNode, Object); }; @@ -92,7 +92,7 @@ class PrimExprNode : public BaseExprNode { DataType dtype; static constexpr const char* _type_key = "PrimExpr"; - static constexpr const uint32_t _type_child_slots = 34; + static constexpr const uint32_t _type_child_slots = 38; TVM_DECLARE_BASE_OBJECT_INFO(PrimExprNode, BaseExprNode); }; From 72ab03f4fc2004d84ee788ae227d0e3282572dd8 Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Sun, 1 Nov 2020 16:03:04 -0800 Subject: [PATCH 100/258] [Rust][Diagnostics] Add initial boilerplate for Rust diagnostic interface. (#6656) * Add initial boilerplate for Rust diagnostic interface. 
* Codespan example almost working * WIP * Hacking on Rust inside of TVM * Borrow code from Egg * Update CMake and delete old API * Fix Linux build * Clean up exporting to show off new diagnostics * Improve Rust bindings * Fix calling * Fix * Rust Diagnostics work * Remove type checker * Format and cleanup * Fix the extension code * More cleanup * Fix some CR * Add docs and address feedback * WIP more improvments * Update cmake/modules/RustExt.cmake Co-authored-by: Robert Kimball * Update rust/tvm/src/ir/diagnostics/mod.rs Co-authored-by: Robert Kimball * Clean up PR * Format all * Remove dead comment * Code review comments and apache headers * Purge test file * Update cmake/modules/LLVM.cmake Co-authored-by: Tristan Konolige * Format Rust * Add TK's suggestion * More CR and cleanup * Fix tyck line * Format Co-authored-by: Robert Kimball Co-authored-by: Tristan Konolige --- CMakeLists.txt | 2 + cmake/modules/LLVM.cmake | 9 +- cmake/modules/RustExt.cmake | 43 +++++ include/tvm/parser/source_map.h | 2 - python/tvm/ir/diagnostics/__init__.py | 1 + rust/Cargo.toml | 1 + rust/compiler-ext/Cargo.toml | 32 ++++ rust/compiler-ext/src/lib.rs | 35 ++++ rust/tvm-rt/Cargo.toml | 15 +- rust/tvm-rt/src/array.rs | 37 ++++ rust/tvm-rt/src/errors.rs | 17 ++ rust/tvm-rt/src/function.rs | 35 ++-- rust/tvm-rt/src/object/object_ptr.rs | 14 +- rust/tvm-rt/src/string.rs | 4 +- rust/tvm-sys/Cargo.toml | 1 + rust/tvm-sys/build.rs | 2 +- rust/tvm-sys/src/packed_func.rs | 1 + rust/tvm/Cargo.toml | 24 ++- rust/tvm/src/bin/tyck.rs | 49 +++++ rust/tvm/src/ir/arith.rs | 2 +- rust/tvm/src/ir/diagnostics/codespan.rs | 216 +++++++++++++++++++++ rust/tvm/src/ir/diagnostics/mod.rs | 245 ++++++++++++++++++++++++ rust/tvm/src/ir/expr.rs | 2 +- rust/tvm/src/ir/mod.rs | 2 + rust/tvm/src/ir/module.rs | 30 ++- rust/tvm/src/ir/relay/mod.rs | 16 +- rust/tvm/src/ir/source_map.rs | 57 ++++++ rust/tvm/src/ir/span.rs | 87 +++++++-- rust/tvm/src/ir/ty.rs | 2 +- rust/tvm/src/lib.rs | 27 ++- rust/tvm/src/transform.rs | 2 +- src/contrib/rust_extension.cc | 31 +++ src/ir/diagnostic.cc | 5 + src/parser/source_map.cc | 11 -- 34 files changed, 968 insertions(+), 91 deletions(-) create mode 100644 cmake/modules/RustExt.cmake create mode 100644 rust/compiler-ext/Cargo.toml create mode 100644 rust/compiler-ext/src/lib.rs create mode 100644 rust/tvm/src/bin/tyck.rs create mode 100644 rust/tvm/src/ir/diagnostics/codespan.rs create mode 100644 rust/tvm/src/ir/diagnostics/mod.rs create mode 100644 rust/tvm/src/ir/source_map.rs create mode 100644 src/contrib/rust_extension.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c873a8016e2..f8ecf4635fbe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,7 @@ tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF) tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF) tvm_option(USE_TENSORRT_CODEGEN "Build with TensorRT Codegen support" OFF) tvm_option(USE_TENSORRT_RUNTIME "Build with TensorRT runtime" OFF) +tvm_option(USE_RUST_EXT "Build with Rust based compiler extensions, STATIC, DYNAMIC, or OFF" OFF) # include directories include_directories(${CMAKE_INCLUDE_PATH}) @@ -368,6 +369,7 @@ include(cmake/modules/contrib/ArmComputeLib.cmake) include(cmake/modules/contrib/TensorRT.cmake) include(cmake/modules/Git.cmake) include(cmake/modules/LibInfo.cmake) +include(cmake/modules/RustExt.cmake) include(CheckCXXCompilerFlag) if(NOT MSVC) diff --git a/cmake/modules/LLVM.cmake b/cmake/modules/LLVM.cmake index 5f8ace17111f..ac870b17faeb 100644 --- 
a/cmake/modules/LLVM.cmake
+++ b/cmake/modules/LLVM.cmake
@@ -16,7 +16,14 @@
 # under the License.
 
 # LLVM rules
-add_definitions(-DDMLC_USE_FOPEN64=0)
+# Due to LLVM debug symbols you can sometimes face linking issues on
+# certain compiler, platform combinations if you don't set NDEBUG.
+#
+# See https://github.com/imageworks/OpenShadingLanguage/issues/1069
+# for more discussion.
+add_definitions(-DDMLC_USE_FOPEN64=0 -DNDEBUG=1)
+# TODO(@jroesch, @tkonolige): if we actually use targets we can do this.
+# target_compile_definitions(tvm PRIVATE NDEBUG=1)
 
 # Test if ${USE_LLVM} is not an explicit boolean false
 # It may be a boolean or a string
diff --git a/cmake/modules/RustExt.cmake b/cmake/modules/RustExt.cmake
new file mode 100644
index 000000000000..2922bc48dee2
--- /dev/null
+++ b/cmake/modules/RustExt.cmake
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(USE_RUST_EXT)
+  set(RUST_SRC_DIR "${CMAKE_SOURCE_DIR}/rust")
+  set(CARGO_OUT_DIR "${CMAKE_SOURCE_DIR}/rust/target")
+
+  if(USE_RUST_EXT STREQUAL "STATIC")
+    set(COMPILER_EXT_PATH "${CARGO_OUT_DIR}/release/libcompiler_ext.a")
+  elseif(USE_RUST_EXT STREQUAL "DYNAMIC")
+    set(COMPILER_EXT_PATH "${CARGO_OUT_DIR}/release/libcompiler_ext.so")
+  else()
+    message(FATAL_ERROR "invalid setting for USE_RUST_EXT, STATIC, DYNAMIC or OFF")
+  endif()
+
+  add_custom_command(
+    OUTPUT "${COMPILER_EXT_PATH}"
+    COMMAND cargo build --release
+    MAIN_DEPENDENCY "${RUST_SRC_DIR}"
+    WORKING_DIRECTORY "${RUST_SRC_DIR}/compiler-ext")
+
+  add_custom_target(rust_ext ALL DEPENDS "${COMPILER_EXT_PATH}")
+
+  # TODO(@jroesch, @tkonolige): move this to CMake target
+  # target_link_libraries(tvm "${COMPILER_EXT_PATH}" PRIVATE)
+  list(APPEND TVM_LINKER_LIBS ${COMPILER_EXT_PATH})
+
+  add_definitions(-DRUST_COMPILER_EXT=1)
+endif()
diff --git a/include/tvm/parser/source_map.h b/include/tvm/parser/source_map.h
index 424af5c98cc8..a160c22a2a2f 100644
--- a/include/tvm/parser/source_map.h
+++ b/include/tvm/parser/source_map.h
@@ -103,8 +103,6 @@ class SourceMap : public ObjectRef {
 
   TVM_DLL SourceMap() : SourceMap(Map<SourceName, Source>()) {}
 
-  TVM_DLL static SourceMap Global();
-
   void Add(const Source& source);
 
   SourceMapNode* operator->() {
diff --git a/python/tvm/ir/diagnostics/__init__.py b/python/tvm/ir/diagnostics/__init__.py
index 6503743aaa51..3a6402c0359d 100644
--- a/python/tvm/ir/diagnostics/__init__.py
+++ b/python/tvm/ir/diagnostics/__init__.py
@@ -38,6 +38,7 @@ def get_renderer():
     return _ffi_api.GetRenderer()
 
 
+@tvm.register_func("diagnostics.override_renderer")
 def override_renderer(render_func):
     """
     Sets a custom renderer for diagnostics.
diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 28312a5e73dc..7c092d860b50 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -29,4 +29,5 @@ members = [ "tvm-graph-rt/tests/test_tvm_dso", "tvm-graph-rt/tests/test_wasm32", "tvm-graph-rt/tests/test_nn", + "compiler-ext", ] diff --git a/rust/compiler-ext/Cargo.toml b/rust/compiler-ext/Cargo.toml new file mode 100644 index 000000000000..b830b7a84135 --- /dev/null +++ b/rust/compiler-ext/Cargo.toml @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "compiler-ext" +version = "0.1.0" +authors = ["TVM Contributors"] +edition = "2018" + +[lib] +crate-type = ["staticlib", "cdylib"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tvm = { path = "../tvm", default-features = false, features = ["static-linking"] } +log = "*" +env_logger = "*" diff --git a/rust/compiler-ext/src/lib.rs b/rust/compiler-ext/src/lib.rs new file mode 100644 index 000000000000..278060ef4897 --- /dev/null +++ b/rust/compiler-ext/src/lib.rs @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+use env_logger;
+use tvm::export;
+
+fn diagnostics() -> Result<(), tvm::Error> {
+    tvm::ir::diagnostics::codespan::init()
+}
+
+export!(diagnostics);
+
+#[no_mangle]
+extern "C" fn compiler_ext_initialize() -> i32 {
+    let _ = env_logger::try_init();
+    tvm_export("rust_ext").expect("failed to initialize the Rust compiler extensions.");
+    log::debug!("Loaded the Rust compiler extension.");
+    return 0;
+}
diff --git a/rust/tvm-rt/Cargo.toml b/rust/tvm-rt/Cargo.toml
index acece5aeec48..9660943da50d 100644
--- a/rust/tvm-rt/Cargo.toml
+++ b/rust/tvm-rt/Cargo.toml
@@ -28,19 +28,26 @@ categories = ["api-bindings", "science"]
 authors = ["TVM Contributors"]
 edition = "2018"
 
+[features]
+default = ["dynamic-linking"]
+dynamic-linking = ["tvm-sys/bindings"]
+static-linking = []
+blas = ["ndarray/blas"]
+
 [dependencies]
 thiserror = "^1.0"
 ndarray = "0.12"
 num-traits = "0.2"
-tvm-sys = { version = "0.1", path = "../tvm-sys/", features = ["bindings"] }
 tvm-macros = { version = "0.1", path = "../tvm-macros" }
 paste = "0.1"
 mashup = "0.1"
 once_cell = "^1.3.1"
 memoffset = "0.5.6"
 
+[dependencies.tvm-sys]
+version = "0.1"
+default-features = false
+path = "../tvm-sys/"
+
 [dev-dependencies]
 anyhow = "^1.0"
-
-[features]
-blas = ["ndarray/blas"]
diff --git a/rust/tvm-rt/src/array.rs b/rust/tvm-rt/src/array.rs
index 5e19cefd8e97..98414f9c5b34 100644
--- a/rust/tvm-rt/src/array.rs
+++ b/rust/tvm-rt/src/array.rs
@@ -18,6 +18,7 @@
  */
 
 use std::convert::{TryFrom, TryInto};
+use std::iter::{IntoIterator, Iterator};
 use std::marker::PhantomData;
 
 use crate::errors::Error;
@@ -81,6 +82,42 @@ impl<T: IsObjectRef> Array<T> {
     }
 }
 
+pub struct IntoIter<T: IsObjectRef> {
+    array: Array<T>,
+    pos: isize,
+    size: isize,
+}
+
+impl<T: IsObjectRef> Iterator for IntoIter<T> {
+    type Item = T;
+
+    fn next(&mut self) -> Option<T> {
+        if self.pos < self.size {
+            let item =
+                self.array.get(self.pos)
+                .expect("Can not index as in-bounds position after bounds checking.\nNote: this error can only be due to an uncaught issue with API bindings.");
+            self.pos += 1;
+            Some(item)
+        } else {
+            None
+        }
+    }
+}
+
+impl<T: IsObjectRef> IntoIterator for Array<T> {
+    type Item = T;
+    type IntoIter = IntoIter<T>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        let size = self.len() as isize;
+        IntoIter {
+            array: self,
+            pos: 0,
+            size: size,
+        }
+    }
+}
+
 impl<T: IsObjectRef> From<Array<T>> for ArgValue<'static> {
     fn from(array: Array<T>) -> ArgValue<'static> {
         array.object.into()
diff --git a/rust/tvm-rt/src/errors.rs b/rust/tvm-rt/src/errors.rs
index c884c56fed44..31ce385ef662 100644
--- a/rust/tvm-rt/src/errors.rs
+++ b/rust/tvm-rt/src/errors.rs
@@ -68,6 +68,23 @@ pub enum Error {
     Infallible(#[from] std::convert::Infallible),
     #[error("a panic occurred while executing a Rust packed function")]
     Panic,
+    #[error(
+        "one or more error diagnostics were emitted, please check diagnostic render for output."
+ )] + DiagnosticError(String), + #[error("{0}")] + Raw(String), +} + +impl Error { + pub fn from_raw_tvm(raw: &str) -> Error { + let err_header = raw.find(":").unwrap_or(0); + let (err_ty, err_content) = raw.split_at(err_header); + match err_ty { + "DiagnosticError" => Error::DiagnosticError((&err_content[1..]).into()), + _ => Error::Raw(raw.into()), + } + } } impl Error { diff --git a/rust/tvm-rt/src/function.rs b/rust/tvm-rt/src/function.rs index bae06e929361..aec4a8ad44de 100644 --- a/rust/tvm-rt/src/function.rs +++ b/rust/tvm-rt/src/function.rs @@ -120,24 +120,27 @@ impl Function { let mut ret_val = ffi::TVMValue { v_int64: 0 }; let mut ret_type_code = 0i32; - check_call!(ffi::TVMFuncCall( - self.handle, - values.as_mut_ptr() as *mut ffi::TVMValue, - type_codes.as_mut_ptr() as *mut c_int, - num_args as c_int, - &mut ret_val as *mut _, - &mut ret_type_code as *mut _ - )); + let ret_code = unsafe { + ffi::TVMFuncCall( + self.handle, + values.as_mut_ptr() as *mut ffi::TVMValue, + type_codes.as_mut_ptr() as *mut c_int, + num_args as c_int, + &mut ret_val as *mut _, + &mut ret_type_code as *mut _, + ) + }; + + if ret_code != 0 { + let raw_error = crate::get_last_error(); + let error = match Error::from_raw_tvm(raw_error) { + Error::Raw(string) => Error::CallFailed(string), + e => e, + }; + return Err(error); + } let rv = RetValue::from_tvm_value(ret_val, ret_type_code as u32); - match rv { - RetValue::ObjectHandle(object) => { - let optr = crate::object::ObjectPtr::from_raw(object as _).unwrap(); - // println!("after wrapped call: {}", optr.count()); - crate::object::ObjectPtr::leak(optr); - } - _ => {} - }; Ok(rv) } diff --git a/rust/tvm-rt/src/object/object_ptr.rs b/rust/tvm-rt/src/object/object_ptr.rs index 77254d2fbca2..8d535368c352 100644 --- a/rust/tvm-rt/src/object/object_ptr.rs +++ b/rust/tvm-rt/src/object/object_ptr.rs @@ -125,7 +125,7 @@ impl Object { /// By using associated constants and generics we can provide a /// type indexed abstraction over allocating objects with the /// correct index and deleter. 
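+    ///
+    /// A minimal sketch of the intended use (illustrative; `MyNode` stands
+    /// in for any concrete `T: IsObject`):
+    ///
+    /// ```ignore
+    /// let base: Object = Object::base::<MyNode>();
+    /// ```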
-    pub fn base_object<T: IsObject>() -> Object {
+    pub fn base<T: IsObject>() -> Object {
         let index = Object::get_type_index::<T>();
         Object::new(index, delete::<T>)
     }
@@ -351,7 +351,7 @@ mod tests {
 
     #[test]
     fn test_new_object() -> anyhow::Result<()> {
-        let object = Object::base_object::<Object>();
+        let object = Object::base::<Object>();
         let ptr = ObjectPtr::new(object);
         assert_eq!(ptr.count(), 1);
         Ok(())
@@ -359,7 +359,7 @@ mod tests {
 
     #[test]
     fn test_leak() -> anyhow::Result<()> {
-        let ptr = ObjectPtr::new(Object::base_object::<Object>());
+        let ptr = ObjectPtr::new(Object::base::<Object>());
         assert_eq!(ptr.count(), 1);
         let object = ObjectPtr::leak(ptr);
         assert_eq!(object.count(), 1);
@@ -368,7 +368,7 @@ mod tests {
 
     #[test]
     fn test_clone() -> anyhow::Result<()> {
-        let ptr = ObjectPtr::new(Object::base_object::<Object>());
+        let ptr = ObjectPtr::new(Object::base::<Object>());
         assert_eq!(ptr.count(), 1);
         let ptr2 = ptr.clone();
         assert_eq!(ptr2.count(), 2);
@@ -379,7 +379,7 @@ mod tests {
 
     #[test]
     fn roundtrip_retvalue() -> Result<()> {
-        let ptr = ObjectPtr::new(Object::base_object::<Object>());
+        let ptr = ObjectPtr::new(Object::base::<Object>());
         assert_eq!(ptr.count(), 1);
         let ret_value: RetValue = ptr.clone().into();
         let ptr2: ObjectPtr<Object> = ret_value.try_into()?;
@@ -401,7 +401,7 @@ mod tests {
 
     #[test]
     fn roundtrip_argvalue() -> Result<()> {
-        let ptr = ObjectPtr::new(Object::base_object::<Object>());
+        let ptr = ObjectPtr::new(Object::base::<Object>());
         assert_eq!(ptr.count(), 1);
         let ptr_clone = ptr.clone();
         assert_eq!(ptr.count(), 2);
@@ -435,7 +435,7 @@ mod tests {
     fn test_ref_count_boundary3() {
         use super::*;
         use crate::function::{register, Function};
-        let ptr = ObjectPtr::new(Object::base_object::<Object>());
+        let ptr = ObjectPtr::new(Object::base::<Object>());
         assert_eq!(ptr.count(), 1);
         let stay = ptr.clone();
         assert_eq!(ptr.count(), 2);
diff --git a/rust/tvm-rt/src/string.rs b/rust/tvm-rt/src/string.rs
index 6ff24bef3a60..3cd33a226d44 100644
--- a/rust/tvm-rt/src/string.rs
+++ b/rust/tvm-rt/src/string.rs
@@ -38,7 +38,7 @@ impl From<std::string::String> for String {
     fn from(s: std::string::String) -> Self {
         let size = s.len() as u64;
         let data = Box::into_raw(s.into_boxed_str()).cast();
-        let base = Object::base_object::<StringObj>();
+        let base = Object::base::<StringObj>();
         StringObj { base, data, size }.into()
     }
 }
@@ -47,7 +47,7 @@ impl From<&'static str> for String {
     fn from(s: &'static str) -> Self {
         let size = s.len() as u64;
         let data = s.as_bytes().as_ptr();
-        let base = Object::base_object::<StringObj>();
+        let base = Object::base::<StringObj>();
         StringObj { base, data, size }.into()
     }
 }
diff --git a/rust/tvm-sys/Cargo.toml b/rust/tvm-sys/Cargo.toml
index 4e3fc98b4e75..2952aa4938d7 100644
--- a/rust/tvm-sys/Cargo.toml
+++ b/rust/tvm-sys/Cargo.toml
@@ -23,6 +23,7 @@ license = "Apache-2.0"
 edition = "2018"
 
 [features]
+default = []
 bindings = []
 
 [dependencies]
diff --git a/rust/tvm-sys/build.rs b/rust/tvm-sys/build.rs
index 05806c0d5ce0..159023463e8d 100644
--- a/rust/tvm-sys/build.rs
+++ b/rust/tvm-sys/build.rs
@@ -60,7 +60,7 @@ fn main() -> Result<()> {
     if cfg!(feature = "bindings") {
         println!("cargo:rerun-if-env-changed=TVM_HOME");
         println!("cargo:rustc-link-lib=dylib=tvm");
-        println!("cargo:rustc-link-search={}/build", tvm_home);
+        println!("cargo:rustc-link-search=native={}/build", tvm_home);
     }
 
     // @see rust-bindgen#550 for `blacklist_type`
diff --git a/rust/tvm-sys/src/packed_func.rs b/rust/tvm-sys/src/packed_func.rs
index f7b289c59675..7b8d5296d641 100644
--- a/rust/tvm-sys/src/packed_func.rs
+++ b/rust/tvm-sys/src/packed_func.rs
@@ -101,6 +101,7 @@ macro_rules!
TVMPODValue { TVMArgTypeCode_kTVMOpaqueHandle => Handle($value.v_handle), TVMArgTypeCode_kTVMDLTensorHandle => ArrayHandle($value.v_handle as TVMArrayHandle), TVMArgTypeCode_kTVMObjectHandle => ObjectHandle($value.v_handle), + TVMArgTypeCode_kTVMObjectRValueRefArg => ObjectHandle(*($value.v_handle as *mut *mut c_void)), TVMArgTypeCode_kTVMModuleHandle => ModuleHandle($value.v_handle), TVMArgTypeCode_kTVMPackedFuncHandle => FuncHandle($value.v_handle), TVMArgTypeCode_kTVMNDArrayHandle => NDArrayHandle($value.v_handle), diff --git a/rust/tvm/Cargo.toml b/rust/tvm/Cargo.toml index 55fc1790604e..153a1950e46b 100644 --- a/rust/tvm/Cargo.toml +++ b/rust/tvm/Cargo.toml @@ -28,22 +28,32 @@ categories = ["api-bindings", "science"] authors = ["TVM Contributors"] edition = "2018" +[features] +default = ["python", "dynamic-linking"] +dynamic-linking = ["tvm-rt/dynamic-linking"] +static-linking = ["tvm-rt/static-linking"] +blas = ["ndarray/blas"] +python = ["pyo3"] + +[dependencies.tvm-rt] +version = "0.1" +default-features = false +path = "../tvm-rt/" + [dependencies] thiserror = "^1.0" anyhow = "^1.0" lazy_static = "1.1" ndarray = "0.12" num-traits = "0.2" -tvm-rt = { version = "0.1", path = "../tvm-rt/" } -tvm-sys = { version = "0.1", path = "../tvm-sys/" } tvm-macros = { version = "*", path = "../tvm-macros/" } paste = "0.1" mashup = "0.1" once_cell = "^1.3.1" pyo3 = { version = "0.11.1", optional = true } +codespan-reporting = "0.9.5" +structopt = { version = "0.3" } -[features] -default = ["python"] - -blas = ["ndarray/blas"] -python = ["pyo3"] +[[bin]] +name = "tyck" +required-features = ["dynamic-linking"] diff --git a/rust/tvm/src/bin/tyck.rs b/rust/tvm/src/bin/tyck.rs new file mode 100644 index 000000000000..839a6bd1c17f --- /dev/null +++ b/rust/tvm/src/bin/tyck.rs @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use std::path::PathBuf; + +use anyhow::Result; +use structopt::StructOpt; + +use tvm::ir::diagnostics::codespan; +use tvm::ir::{self, IRModule}; +use tvm::runtime::Error; + +#[derive(Debug, StructOpt)] +#[structopt(name = "tyck", about = "Parse and type check a Relay program.")] +struct Opt { + /// Input file + #[structopt(parse(from_os_str))] + input: PathBuf, +} + +fn main() -> Result<()> { + codespan::init().expect("Failed to initialize Rust based diagnostics."); + let opt = Opt::from_args(); + let _module = match IRModule::parse_file(opt.input) { + Err(ir::module::Error::TVM(Error::DiagnosticError(_))) => return Ok(()), + Err(e) => { + return Err(e.into()); + } + Ok(module) => module, + }; + + Ok(()) +} diff --git a/rust/tvm/src/ir/arith.rs b/rust/tvm/src/ir/arith.rs index f589f2ac25c6..92a1de69ff78 100644 --- a/rust/tvm/src/ir/arith.rs +++ b/rust/tvm/src/ir/arith.rs @@ -34,7 +34,7 @@ macro_rules! 
define_node {
 
         impl $name {
             pub fn new($($id : $t,)*) -> $name {
-                let base = Object::base_object::<$node>();
+                let base = Object::base::<$node>();
                 let node = $node { base, $($id),* };
                 $name(Some(ObjectPtr::new(node)))
             }
diff --git a/rust/tvm/src/ir/diagnostics/codespan.rs b/rust/tvm/src/ir/diagnostics/codespan.rs
new file mode 100644
index 000000000000..c411c0cd31a7
--- /dev/null
+++ b/rust/tvm/src/ir/diagnostics/codespan.rs
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+//! A TVM diagnostics renderer which uses the Rust `codespan` library
+//! to produce error messages.
+//!
+//! This is an example of using the exposed API surface of TVM to
+//! customize the compiler behavior.
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+
+use codespan_reporting::diagnostic::{Diagnostic as CDiagnostic, Label, Severity};
+use codespan_reporting::files::SimpleFiles;
+use codespan_reporting::term::termcolor::{ColorChoice, StandardStream};
+use codespan_reporting::term::{self};
+
+use super::*;
+use crate::ir::source_map::*;
+
+/// A representation of a TVM Span as a range of bytes in a file.
+struct ByteRange<FileId> {
+    /// The file in which the range occurs.
+    #[allow(dead_code)]
+    file_id: FileId,
+    /// The range start.
+    start_pos: usize,
+    /// The range end.
+    end_pos: usize,
+}
+
+/// A mapping from Span to ByteRange for a single file.
+enum FileSpanToByteRange {
+    AsciiSource(Vec<usize>),
+    #[allow(dead_code)]
+    Utf8 {
+        /// Map character regions which are larger than 1-byte to length.
+        lengths: HashMap<isize, isize>,
+        /// The source of the program.
+        source: String,
+    },
+}
+
+impl FileSpanToByteRange {
+    /// Construct a span to byte range mapping from the program source.
+    fn new(source: String) -> FileSpanToByteRange {
+        if source.is_ascii() {
+            let line_lengths = source.lines().map(|line| line.len()).collect();
+            FileSpanToByteRange::AsciiSource(line_lengths)
+        } else {
+            panic!()
+        }
+    }
+
+    /// Lookup the corresponding ByteRange for a given Span.
+    fn lookup(&self, span: &Span) -> ByteRange<String> {
+        use FileSpanToByteRange::*;
+
+        let source_name: String = span.source_name.name.as_str().unwrap().into();
+
+        match self {
+            AsciiSource(ref line_lengths) => {
+                let start_pos = (&line_lengths[0..(span.line - 1) as usize])
+                    .into_iter()
+                    .sum::<usize>()
+                    + (span.column) as usize;
+                let end_pos = (&line_lengths[0..(span.end_line - 1) as usize])
+                    .into_iter()
+                    .sum::<usize>()
+                    + (span.end_column) as usize;
+                ByteRange {
+                    file_id: source_name,
+                    start_pos,
+                    end_pos,
+                }
+            }
+            _ => panic!(),
+        }
+    }
+}
+
+/// A mapping for all files in a source map to byte ranges.
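+///
+/// A lookup first resolves the span's source name to the per-file mapping,
+/// then defers to `FileSpanToByteRange::lookup` for the byte offsets.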
+struct SpanToByteRange {
+    map: HashMap<String, FileSpanToByteRange>,
+}
+
+impl SpanToByteRange {
+    fn new() -> SpanToByteRange {
+        SpanToByteRange {
+            map: HashMap::new(),
+        }
+    }
+
+    /// Add a source file to the span mapping.
+    pub fn add_source(&mut self, source: Source) {
+        let source_name: String = source.source_name.name.as_str().expect("foo").into();
+
+        if self.map.contains_key(&source_name) {
+            panic!()
+        } else {
+            let source = source.source.as_str().expect("fpp").into();
+            self.map
+                .insert(source_name, FileSpanToByteRange::new(source));
+        }
+    }
+
+    /// Lookup a span to byte range mapping.
+    ///
+    /// First resolves the Span to a file, and then maps the span to a byte range in the file.
+    pub fn lookup(&self, span: &Span) -> ByteRange<String> {
+        let source_name: String = span.source_name.name.as_str().expect("foo").into();
+
+        match self.map.get(&source_name) {
+            Some(file_span_to_bytes) => file_span_to_bytes.lookup(span),
+            None => panic!(),
+        }
+    }
+}
+
+/// The state of the `codespan` based diagnostics.
+struct DiagnosticState {
+    files: SimpleFiles<String, String>,
+    span_map: SpanToByteRange,
+    // todo unify with source name
+    source_to_id: HashMap<String, usize>,
+}
+
+impl DiagnosticState {
+    fn new() -> DiagnosticState {
+        DiagnosticState {
+            files: SimpleFiles::new(),
+            span_map: SpanToByteRange::new(),
+            source_to_id: HashMap::new(),
+        }
+    }
+
+    fn add_source(&mut self, source: Source) {
+        let source_str: String = source.source.as_str().unwrap().into();
+        let source_name: String = source.source_name.name.as_str().unwrap().into();
+        self.span_map.add_source(source);
+        let file_id = self.files.add(source_name.clone(), source_str);
+        self.source_to_id.insert(source_name, file_id);
+    }
+
+    fn to_diagnostic(&self, diag: super::Diagnostic) -> CDiagnostic<usize> {
+        let severity = match diag.level {
+            DiagnosticLevel::Error => Severity::Error,
+            DiagnosticLevel::Warning => Severity::Warning,
+            DiagnosticLevel::Note => Severity::Note,
+            DiagnosticLevel::Help => Severity::Help,
+            DiagnosticLevel::Bug => Severity::Bug,
+        };
+
+        let source_name: String = diag.span.source_name.name.as_str().unwrap().into();
+        let file_id = *self.source_to_id.get(&source_name).unwrap();
+
+        let message: String = diag.message.as_str().unwrap().into();
+
+        let byte_range = self.span_map.lookup(&diag.span);
+
+        let diagnostic = CDiagnostic::new(severity)
+            .with_message(message)
+            .with_code("EXXX")
+            .with_labels(vec![Label::primary(
+                file_id,
+                byte_range.start_pos..byte_range.end_pos,
+            )]);
+
+        diagnostic
+    }
+}
+
+fn renderer(state: &mut DiagnosticState, diag_ctx: DiagnosticContext) {
+    let source_map = diag_ctx.module.source_map.clone();
+    let writer = StandardStream::stderr(ColorChoice::Always);
+    let config = codespan_reporting::term::Config::default();
+    for diagnostic in diag_ctx.diagnostics.clone() {
+        match source_map.source_map.get(&diagnostic.span.source_name) {
+            Err(err) => panic!(err),
+            Ok(source) => {
+                state.add_source(source);
+                let diagnostic = state.to_diagnostic(diagnostic);
+                term::emit(&mut writer.lock(), &config, &state.files, &diagnostic).unwrap();
+            }
+        }
+    }
+}
+
+/// Initialize the `codespan` based diagnostics.
+///
+/// Calling this function will globally override the TVM diagnostics renderer.
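+///
+/// A minimal usage sketch (this is how the `tyck` binary added in this patch
+/// installs the renderer):
+///
+/// ```no_run
+/// tvm::ir::diagnostics::codespan::init().expect("failed to initialize diagnostics");
+/// ```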
+pub fn init() -> Result<()> { + let diag_state = Arc::new(Mutex::new(DiagnosticState::new())); + let render_fn = move |diag_ctx: DiagnosticContext| { + let mut guard = diag_state.lock().unwrap(); + renderer(&mut *guard, diag_ctx); + }; + + override_renderer(Some(render_fn))?; + Ok(()) +} diff --git a/rust/tvm/src/ir/diagnostics/mod.rs b/rust/tvm/src/ir/diagnostics/mod.rs new file mode 100644 index 000000000000..051bb9eb16c4 --- /dev/null +++ b/rust/tvm/src/ir/diagnostics/mod.rs @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use super::module::IRModule; +use super::span::*; +use crate::runtime::function::Result; +use crate::runtime::object::{Object, ObjectPtr}; +use crate::runtime::{ + array::Array, + function::{self, Function, ToFunction}, + string::String as TString, +}; +/// The diagnostic interface to TVM, used for reporting and rendering +/// diagnostic information by the compiler. This module exposes +/// three key abstractions: a Diagnostic, the DiagnosticContext, +/// and the DiagnosticRenderer. +use tvm_macros::{external, Object}; + +pub mod codespan; + +external! { + #[name("node.ArrayGetItem")] + fn get_renderer() -> DiagnosticRenderer; + + #[name("diagnostics.DiagnosticRenderer")] + fn diagnostic_renderer(func: Function) -> DiagnosticRenderer; + + #[name("diagnostics.Emit")] + fn emit(ctx: DiagnosticContext, diagnostic: Diagnostic) -> (); + + #[name("diagnostics.DiagnosticContextDefault")] + fn diagnostic_context_default(module: IRModule) -> DiagnosticContext; + + #[name("diagnostics.DiagnosticContextRender")] + fn diagnostic_context_render(ctx: DiagnosticContext) -> (); + + #[name("diagnostics.DiagnosticRendererRender")] + fn diagnositc_renderer_render(renderer: DiagnosticRenderer,ctx: DiagnosticContext) -> (); + + #[name("diagnostics.ClearRenderer")] + fn clear_renderer() -> (); +} + +/// The diagnostic level, controls the printing of the message. +#[repr(C)] +pub enum DiagnosticLevel { + Bug = 10, + Error = 20, + Warning = 30, + Note = 40, + Help = 50, +} + +/// A compiler diagnostic. +#[repr(C)] +#[derive(Object)] +#[ref_name = "Diagnostic"] +#[type_key = "Diagnostic"] +pub struct DiagnosticNode { + pub base: Object, + /// The level. + pub level: DiagnosticLevel, + /// The span at which to report an error. + pub span: Span, + /// The diagnostic message. 
+    pub message: TString,
+}
+
+impl Diagnostic {
+    pub fn new(level: DiagnosticLevel, span: Span, message: TString) -> Diagnostic {
+        let node = DiagnosticNode {
+            base: Object::base::<DiagnosticNode>(),
+            level,
+            span,
+            message,
+        };
+        ObjectPtr::new(node).into()
+    }
+
+    pub fn bug(span: Span) -> DiagnosticBuilder {
+        DiagnosticBuilder::new(DiagnosticLevel::Bug, span)
+    }
+
+    pub fn error(span: Span) -> DiagnosticBuilder {
+        DiagnosticBuilder::new(DiagnosticLevel::Error, span)
+    }
+
+    pub fn warning(span: Span) -> DiagnosticBuilder {
+        DiagnosticBuilder::new(DiagnosticLevel::Warning, span)
+    }
+
+    pub fn note(span: Span) -> DiagnosticBuilder {
+        DiagnosticBuilder::new(DiagnosticLevel::Note, span)
+    }
+
+    pub fn help(span: Span) -> DiagnosticBuilder {
+        DiagnosticBuilder::new(DiagnosticLevel::Help, span)
+    }
+}
+
+/// A wrapper around std::stringstream to build a diagnostic.
+pub struct DiagnosticBuilder {
+    /// The level.
+    pub level: DiagnosticLevel,
+
+    /// The span of the diagnostic.
+    pub span: Span,
+
+    /// The in progress message.
+    pub message: String,
+}
+
+impl DiagnosticBuilder {
+    pub fn new(level: DiagnosticLevel, span: Span) -> DiagnosticBuilder {
+        DiagnosticBuilder {
+            level,
+            span,
+            message: "".into(),
+        }
+    }
+}
+
+/// Display diagnostics in a given display format.
+///
+/// A diagnostic renderer is responsible for converting the
+/// raw diagnostics into consumable output.
+///
+/// For example the terminal renderer will render a sequence
+/// of compiler diagnostics to std::out and std::err in
+/// a human readable form.
+#[repr(C)]
+#[derive(Object)]
+#[ref_name = "DiagnosticRenderer"]
+#[type_key = "DiagnosticRenderer"]
+/// A diagnostic renderer, which given a diagnostic context produces a "rendered"
+/// form of the diagnostics for either human or computer consumption.
+pub struct DiagnosticRendererNode {
+    /// The base type.
+    pub base: Object,
+    // TODO(@jroesch): we can't easily expose packed functions due to
+    // memory layout
+    // missing field here
+}
+
+impl DiagnosticRenderer {
+    /// Render the provided context.
+    pub fn render(&self, ctx: DiagnosticContext) -> Result<()> {
+        diagnositc_renderer_render(self.clone(), ctx)
+    }
+}
+
+#[repr(C)]
+#[derive(Object)]
+#[ref_name = "DiagnosticContext"]
+#[type_key = "DiagnosticContext"]
+/// A diagnostic context for recording errors against a source file.
+pub struct DiagnosticContextNode {
+    // The base type.
+    pub base: Object,
+
+    /// The Module to report against.
+    pub module: IRModule,
+
+    /// The set of diagnostics to report.
+    pub diagnostics: Array<Diagnostic>,
+
+    /// The renderer set for the context.
+    pub renderer: DiagnosticRenderer,
+}
+
+/// A diagnostic context which records active errors
+/// and contains a renderer.
+impl DiagnosticContext {
+    pub fn new<F>(module: IRModule, render_func: F) -> DiagnosticContext
+    where
+        F: Fn(DiagnosticContext) -> () + 'static,
+    {
+        let renderer = diagnostic_renderer(render_func.to_function()).unwrap();
+        let node = DiagnosticContextNode {
+            base: Object::base::<DiagnosticContextNode>(),
+            module,
+            diagnostics: Array::from_vec(vec![]).unwrap(),
+            renderer,
+        };
+        DiagnosticContext(Some(ObjectPtr::new(node)))
+    }
+
+    pub fn default(module: IRModule) -> DiagnosticContext {
+        diagnostic_context_default(module).unwrap()
+    }
+
+    /// Emit a diagnostic.
+    pub fn emit(&mut self, diagnostic: Diagnostic) -> Result<()> {
+        emit(self.clone(), diagnostic)
+    }
+
+    /// Render the errors and raise a DiagnosticError exception.
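+    ///
+    /// A sketch of the typical emit-then-render flow (illustrative; `module`
+    /// and `span` are assumed to already exist):
+    ///
+    /// ```ignore
+    /// let mut ctx = DiagnosticContext::default(module);
+    /// ctx.emit(Diagnostic::new(DiagnosticLevel::Error, span, "message".into()))?;
+    /// ctx.render()?; // returns Error::DiagnosticError if errors were emitted
+    /// ```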
+    pub fn render(&mut self) -> Result<()> {
+        diagnostic_context_render(self.clone())
+    }
+
+    /// Emit a diagnostic and then immediately attempt to render all errors.
+    pub fn emit_fatal(&mut self, diagnostic: Diagnostic) -> Result<()> {
+        self.emit(diagnostic)?;
+        self.render()?;
+        Ok(())
+    }
+}
+
+/// Override the global diagnostics renderer.
+// render_func: Option[Callable[[DiagnosticContext], None]]
+// If the render_func is None it will remove the current custom renderer
+// and return to default behavior.
+fn override_renderer<F>(opt_func: Option<F>) -> Result<()>
+where
+    F: Fn(DiagnosticContext) -> () + 'static,
+{
+    match opt_func {
+        None => clear_renderer(),
+        Some(func) => {
+            let func = func.to_function();
+            let render_factory = move || diagnostic_renderer(func.clone()).unwrap();
+
+            function::register_override(render_factory, "diagnostics.OverrideRenderer", true)?;
+
+            Ok(())
+        }
+    }
+}
diff --git a/rust/tvm/src/ir/expr.rs b/rust/tvm/src/ir/expr.rs
index 91c42f0edbcf..f74522d91c70 100644
--- a/rust/tvm/src/ir/expr.rs
+++ b/rust/tvm/src/ir/expr.rs
@@ -35,7 +35,7 @@ pub struct BaseExprNode {
 impl BaseExprNode {
     pub fn base<T: IsObject>() -> BaseExprNode {
         BaseExprNode {
-            base: Object::base_object::<T>(),
+            base: Object::base::<T>(),
         }
     }
 }
diff --git a/rust/tvm/src/ir/mod.rs b/rust/tvm/src/ir/mod.rs
index 126d0faccabb..6d5158005497 100644
--- a/rust/tvm/src/ir/mod.rs
+++ b/rust/tvm/src/ir/mod.rs
@@ -19,11 +19,13 @@
 
 pub mod arith;
 pub mod attrs;
+pub mod diagnostics;
 pub mod expr;
 pub mod function;
 pub mod module;
 pub mod op;
 pub mod relay;
+pub mod source_map;
 pub mod span;
 pub mod tir;
 pub mod ty;
diff --git a/rust/tvm/src/ir/module.rs b/rust/tvm/src/ir/module.rs
index e0444b3101da..190b477b98f2 100644
--- a/rust/tvm/src/ir/module.rs
+++ b/rust/tvm/src/ir/module.rs
@@ -16,6 +16,10 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+use std::path::Path;
+
+use thiserror::Error;
+use tvm_macros::Object;
 
 use crate::runtime::array::Array;
 use crate::runtime::function::Result;
@@ -25,16 +29,20 @@ use crate::runtime::{external, Object, ObjectRef};
 
 use super::expr::GlobalVar;
 use super::function::BaseFunc;
-
-use std::io::Result as IOResult;
-use std::path::Path;
-
-use tvm_macros::Object;
+use super::source_map::SourceMap;
 
 // TODO(@jroesch): define type
 type TypeData = ObjectRef;
 type GlobalTypeVar = ObjectRef;
 
+#[derive(Error, Debug)]
+pub enum Error {
+    #[error("{0}")]
+    IO(#[from] std::io::Error),
+    #[error("{0}")]
+    TVM(#[from] crate::runtime::Error),
+}
+
 #[repr(C)]
 #[derive(Object)]
 #[ref_name = "IRModule"]
@@ -43,6 +51,8 @@ pub struct IRModuleNode {
     pub base: Object,
     pub functions: Map<GlobalVar, BaseFunc>,
     pub type_definitions: Map<GlobalTypeVar, TypeData>,
+    pub source_map: SourceMap,
+    // TODO(@jroesch): this is missing some fields
 }
 
 external! {
@@ -113,19 +123,21 @@ external! {
 // });
 
 impl IRModule {
-    pub fn parse<N, S>(file_name: N, source: S) -> IRModule
+    pub fn parse<N, S>(file_name: N, source: S) -> Result<IRModule>
     where
         N: Into<TString>,
         S: Into<TString>,
     {
-        parse_module(file_name.into(), source.into()).expect("failed to call parser")
+        parse_module(file_name.into(), source.into())
     }
 
-    pub fn parse_file<P: AsRef<Path>>(file_path: P) -> IOResult<IRModule> {
+    pub fn parse_file<P: AsRef<Path>>(
+        file_path: P,
+    ) -> std::result::Result<IRModule, Error> {
         let file_path = file_path.as_ref();
         let file_path_as_str = file_path.to_str().unwrap().to_string();
         let source = std::fs::read_to_string(file_path)?;
-        let module = IRModule::parse(file_path_as_str, source);
+        let module = IRModule::parse(file_path_as_str, source)?;
         Ok(module)
     }
diff --git a/rust/tvm/src/ir/relay/mod.rs b/rust/tvm/src/ir/relay/mod.rs
index e539221d1db6..cc1a76bef7e3 100644
--- a/rust/tvm/src/ir/relay/mod.rs
+++ b/rust/tvm/src/ir/relay/mod.rs
@@ -22,11 +22,12 @@ pub mod attrs;
 use std::hash::Hash;
 
 use crate::runtime::array::Array;
-use crate::runtime::{object::*, String as TString};
+use crate::runtime::{object::*, IsObjectRef, String as TString};
 
 use super::attrs::Attrs;
 use super::expr::BaseExprNode;
 use super::function::BaseFuncNode;
+use super::span::Span;
 use super::ty::{Type, TypeNode};
 
 use tvm_macros::Object;
@@ -50,8 +51,8 @@ impl ExprNode {
             base: BaseExprNode::base::<T>(),
             span: ObjectRef::null(),
             checked_type: Type::from(TypeNode {
-                base: Object::base_object::<TypeNode>(),
-                span: ObjectRef::null(),
+                base: Object::base::<TypeNode>(),
+                span: Span::null(),
             }),
         }
     }
@@ -83,7 +84,7 @@ pub struct IdNode {
 impl Id {
     fn new(name_hint: TString) -> Id {
         let node = IdNode {
-            base: Object::base_object::<IdNode>(),
+            base: Object::base::<IdNode>(),
             name_hint: name_hint,
         };
         Id(Some(ObjectPtr::new(node)))
@@ -351,7 +352,7 @@ pub struct PatternNode {
 impl PatternNode {
     pub fn base<T: IsObject>() -> PatternNode {
         PatternNode {
-            base: Object::base_object::<T>(),
+            base: Object::base::<T>(),
             span: ObjectRef::null(),
         }
     }
@@ -450,7 +451,7 @@ pub struct ClauseNode {
 impl Clause {
     pub fn new(lhs: Pattern, rhs: Expr, _span: ObjectRef) -> Clause {
         let node = ClauseNode {
-            base: Object::base_object::<ClauseNode>(),
+            base: Object::base::<ClauseNode>(),
             lhs,
             rhs,
         };
@@ -553,7 +554,8 @@ def @main() -> float32 {
     0.01639530062675476f
 }
 "#,
-    );
+    )
+    .unwrap();
     let main = module
         .lookup(module.get_global_var("main".to_string().into()).unwrap())
         .unwrap();
diff --git a/rust/tvm/src/ir/source_map.rs b/rust/tvm/src/ir/source_map.rs
new file mode 100644
index 000000000000..54e16dac62ac
--- /dev/null
+++ b/rust/tvm/src/ir/source_map.rs
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use crate::runtime::map::Map;
+use crate::runtime::object::Object;
+use crate::runtime::string::String as TString;
+
+use super::span::SourceName;
+
+use tvm_macros::Object;
+
+/// A program source in any language.
+///
+/// Could represent the source from an ML framework or a source of an IRModule.
+#[repr(C)]
+#[derive(Object)]
+#[type_key = "Source"]
+#[ref_name = "Source"]
+pub struct SourceNode {
+    pub base: Object,
+    /// The source name.
+    pub source_name: SourceName,
+
+    /// The raw source.
+    pub source: TString,
+    // TODO(@jroesch): Non-ABI compat field
+    // A mapping of line breaks into the raw source.
+    // std::vector<std::pair<int, int>> line_map;
+}
+
+/// A mapping from a unique source name to source fragments.
+#[repr(C)]
+#[derive(Object)]
+#[type_key = "SourceMap"]
+#[ref_name = "SourceMap"]
+pub struct SourceMapNode {
+    /// The base object.
+    pub base: Object,
+    /// The source mapping.
+    pub source_map: Map<SourceName, Source>,
+}
diff --git a/rust/tvm/src/ir/span.rs b/rust/tvm/src/ir/span.rs
index d2e19a25a950..eb6821af69dc 100644
--- a/rust/tvm/src/ir/span.rs
+++ b/rust/tvm/src/ir/span.rs
@@ -1,22 +1,71 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
 
-use crate::runtime::ObjectRef;
+* specific language governing permissions and limitations
+* under the License.
+*/
 
-pub type Span = ObjectRef;
+use crate::runtime::{Object, ObjectPtr, String as TString};
+use tvm_macros::Object;
+
+/// A source file name, contained in a Span.
+#[repr(C)]
+#[derive(Object)]
+#[type_key = "SourceName"]
+#[ref_name = "SourceName"]
+pub struct SourceNameNode {
+    pub base: Object,
+    pub name: TString,
+}
+
+/// Span information for diagnostic purposes.
+#[repr(C)]
+#[derive(Object)]
+#[type_key = "Span"]
+#[ref_name = "Span"]
+pub struct SpanNode {
+    pub base: Object,
+    /// The source name.
+    pub source_name: SourceName,
+    /// The line number.
+    pub line: i32,
+    /// The column offset.
+    pub column: i32,
+    /// The end line number.
+    pub end_line: i32,
+    /// The end column number.
+    pub end_column: i32,
+}
+
+impl Span {
+    pub fn new(
+        source_name: SourceName,
+        line: i32,
+        end_line: i32,
+        column: i32,
+        end_column: i32,
+    ) -> Span {
+        let span_node = SpanNode {
+            base: Object::base::<SpanNode>(),
+            source_name,
+            line,
+            end_line,
+            column,
+            end_column,
+        };
+        Span(Some(ObjectPtr::new(span_node)))
+    }
+}
diff --git a/rust/tvm/src/ir/ty.rs b/rust/tvm/src/ir/ty.rs
index b6a47f553da4..d12f094a63ea 100644
--- a/rust/tvm/src/ir/ty.rs
+++ b/rust/tvm/src/ir/ty.rs
@@ -36,7 +36,7 @@ pub struct TypeNode {
 impl TypeNode {
     fn base<T: IsObject>(span: Span) -> Self {
         TypeNode {
-            base: Object::base_object::<T>(),
+            base: Object::base::<T>(),
             span,
         }
     }
 }
diff --git a/rust/tvm/src/lib.rs b/rust/tvm/src/lib.rs
index 36c750328249..7e0682b86b33 100644
--- a/rust/tvm/src/lib.rs
+++ b/rust/tvm/src/lib.rs
@@ -24,7 +24,7 @@
 //! One particular use case is that given optimized deep learning model artifacts,
 //! (compiled with TVM) which include a shared library
 //! `lib.so`, `graph.json` and a byte-array `param.params`, one can load them
-//! in Rust idomatically to create a TVM Graph Runtime and
+//! in Rust idiomatically to create a TVM Graph Runtime and
 //! run the model for some inputs and get the
 //! desired predictions *all in Rust*.
 //!
@@ -47,3 +47,28 @@ pub mod runtime;
 pub mod transform;
 
 pub use runtime::version;
+
+#[macro_export]
+macro_rules! export {
+    ($($fn_name:expr),*) => {
+        pub fn tvm_export(ns: &str) -> Result<(), tvm::Error> {
+            $(
+                let name = String::from(ns) + ::std::stringify!($fn_name);
+                tvm::runtime::function::register_override($fn_name, name, true)?;
+            )*
+            Ok(())
+        }
+    }
+}
+
+#[macro_export]
+macro_rules! export_mod {
+    ($ns:expr, $($mod_name:expr),*) => {
+        pub fn tvm_mod_export() -> Result<(), tvm::Error> {
+            $(
+                $mod_name::tvm_export($ns)?;
+            )*
+            Ok(())
+        }
+    }
+}
diff --git a/rust/tvm/src/transform.rs b/rust/tvm/src/transform.rs
index 59fc60450825..c5a65c417c93 100644
--- a/rust/tvm/src/transform.rs
+++ b/rust/tvm/src/transform.rs
@@ -50,7 +50,7 @@ impl PassInfo {
         let required = Array::from_vec(required)?;
 
         let node = PassInfoNode {
-            base: Object::base_object::<PassInfoNode>(),
+            base: Object::base::<PassInfoNode>(),
             opt_level,
             name: name.into(),
             required,
diff --git a/src/contrib/rust_extension.cc b/src/contrib/rust_extension.cc
new file mode 100644
index 000000000000..46e94fffdf55
--- /dev/null
+++ b/src/contrib/rust_extension.cc
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/contrib/rust_extension.cc
+ * \brief Expose Rust extensions initialization.
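+ *
+ * The extension registers itself through the static initializer below,
+ * so it runs as soon as the TVM library is loaded.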
+ */
+#ifdef RUST_COMPILER_EXT
+
+extern "C" {
+int compiler_ext_initialize();
+static int test = compiler_ext_initialize();
+}
+
+#endif
diff --git a/src/ir/diagnostic.cc b/src/ir/diagnostic.cc
index f9299e3e27e8..876113b85f6e 100644
--- a/src/ir/diagnostic.cc
+++ b/src/ir/diagnostic.cc
@@ -113,6 +113,7 @@ TVM_REGISTER_GLOBAL("diagnostics.DiagnosticRendererRender")
     });
 
 DiagnosticContext::DiagnosticContext(const IRModule& module, const DiagnosticRenderer& renderer) {
+  CHECK(renderer.defined()) << "can not initialize a diagnostic renderer with a null function";
   auto n = make_object<DiagnosticContextNode>();
   n->module = module;
   n->renderer = renderer;
@@ -167,6 +168,10 @@ DiagnosticContext DiagnosticContext::Default(const IRModule& module) {
   return DiagnosticContext(module, renderer);
 }
 
+TVM_REGISTER_GLOBAL("diagnostics.Default").set_body_typed([](const IRModule& module) {
+  return DiagnosticContext::Default(module);
+});
+
 std::ostream& EmitDiagnosticHeader(std::ostream& out, const Span& span, DiagnosticLevel level,
                                    std::string msg) {
   rang::fg diagnostic_color = rang::fg::reset;
diff --git a/src/parser/source_map.cc b/src/parser/source_map.cc
index 7ac978cd6341..7340f6977943 100644
--- a/src/parser/source_map.cc
+++ b/src/parser/source_map.cc
@@ -77,12 +77,6 @@ tvm::String Source::GetLine(int line) {
   return line_text;
 }
 
-// TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-//     .set_dispatch<SourceNameNode>([](const ObjectRef& ref, ReprPrinter* p) {
-//       auto* node = static_cast<const SourceNameNode*>(ref.get());
-//       p->stream << "SourceName(" << node->name << ", " << node << ")";
-//     });
-
 TVM_REGISTER_NODE_TYPE(SourceMapNode);
 
 SourceMap::SourceMap(Map<SourceName, Source> source_map) {
@@ -91,11 +85,6 @@ SourceMap::SourceMap(Map<SourceName, Source> source_map) {
   data_ = std::move(n);
 }
 
-// TODO(@jroesch): fix this
-static SourceMap global_source_map = SourceMap(Map<SourceName, Source>());
-
-SourceMap SourceMap::Global() { return global_source_map; }
-
 void SourceMap::Add(const Source& source) { (*this)->source_map.Set(source->source_name, source); }
 
 TVM_REGISTER_GLOBAL("SourceMapAdd").set_body_typed([](SourceMap map, String name, String content) {

From 3b7b5dc143fe6776eda66a36a2e7a41b79133353 Mon Sep 17 00:00:00 2001
From: alter-xp
Date: Mon, 2 Nov 2020 11:16:03 +0800
Subject: [PATCH 101/258] TF frontend: add softsign op (#6799)

---
 python/tvm/relay/frontend/tensorflow.py          | 11 +++++++++++
 tests/python/frontend/tensorflow/test_forward.py | 16 ++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index 1f5786f911cb..6a23c8da9739 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -1923,6 +1923,16 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _softsign():
+    # op description: https://www.tensorflow.org/api_docs/python/tf/math/softsign
+    def _impl(inputs, attr, params, mod):
+        abs_out = get_relay_op("abs")(inputs[0])
+        add_out = abs_out + tvm.relay.const(1, attr["T"].name)
+        return inputs[0] / add_out
+
+    return _impl
+
+
 def _softplus():
     # op description: https://www.tensorflow.org/api_docs/python/tf/math/softplus
     def _impl(inputs, attr, params, mod):
@@ -2381,6 +2391,7 @@ def _impl(inputs, attr, params, mod):
     "Slice": _slice(),
     "Softmax": _softmax(),
     "Softplus": _softplus(),
+    "Softsign": _softsign(),
     "SpaceToBatchND": _space_to_batch_nd(),
     "SpaceToDepth": _space_to_depth(),
     "Split": _split(False),
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index a5c15c751b50..94c2c440e4d1 100644
---
a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -3520,6 +3520,22 @@ def _test_forward_expm1(shape): _test_forward_expm1([2, 5, 2, 5]) +def test_forward_softsign(): + """test operator softsign """ + + def _test_forward_softsign(shape): + tf.disable_eager_execution() + np_data = np.random.uniform(1, 100, size=shape).astype(np.float32) + tf.reset_default_graph() + in_data = tf.placeholder(tf.float32, shape, name="in_data") + tf.nn.softsign(in_data, name="softsign") + compare_tf_with_tvm([np_data], ["in_data:0"], "softsign:0") + + _test_forward_softsign([1, 100]) + _test_forward_softsign([1, 10, 10]) + _test_forward_softsign([2, 5, 2, 5]) + + def test_forward_negative(): """test tf operator Neg """ np_data = np.random.uniform(-100, 255, size=(224, 224, 3)).astype(np.float32) From 6db2d3a404a3f4bcabffd440e276a1a6043c1778 Mon Sep 17 00:00:00 2001 From: Siju Samuel Date: Mon, 2 Nov 2020 13:23:01 +0530 Subject: [PATCH 102/258] [TENSORFLOW]Sparse2Dense support (#5767) * [TENSORFLOW]Sparse2Dense support * Formatting issues fixed --- python/tvm/relay/frontend/tensorflow.py | 13 ++++ .../frontend/tensorflow/test_forward.py | 77 +++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 6a23c8da9739..2c7adf03bad8 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1281,6 +1281,18 @@ def _impl(inputs, attr, params, mod): return _impl +def _sparse_to_dense(): + def _impl(inputs, attr, params, mod): + sparse_indices = inputs[0] + sparse_values = inputs[2] + default_value = inputs[3] + output_shape = attr["_output_shapes"][0] + + return _op.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value) + + return _impl + + def _bias_add(): def _impl(inputs, attr, params, mod): # Must expand for proper broadcasting in NCHW. 
@@ -2394,6 +2406,7 @@ def _impl(inputs, attr, params, mod): "Softsign": _softsign(), "SpaceToBatchND": _space_to_batch_nd(), "SpaceToDepth": _space_to_depth(), + "SparseToDense": _sparse_to_dense(), "Split": _split(False), "SplitV": _split(True), "Sqrt": AttrCvt("sqrt"), diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 94c2c440e4d1..6697cfd0d36f 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -3968,6 +3968,83 @@ def test_forward_dilation(): _test_dilation2d([1, 3, 3, 1], [2, 2, 1], [1, 1, 1, 1], [1, 1, 2, 1], "VALID") +####################################################################### +# Sparse To Dense +# --------------- +def _test_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape): + with tf.Graph().as_default(): + indices = tf.placeholder( + shape=sparse_indices.shape, dtype=str(sparse_indices.dtype), name="indices" + ) + values = tf.placeholder( + shape=sparse_values.shape, dtype=str(sparse_values.dtype), name="values" + ) + oshape = tf.constant(output_shape, shape=output_shape.shape, dtype=str(output_shape.dtype)) + + if default_value == None: + output = tf.sparse_to_dense(indices, oshape, values) + compare_tf_with_tvm( + [sparse_indices, sparse_values], ["indices:0", "values:0"], output.name + ) + else: + dv = tf.placeholder(shape=(), dtype=str(default_value.dtype), name="default_value") + output = tf.sparse_to_dense(indices, oshape, values, dv) + compare_tf_with_tvm( + [sparse_indices, sparse_values, default_value], + ["indices:0", "values:0", "default_value:0"], + output.name, + ) + + +def test_forward_sparse_to_dense(): + # scalar + _test_sparse_to_dense( + sparse_indices=np.int32(1), + sparse_values=np.int32(3), + default_value=np.int32(0), + output_shape=np.array([5]).astype("int32"), + ) + + # vector + _test_sparse_to_dense( + sparse_indices=np.array([0, 1, 4]).astype("int32"), + sparse_values=np.array([3, 3, 3]).astype("int32"), + default_value=np.int32(0), + output_shape=np.array([5]).astype("int32"), + ) + + # vector nXd + _test_sparse_to_dense( + sparse_indices=np.array([[0, 0], [1, 2]]).astype("int32"), + sparse_values=np.array([1, 2]).astype("int32"), + default_value=np.int32(0), + output_shape=np.array([3, 4]).astype("int32"), + ) + + _test_sparse_to_dense( + sparse_indices=np.array([[0, 0, 0], [1, 2, 3]]).astype("int32"), + sparse_values=np.array([1, 2]).astype("int32"), + default_value=np.int32(4), + output_shape=np.array([2, 3, 4]).astype("int32"), + ) + + # floats + _test_sparse_to_dense( + sparse_indices=np.array([0, 1, 4]).astype("int32"), + sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"), + default_value=np.float32(3.5), + output_shape=np.array([5]).astype("int32"), + ) + + # default value not specified + _test_sparse_to_dense( + sparse_indices=np.array([0, 1, 4]).astype("int32"), + sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"), + default_value=None, + output_shape=np.array([5]).astype("int32"), + ) + + ####################################################################### # infinity ops # ------------ From aca586f8de5ab07d24adde1ba99588c3e808be90 Mon Sep 17 00:00:00 2001 From: Chenfan Date: Mon, 2 Nov 2020 22:33:33 +0800 Subject: [PATCH 103/258] [AutoScheduler] New layout rewrite option: Weight pre-transpose (#6750) * Add pre transpose support for layout rewrite * Update * Bug fix * Bug fix * Update * Bug fix * CI Fix * Update * Update * Re-trigger CI * Update * Update 
test_auto_scheduler_layout_rewrite.py

* Update test_auto_scheduler_layout_rewrite.py

* Update task_scheduler ut, re-trigger CI

Co-authored-by: Lianmin Zheng
---
 include/tvm/auto_scheduler/compute_dag.h      |  32 ++-
 include/tvm/auto_scheduler/transform_step.h   |  46 +++--
 python/tvm/auto_scheduler/compute_dag.py      |   7 +-
 src/auto_scheduler/compute_dag.cc             | 182 +++++++++++++-----
 src/auto_scheduler/loop_state.cc              |   6 +
 src/auto_scheduler/transform_step.cc          |  52 +++++
 .../test_auto_scheduler_layout_rewrite.py     |  79 +++++++-
 .../test_auto_scheduler_task_scheduler.py     |  12 ++
 8 files changed, 334 insertions(+), 82 deletions(-)

diff --git a/include/tvm/auto_scheduler/compute_dag.h b/include/tvm/auto_scheduler/compute_dag.h
index 6e67fef0f283..da0d196f4912 100755
--- a/include/tvm/auto_scheduler/compute_dag.h
+++ b/include/tvm/auto_scheduler/compute_dag.h
@@ -194,6 +194,24 @@ class ComputeDAGNode : public Object {
   TVM_DECLARE_FINAL_OBJECT_INFO(ComputeDAGNode, Object);
 };

+/*!
+ * \brief Options for applying layout rewrite.
+ * This is an optimization to rewrite the layout of input tensors according to the schedule we get.
+ */
+enum class LayoutRewriteOption : int {
+  /*! \brief Do not process layout rewrite. */
+  NoRewrite = 0,
+  /*! \brief Insert layout transformation stages for input placeholders in the compute DAG */
+  InsertTransformStage = 1,
+  /*!
+   * \brief Do not insert layout transformation stages and assume the input placeholders
+   * are pre-transformed.
+   * \note The lowered function with this option does not accept the original input shapes,
+   * so this option must be used along with a layout conversion pass in Relay.
+   */
+  RewriteForPreTransformed = 2,
+};
+
 /*!
  * \brief Managed reference to ComputeDAGNode.
  * \sa ComputeDAGNode
@@ -214,8 +232,10 @@ class ComputeDAG : public ObjectRef {
    * \brief Rewrite the layout of placeholder specified by attr `layout_free_placeholders`
    * according to the loop nest derived with `transform_steps`.
    * \param transform_steps Transform steps of a state.
+   * \param layout_rewrite Different options in layout rewrite.
+   * \return The updated ComputeDAG after layout rewrite.
    */
-  void RewriteLayout(const Array<Step>& transform_steps);
+  ComputeDAG RewriteLayout(Array<Step>* transform_steps, LayoutRewriteOption layout_rewrite) const;

   /*!
    * \brief Apply the history transform steps to get a TVM schedule.
@@ -225,14 +245,14 @@ class ComputeDAG : public ObjectRef {
    * \param stage_to_axes The map that stores all axes for one stage.
    * Pass a valid pointer if this information needs to be used outside this function.
    * \param layout_rewrite Rewrite the layout of placeholders specified by
-   * attr `layout_free_placeholders`
+   * attr `layout_free_placeholders`.
    * \return A `te.schedule` and an Array of `te.Tensor` to be used in `tvm.lower`
    * or `tvm.build`.
    */
-  std::pair<te::Schedule, Array<te::Tensor>> ApplySteps(const Array<Step>& transform_steps,
-                                                        Array<te::Stage>* stages = nullptr,
-                                                        StageToAxesMap* stage_to_axes = nullptr,
-                                                        bool layout_rewrite = false) const;
+  std::pair<te::Schedule, Array<te::Tensor>> ApplySteps(
+      const Array<Step>& transform_steps, Array<te::Stage>* stages = nullptr,
+      StageToAxesMap* stage_to_axes = nullptr,
+      LayoutRewriteOption layout_rewrite = LayoutRewriteOption::NoRewrite) const;

   /*!
    * \brief Print transform steps as equivalent python schedule API.
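The three options above trade shape compatibility against an extra copy: InsertTransformStage keeps the original placeholder and adds a transpose stage, while RewriteForPreTransformed replaces the placeholder outright and expects the caller to hand in an already-tiled buffer. As a NumPy sketch of what such a pre-transformed weight buffer looks like — the helper and the particular 4x8x4x4 split are assumptions for illustration; the unit test further below checks exactly this output shape for a tiled 512x512 matmul, though the real axis order is schedule-dependent:

import numpy as np

def pre_transform_weight(w):
    # Hypothetical helper, not part of the patch: materialize one plausible
    # pre-transformed layout for a [512, 512] matmul weight whose tiled loop
    # nest reads it as a [4, 8, 4, 4, 512] buffer (4 * 8 * 4 * 4 == 512).
    return w.reshape(4, 8, 4, 4, 512)

w = np.random.randn(512, 512).astype("float32")
print(pre_transform_weight(w).shape)  # (4, 8, 4, 4, 512)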
diff --git a/include/tvm/auto_scheduler/transform_step.h b/include/tvm/auto_scheduler/transform_step.h
index 7be3554c7c5d..4cc1551e76fc 100755
--- a/include/tvm/auto_scheduler/transform_step.h
+++ b/include/tvm/auto_scheduler/transform_step.h
@@ -182,7 +182,23 @@ class StepNode : public Object {
  */
 class Step : public ObjectRef {
  public:
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Step, ObjectRef, StepNode);
+  /*!
+   * \brief CopyOnWrite function for Step.
+   * This works almost the same as a normal ObjectRef.CopyOnWrite(), but can dispatch to different
+   * steps.
+   * \return A base StepNode pointer, need to cast to its real StepNode type before doing any
+   * modifications.
+   * \code
+   *
+   *   SplitStep ref;
+   *   StepNode* mutable_ref = ref.CopyOnWrite();
+   *   dynamic_cast<SplitStepNode*>(mutable_ref)->... = ...;
+   *
+   * \endcode
+   */
+  StepNode* CopyOnWrite();
+
+  TVM_DEFINE_OBJECT_REF_METHODS(Step, ObjectRef, StepNode);
 };

 // Forward declaration
@@ -267,7 +283,7 @@ class AnnotationStepNode : public StepNode {
   static constexpr const char* record_prefix_str = "AN";

   static constexpr const char* _type_key = "auto_scheduler.AnnotationStep";
-  TVM_DECLARE_FINAL_OBJECT_INFO(AnnotationStepNode, Object);
+  TVM_DECLARE_FINAL_OBJECT_INFO(AnnotationStepNode, StepNode);
 };

 /*!
@@ -330,7 +346,7 @@ class FuseStepNode : public StepNode {
   static constexpr const char* record_prefix_str = "FU";

   static constexpr const char* _type_key = "auto_scheduler.FuseStep";
-  TVM_DECLARE_FINAL_OBJECT_INFO(FuseStepNode, Object);
+  TVM_DECLARE_FINAL_OBJECT_INFO(FuseStepNode, StepNode);
 };

 /*!
@@ -390,7 +406,7 @@ class PragmaStepNode : public StepNode {
   static constexpr const char* record_prefix_str = "PR";

   static constexpr const char* _type_key = "auto_scheduler.PragmaStep";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PragmaStepNode, Object);
+  TVM_DECLARE_FINAL_OBJECT_INFO(PragmaStepNode, StepNode);
 };

 /*!
@@ -452,7 +468,7 @@ class ReorderStepNode : public StepNode {
   static constexpr const char* record_prefix_str = "RE";

   static constexpr const char* _type_key = "auto_scheduler.ReorderStep";
-  TVM_DECLARE_FINAL_OBJECT_INFO(ReorderStepNode, Object);
+  TVM_DECLARE_FINAL_OBJECT_INFO(ReorderStepNode, StepNode);
 };

 /*!
@@ -527,7 +543,7 @@ class SplitStepNode : public StepNode {
   static constexpr const char* record_prefix_str = "SP";

   static constexpr const char* _type_key = "auto_scheduler.SplitStep";
-  TVM_DECLARE_FINAL_OBJECT_INFO(SplitStepNode, Object);
+  TVM_DECLARE_FINAL_OBJECT_INFO(SplitStepNode, StepNode);
 };

 /*!
@@ -607,7 +623,7 @@ class FollowSplitStepNode : public StepNode {
   static constexpr const char* record_prefix_str = "FSP";

   static constexpr const char* _type_key = "auto_scheduler.FollowSplitStep";
-  TVM_DECLARE_FINAL_OBJECT_INFO(FollowSplitStepNode, Object);
+  TVM_DECLARE_FINAL_OBJECT_INFO(FollowSplitStepNode, StepNode);
 };

 /*!
@@ -688,7 +704,7 @@ class FollowFusedSplitStepNode : public StepNode {
   static constexpr const char* record_prefix_str = "FFSP";

   static constexpr const char* _type_key = "auto_scheduler.FollowFusedSplitStep";
-  TVM_DECLARE_FINAL_OBJECT_INFO(FollowFusedSplitStepNode, Object);
+  TVM_DECLARE_FINAL_OBJECT_INFO(FollowFusedSplitStepNode, StepNode);
 };

 /*!
@@ -754,7 +770,7 @@ class StorageAlignStepNode : public StepNode {
   static constexpr const char* record_prefix_str = "SA";

   static constexpr const char* _type_key = "auto_scheduler.StorageAlignStep";
-  TVM_DECLARE_FINAL_OBJECT_INFO(StorageAlignStepNode, Object);
+  TVM_DECLARE_FINAL_OBJECT_INFO(StorageAlignStepNode, StepNode);
 };

 /*!
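Step::CopyOnWrite above copies the underlying node only when it is still shared, dispatching on the concrete step type before copying. A minimal Python sketch of the same copy-on-write idea — the class and the refcount check below are our stand-ins for illustration, not TVM APIs:

import copy
import sys

class Ref:
    """Minimal copy-on-write reference: copy the wrapped node only when it
    is shared, then mutate the private copy."""

    def __init__(self, node):
        self._node = node

    def copy_on_write(self):
        # sys.getrefcount is a rough stand-in for the unique() check on the
        # C++ ObjectPtr; copy.copy dispatches on the node's concrete type.
        if sys.getrefcount(self._node) > 2:
            self._node = copy.copy(self._node)
        return self._node

class SplitNode:
    def __init__(self, stage_id):
        self.stage_id = stage_id

shared = SplitNode(0)
a, b = Ref(shared), Ref(shared)
a.copy_on_write().stage_id = 1          # 'a' copies first because the node is shared
print(shared.stage_id, a._node.stage_id)  # 0 1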
@@ -822,7 +838,7 @@ class ComputeAtStepNode : public StepNode { static constexpr const char* record_prefix_str = "CA"; static constexpr const char* _type_key = "auto_scheduler.ComputeAtStep"; - TVM_DECLARE_FINAL_OBJECT_INFO(ComputeAtStepNode, Object); + TVM_DECLARE_FINAL_OBJECT_INFO(ComputeAtStepNode, StepNode); }; /*! @@ -879,7 +895,7 @@ class ComputeInlineStepNode : public StepNode { static constexpr const char* record_prefix_str = "CI"; static constexpr const char* _type_key = "auto_scheduler.ComputeInlineStep"; - TVM_DECLARE_FINAL_OBJECT_INFO(ComputeInlineStepNode, Object); + TVM_DECLARE_FINAL_OBJECT_INFO(ComputeInlineStepNode, StepNode); }; /*! @@ -938,7 +954,7 @@ class ComputeRootStepNode : public StepNode { static constexpr const char* record_prefix_str = "CR"; static constexpr const char* _type_key = "auto_scheduler.ComputeRootStep"; - TVM_DECLARE_FINAL_OBJECT_INFO(ComputeRootStepNode, Object); + TVM_DECLARE_FINAL_OBJECT_INFO(ComputeRootStepNode, StepNode); }; /*! @@ -1010,7 +1026,7 @@ class CacheReadStepNode : public StepNode { static constexpr const char* record_prefix_str = "CHR"; static constexpr const char* _type_key = "auto_scheduler.CacheReadStep"; - TVM_DECLARE_FINAL_OBJECT_INFO(CacheReadStepNode, Object); + TVM_DECLARE_FINAL_OBJECT_INFO(CacheReadStepNode, StepNode); }; /*! @@ -1081,7 +1097,7 @@ class CacheWriteStepNode : public StepNode { static constexpr const char* record_prefix_str = "CHW"; static constexpr const char* _type_key = "auto_scheduler.CacheWriteStep"; - TVM_DECLARE_FINAL_OBJECT_INFO(CacheWriteStepNode, Object); + TVM_DECLARE_FINAL_OBJECT_INFO(CacheWriteStepNode, StepNode); }; /*! @@ -1148,7 +1164,7 @@ class RfactorStepNode : public StepNode { static constexpr const char* record_prefix_str = "RF"; static constexpr const char* _type_key = "auto_scheduler.RfactorStep"; - TVM_DECLARE_FINAL_OBJECT_INFO(RfactorStepNode, Object); + TVM_DECLARE_FINAL_OBJECT_INFO(RfactorStepNode, StepNode); }; /*! diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py index 2fc0d7d0bf8c..d50ff395b679 100755 --- a/python/tvm/auto_scheduler/compute_dag.py +++ b/python/tvm/auto_scheduler/compute_dag.py @@ -51,6 +51,11 @@ class ComputeDAG(Object): Input/output tensors or workload key for a compute declaration. """ + # Layout Rewrite Options + NoRewrite = 0 + InsertTransformStage = 1 + RewriteForPreTransformed = 2 + def __init__(self, compute_or_sche): if isinstance(compute_or_sche, str): compute = workload_key_to_tensors(compute_or_sche) @@ -81,7 +86,7 @@ def get_init_state(self): """ return State(self.init_state, self) - def apply_steps_from_state(self, state, layout_rewrite=False): + def apply_steps_from_state(self, state, layout_rewrite=NoRewrite): """ Apply the history transform steps from a State to get a TVM schedule. 
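With the new integer class attributes, callers select the rewrite behaviour by value when replaying transform steps. A minimal usage sketch, assuming `auto_scheduler.ComputeDAG` accepts a list of te tensors (as the tests further below do via helpers) and that the attributes are exported as shown in this patch:

import tvm
from tvm import te, auto_scheduler

# Shapes follow the unit test below; a plain matmul compute declaration.
A = te.placeholder((512, 512), name="A")
B = te.placeholder((512, 512), name="B")
k = te.reduce_axis((0, 512), name="k")
C = te.compute((512, 512), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")

dag = auto_scheduler.ComputeDAG([A, B, C])
state = dag.get_init_state()

# NoRewrite is the default; the other two options change how the weight
# placeholder is laid out when the schedule is replayed.
sch, bufs = dag.apply_steps_from_state(state)
sch, bufs = dag.apply_steps_from_state(
    state, layout_rewrite=auto_scheduler.ComputeDAG.InsertTransformStage
)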
diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
index c6cf094ee202..090e6daf9859 100755
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -796,8 +796,8 @@ std::string GetOrigLayout(std::set<std::string>* placeholder_axis_names, const t
   return orig_layout;
 }

-std::string GetNewLayout(Array<PrimExpr>* new_shape, const State& state, const int stage_id,
-                         const Stage& stage, const te::Operation& op, const te::Tensor& placeholder,
+std::string GetNewLayout(const State& state, const int stage_id, const Stage& stage,
+                         const te::Operation& op, const te::Tensor& placeholder,
                          const std::set<std::string>& placeholder_axis_names) {
   std::ostringstream os;
   Array<Iterator> stage_iters;
@@ -852,7 +852,6 @@ std::string GetNewLayout(Array<PrimExpr>* new_shape, const State& state, const i
     if (placeholder_axis_names.count(ori_iter_name)) {
       os << iter->range->extent << ori_iter_name;
       new_names.push_back(ori_iter_name);
-      new_shape->push_back(iter->range->extent);
     }
   }
   std::string new_layout = os.str();
@@ -862,16 +861,22 @@ std::string GetNewLayout(Array<PrimExpr>* new_shape, const State& state, const i
   return new_layout;
 }

-void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {
-  ComputeDAGNode* p_dag = this->CopyOnWrite();
+ComputeDAG ComputeDAG::RewriteLayout(Array<Step>* transform_steps,
+                                     LayoutRewriteOption layout_rewrite) const {
+  CHECK(layout_rewrite != LayoutRewriteOption::NoRewrite)
+      << "Call ComputeDAG::RewriteLayout with NoRewrite.";
+  ComputeDAG new_dag = *this;
+  ComputeDAGNode* p_dag = new_dag.CopyOnWrite();
+
   auto node = make_object<StateNode>();
-  node->transform_steps = transform_steps;
+  node->transform_steps = *transform_steps;
   node->concrete = true;
   const State& state = InferBound(State(node));
+
   OperationSet handled_ops;
-  int stage_id = -1;
-  for (const auto& stage : state->stages) {
-    stage_id += 1;
+  for (size_t stage_id = 0; stage_id < state->stages.size(); stage_id++) {
+    const auto& stage = state->stages[stage_id];
+
     const te::Operation& op = stage->op;
     if (!op->IsInstance<te::ComputeOpNode>()) {
       continue;
@@ -881,15 +886,13 @@ void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {
       continue;
     }
     const ObjectRef& attr_value = attrs[layout_free_placeholders_key];
-    Array<te::Tensor> placeholders = Downcast<Array<te::Tensor>>(attr_value);
-    for (const auto& placeholder : placeholders) {
+    for (const auto& placeholder : Downcast<Array<te::Tensor>>(attr_value)) {
       const auto& placeholder_op = placeholder->op;

       // Check whether this placeholder has already been handled
       if (handled_ops.count(placeholder_op)) {
         continue;
       }
-
       // Skip the op that is not direct consumer of this placeholder.
       // This is usually caused by cache read/write.
       bool direct_consumer = false;
@@ -902,28 +905,89 @@ void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {
       if (!direct_consumer) {
         continue;
       }
+      handled_ops.insert(placeholder_op);

+      // Process original layout
       std::set<std::string> placeholder_axis_names;
-      GetOrigLayout(&placeholder_axis_names, op, placeholder);
+      std::string origin_layout = GetOrigLayout(&placeholder_axis_names, op, placeholder);
+      Array<PrimExpr> origin_shape;
+      std::vector<std::string> origin_axes;
+      ParseKernelLayout(origin_layout, &origin_shape, &origin_axes);

-      Array<PrimExpr> new_shape;
+      // Process new layout
       std::string new_layout =
-          GetNewLayout(&new_shape, state, stage_id, stage, op, placeholder, placeholder_axis_names);
-
-      handled_ops.insert(placeholder_op);
-
-      Array<te::Operation> old_ops = p_dag->ops;
-      ArrayNode* pops = p_dag->ops.CopyOnWrite();
-
-      // Create new placeholder
-      te::Operation new_placeholder_op;
-      new_placeholder_op = te::PlaceholderOp(placeholder_op->name, new_shape,
+          GetNewLayout(state, stage_id, stage, op, placeholder, placeholder_axis_names);
+      Array<PrimExpr> new_shape;
+      std::vector<std::string> new_axes;
+      ParseKernelLayout(new_layout, &new_shape, &new_axes);
+
+      // Process op updates
+      te::Operation new_op_to_update;
+      if (layout_rewrite == LayoutRewriteOption::RewriteForPreTransformed) {
+        // Create new placeholder
+        new_op_to_update = te::PlaceholderOp(placeholder_op->name, new_shape,
                                              placeholder_op.as<te::PlaceholderOpNode>()->dtype);
+      } else if (layout_rewrite == LayoutRewriteOption::InsertTransformStage) {
+        // Process index strides
+        std::unordered_map<std::string, PrimExpr> axes_stride;
+        for (const auto& i : origin_axes) {
+          axes_stride[i] = Integer(1);
+        }
+        Array<PrimExpr> new_stride(new_shape.size(), PrimExpr());
+        PrimExpr temp = Integer(1);
+        for (int i = new_shape.size() - 1; i >= 0; i--) {
+          new_stride.Set(i, axes_stride[new_axes[i]]);
+          axes_stride[new_axes[i]] *= new_shape[i];
+        }

+        // Add extra layout transpose stage
+        const auto& layout_transform_tensor = te::compute(
+            new_shape,
+            [&new_stride, &placeholder_op, &origin_shape, &new_shape, &origin_axes,
+             &new_axes](const tvm::runtime::Array<tvm::tir::Var>& indices) -> tvm::PrimExpr {
+              Array<PrimExpr> access_indices;
+              for (size_t indice_index = 0; indice_index < origin_shape.size(); indice_index++) {
+                PrimExpr temp = Integer(0);
+                for (size_t i = 0; i < new_shape.size(); i++) {
+                  if (origin_axes[indice_index].compare(new_axes[i]) == 0) {
+                    temp += indices[i] * new_stride[i];
+                  }
+                }
+                access_indices.push_back(temp);
+              }
+              return placeholder_op.output(0)(access_indices);
+            },
+            "auto_schedule_layout_transpose");
+        new_op_to_update = layout_transform_tensor->op;
+
+        // Update the transform steps
+        for (size_t i = 0; i < transform_steps->size(); i++) {
+          Step step = (*transform_steps)[i];
+          if (step->stage_id >= static_cast<int>(stage_id)) {
+            step.CopyOnWrite()->stage_id++;
+          }
+          if (step->IsInstance<ComputeAtStepNode>()) {
+            auto compute_at_step = tvm::Downcast<ComputeAtStep>(step);
+            if (compute_at_step->target_stage_id >= static_cast<int>(stage_id)) {
+              dynamic_cast<ComputeAtStepNode*>(compute_at_step.CopyOnWrite())->target_stage_id++;
+            }
+            transform_steps->Set(i, std::move(compute_at_step));
+          } else {
+            transform_steps->Set(i, std::move(step));
+          }
+        }
+        Array<Integer> to_fuse;
+        for (size_t i = 0; i < new_shape.size() - 1; i++) {
+          to_fuse.push_back(i);
+        }
+        transform_steps->push_back(FuseStep(stage_id, to_fuse));
+        transform_steps->push_back(AnnotationStep(stage_id, 0, IteratorAnnotation::kParallel));
+      }

-      te::Operation new_compute_op, old_compute_op;
+      te::Operation new_compute_op, original_compute_op;
       Array<PrimExpr> new_body;
       IndexRewriter index_rewriter(placeholder_op, new_layout);
-      for (auto& op : old_ops) {
+      for (const auto& op : p_dag->ops) {
         if (auto* pop = op.as<te::ComputeOpNode>()) {
           bool need_update = false;
           for (auto& t : op->InputTensors()) {
@@ -933,35 +997,45 @@ void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {
             }
           }
           if (need_update) {
-            for (auto& body : pop->body) {
+            for (const auto& body : pop->body) {
               new_body.push_back(index_rewriter.Rewrite(body));
             }
-            old_compute_op = op;
-            ICHECK(!new_compute_op.defined());
+            original_compute_op = op;
+            CHECK(!new_compute_op.defined());
             new_compute_op = te::ComputeOp(pop->name, pop->tag, pop->attrs, pop->axis, new_body);
           }
         }
       }

-      // construct the map from old_op to new_op
+      // construct the map from original_op to new_op
       std::unordered_map<te::Operation, te::Operation> updated_ops;
-      for (size_t i = 0; i < old_ops.size(); ++i) {
-        auto old_op = old_ops[i];
-        if (old_op == placeholder_op) {
-          pops->SetItem(i, new_placeholder_op);
-          updated_ops[placeholder_op] = new_placeholder_op;
-        } else if (old_op == old_compute_op) {
-          pops->SetItem(i, new_compute_op);
-          updated_ops[old_compute_op] = new_compute_op;
+
+      Array<te::Operation> original_ops = p_dag->ops;
+      p_dag->ops.clear();
+      for (size_t i = 0; i < original_ops.size(); ++i) {
+        const auto& original_op = original_ops[i];
+        if (original_op == placeholder_op) {
+          if (layout_rewrite == LayoutRewriteOption::InsertTransformStage) {
+            p_dag->ops.push_back(placeholder_op);
+          }
+          p_dag->ops.push_back(new_op_to_update);
+          updated_ops[placeholder_op] = new_op_to_update;
+        } else if (original_op == original_compute_op) {
+          p_dag->ops.push_back(new_compute_op);
+          updated_ops[original_compute_op] = new_compute_op;
         } else {
-          pops->SetItem(i, old_op);
+          p_dag->ops.push_back(original_op);
         }
       }

+      ArrayNode* pops = p_dag->ops.CopyOnWrite();
       // Because ops is sorted in topo-order, only do one pass linear scan here.
       for (size_t i = 0; i < pops->size(); ++i) {
-        auto old_op = Downcast<te::Operation>(pops->at(i));
-        if (auto* pop = old_op.as<te::ComputeOpNode>()) {
+        const auto& original_op = Downcast<te::Operation>(pops->at(i));
+        if (auto* pop = original_op.as<te::ComputeOpNode>()) {
+          if (original_op == new_op_to_update) {
+            continue;
+          }
           auto inputs = pop->InputTensors();
           std::unordered_map<te::Tensor, te::Tensor> rmap;
           for (auto input : inputs) {
@@ -977,8 +1051,8 @@ void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {
             }
           }
           if (!rmap.empty()) {
-            te::Operation new_op = pop->ReplaceInputs(old_op, rmap);
-            updated_ops[old_op] = new_op;
+            te::Operation new_op = pop->ReplaceInputs(original_op, rmap);
+            updated_ops[original_op] = new_op;
             pops->SetItem(i, new_op);
           }
         }
@@ -986,9 +1060,12 @@ void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {

       Array<te::Tensor> old_tensors = p_dag->tensors;
       ArrayNode* p_tensors = p_dag->tensors.CopyOnWrite();
-
       for (size_t i = 0; i < old_tensors.size(); ++i) {
         const auto& old_tensor = old_tensors[i];
+        if (layout_rewrite != LayoutRewriteOption::RewriteForPreTransformed &&
+            old_tensor->op->IsInstance<te::PlaceholderOpNode>()) {
+          continue;
+        }
         auto it = updated_ops.find(old_tensor->op);
         te::Operation new_op;
         while (it != updated_ops.end()) {
@@ -1018,15 +1095,17 @@ void ComputeDAG::RewriteLayout(const Array<Step>& transform_steps) {
   }
   p_dag->flop_ct = FlopEstimator().EstimateFlop(p_dag->ops);
   p_dag->init_state = State(p_dag->ops);
+
+  return new_dag;
 }

 std::pair<te::Schedule, Array<te::Tensor>> ComputeDAG::ApplySteps(
     const Array<Step>& transform_steps, Array<te::Stage>* stages, StageToAxesMap* stage_to_axes,
-    bool layout_rewrite) const {
-  if (layout_rewrite && !transform_steps.empty()) {
-    ComputeDAG new_dag = *this;
-    new_dag.RewriteLayout(transform_steps);
-    return new_dag.ApplySteps(transform_steps, stages, stage_to_axes, false);
+    LayoutRewriteOption layout_rewrite) const {
+  if (layout_rewrite != LayoutRewriteOption::NoRewrite && !transform_steps.empty()) {
+    Array<Step> steps = transform_steps;
+    const auto& dag = RewriteLayout(&steps, layout_rewrite);
+    return dag.ApplySteps(steps);
   }

   // Temporary objects to be used if the input pointer is nullptr
@@ -1305,11 +1384,12 @@ TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAG")
     });

 TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGApplyStepsFromState")
-    .set_body_typed([](const ComputeDAG& dag, const State& state, const bool layout_rewrite) {
+    .set_body_typed([](const ComputeDAG& dag, const State& state, int layout_rewrite) {
      te::Schedule sch;
      Array<te::Tensor> return_tensors;
      std::tie(sch, return_tensors) =
-          dag.ApplySteps(state->transform_steps, nullptr, nullptr, layout_rewrite);
+          dag.ApplySteps(state->transform_steps, nullptr, nullptr,
+                         static_cast<LayoutRewriteOption>(layout_rewrite));
      return Array<ObjectRef>{sch, return_tensors};
     });

diff --git a/src/auto_scheduler/loop_state.cc b/src/auto_scheduler/loop_state.cc
index 23d6eb64da6c..517f7ff91f55 100755
--- a/src/auto_scheduler/loop_state.cc
+++ b/src/auto_scheduler/loop_state.cc
@@ -445,6 +445,12 @@ String State::ToStr(bool delete_trivial_loop) const {
   return os.str();
 }

+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<StageNode>([](const ObjectRef& ref, ReprPrinter* p) {
+      const auto& stage = tvm::Downcast<Stage>(ref);
+      p->stream << stage->GetTypeKey() << "(" << stage.get() << ": " << stage->op->name << ")";
+    });
+
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<StateNode>([](const ObjectRef& ref, ReprPrinter* p) {
       PrintState(&p->stream, tvm::Downcast<State>(ref), true);
diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc
index 852f1e1f17d8..5560907dcffa 100755
--- a/src/auto_scheduler/transform_step.cc
+++ b/src/auto_scheduler/transform_step.cc
@@ -122,6 +122,58 @@ const char* IteratorAnnotationString[] = {
     "tensorize"      // kTensorized = 11
 };

+StepNode* Step::CopyOnWrite() {
+  CHECK(data_ != nullptr);
+  if (!data_.unique()) {
+    if (const auto& ps = as<AnnotationStepNode>()) {
+      auto n = make_object<AnnotationStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<FuseStepNode>()) {
+      auto n = make_object<FuseStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<PragmaStepNode>()) {
+      auto n = make_object<PragmaStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<ReorderStepNode>()) {
+      auto n = make_object<ReorderStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<SplitStepNode>()) {
+      auto n = make_object<SplitStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<FollowSplitStepNode>()) {
+      auto n = make_object<FollowSplitStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<FollowFusedSplitStepNode>()) {
+      auto n = make_object<FollowFusedSplitStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<StorageAlignStepNode>()) {
+      auto n = make_object<StorageAlignStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<ComputeAtStepNode>()) {
+      auto n = make_object<ComputeAtStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<ComputeInlineStepNode>()) {
+      auto n = make_object<ComputeInlineStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<ComputeRootStepNode>()) {
+      auto n = make_object<ComputeRootStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<CacheReadStepNode>()) {
+      auto n = make_object<CacheReadStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<CacheWriteStepNode>()) {
+      auto n = make_object<CacheWriteStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else if (const auto& ps = as<RfactorStepNode>()) {
+      auto n = make_object<RfactorStepNode>(*ps);
+      ObjectPtr<Object>(std::move(n)).swap(data_);
+    } else {
+      LOG(FATAL) << "Invalid step: " << (*this);
+    }
+  }
+  return static_cast<StepNode*>(data_.get());
+}
+
 Step
StepReadFromRecord(dmlc::JSONReader* reader) { std::string name; bool s; diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py index 3ce7a438eef4..4a11d0fb0ca0 100644 --- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py +++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py @@ -28,18 +28,26 @@ def test_apply_steps_with_layout_rewrite(): dag, s = get_tiled_matmul() - _, bufs = dag.apply_steps_from_state(s, layout_rewrite=False) + _, bufs = dag.apply_steps_from_state(s) assert bufs[1].shape[0] == 512 assert bufs[1].shape[1] == 512 - _, bufs = dag.apply_steps_from_state(s, layout_rewrite=True) + _, bufs = dag.apply_steps_from_state( + s, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.RewriteForPreTransformed + ) assert bufs[1].shape[0] == 4 assert bufs[1].shape[1] == 8 assert bufs[1].shape[2] == 4 assert bufs[1].shape[3] == 4 assert bufs[1].shape[4] == 512 + _, bufs = dag.apply_steps_from_state( + s, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.InsertTransformStage + ) + assert bufs[1].shape[0] == 512 + assert bufs[1].shape[1] == 512 -def test_layout_rewrite_correctness(): +@tvm.testing.requires_llvm +def test_correctness_layout_rewrite_rewrite_for_preTransformed(): N = 128 target = tvm.target.Target("llvm") task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), target) @@ -50,16 +58,19 @@ def test_layout_rewrite_correctness(): search_policy = auto_scheduler.SketchPolicy(task) + measure_ctx = auto_scheduler.LocalRPCMeasureContext() tuning_options = auto_scheduler.TuningOptions( num_measure_trials=2, - runner="local", + runner=measure_ctx.runner, verbose=1, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) auto_scheduler.auto_schedule(task, search_policy, tuning_options) inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target) - s, bufs = dag.apply_steps_from_state(inp.state, layout_rewrite=True) - s_ref, bufs_ref = dag.apply_steps_from_state(inp.state, layout_rewrite=False) + s, bufs = dag.apply_steps_from_state( + inp.state, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.RewriteForPreTransformed + ) + s_ref, bufs_ref = dag.apply_steps_from_state(inp.state) np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs] np_args_ref = [np.array(x) for x in np_args] @@ -100,10 +111,60 @@ def test_layout_rewrite_correctness(): func_ref(*args_ref) ctx.sync() - np.testing.assert_allclose(np_args[0], np_args_ref[0]) - np.testing.assert_allclose(np_args[2], np_args_ref[2]) + tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), rtol=1e-4) + tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), rtol=1e-4) + del measure_ctx + + +@tvm.testing.requires_llvm +def test_correctness_layout_rewrite_insert_transform_stage(): + N = 128 + target = tvm.target.Target("llvm") + task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), target) + dag = task.compute_dag + + with tempfile.NamedTemporaryFile() as fp: + log_file = fp.name + + search_policy = auto_scheduler.SketchPolicy(task) + + measure_ctx = auto_scheduler.LocalRPCMeasureContext() + tuning_options = auto_scheduler.TuningOptions( + num_measure_trials=2, + runner=measure_ctx.runner, + verbose=1, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + auto_scheduler.auto_schedule(task, search_policy, tuning_options) + inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target) + s, bufs = 
dag.apply_steps_from_state( + inp.state, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.InsertTransformStage + ) + + s_ref, bufs_ref = dag.apply_steps_from_state(inp.state) + np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs] + + func = tvm.build(s, bufs, target=target) + func_ref = tvm.build(s_ref, bufs_ref, target=target) + + ctx = tvm.context(str(target)) + ctx_ref = tvm.cpu() + + args = [tvm.nd.array(x, ctx=ctx) for x in np_args] + args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args] + ctx.sync() + + func(*args) + func_ref(*args_ref) + ctx.sync() + + tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), rtol=1e-4) + tvm.testing.assert_allclose(args[1].asnumpy(), args_ref[1].asnumpy(), rtol=1e-4) + tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), rtol=1e-4) + del measure_ctx if __name__ == "__main__": test_apply_steps_with_layout_rewrite() - test_layout_rewrite_correctness() + test_correctness_layout_rewrite_rewrite_for_preTransformed() + test_correctness_layout_rewrite_insert_transform_stage() diff --git a/tests/python/unittest/test_auto_scheduler_task_scheduler.py b/tests/python/unittest/test_auto_scheduler_task_scheduler.py index 7851d922013d..2debc14fc356 100644 --- a/tests/python/unittest/test_auto_scheduler_task_scheduler.py +++ b/tests/python/unittest/test_auto_scheduler_task_scheduler.py @@ -21,11 +21,14 @@ import multiprocessing import numpy as np +import tvm +import tvm.testing from tvm import auto_scheduler from test_auto_scheduler_common import matmul_auto_scheduler_test +@tvm.testing.requires_llvm def test_task_scheduler_round_robin(): tasks = [] for n in [2, 4, 8]: @@ -39,8 +42,10 @@ def objective_func(costs): num_trials_per_task = 2 # Tune all tasks + measure_ctx = auto_scheduler.LocalRPCMeasureContext() tune_option = auto_scheduler.TuningOptions( num_measure_trials=num_trials_per_task * len(tasks), + runner=measure_ctx.runner, num_measures_per_round=1, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) @@ -67,13 +72,16 @@ def objective_func(costs): num_measures_per_round=1, ) task_scheduler.tune(tune_option, search_policy="sketch.random") + del measure_ctx +@tvm.testing.requires_llvm def task_scheduler_round_robin_spawn(): assert multiprocessing.get_start_method(False) == "spawn" test_task_scheduler_round_robin() +@tvm.testing.requires_llvm def test_task_scheduler_round_robin_spawn(): ctx = multiprocessing.get_context("spawn") p = ctx.Process(target=task_scheduler_round_robin_spawn) @@ -81,6 +89,7 @@ def test_task_scheduler_round_robin_spawn(): p.join() +@tvm.testing.requires_llvm def test_task_scheduler_gradient(): tasks = [] for n in [2, 4]: @@ -95,8 +104,10 @@ def objective_func(costs): n_trials = 5 # Tune all tasks + measure_ctx = auto_scheduler.LocalRPCMeasureContext() tune_option = auto_scheduler.TuningOptions( num_measure_trials=n_trials, + runner=measure_ctx.runner, num_measures_per_round=1, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) @@ -118,6 +129,7 @@ def objective_func(costs): assert counters[tasks[0].workload_key] == n_trials - 1 assert counters[tasks[1].workload_key] == 1 + del measure_ctx if __name__ == "__main__": From 17973cfedb7d9722aa91ee796b86ca3f0b70fc4d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 2 Nov 2020 12:05:47 -0500 Subject: [PATCH 104/258] Update stale link to new location (#6819) --- docs/vta/dev/hardware.rst | 12 ++++++------ docs/vta/dev/index.rst | 2 +- docs/vta/install.rst | 2 +- vta/python/vta/bitstream.py | 2 +- 
vta/tutorials/matrix_multiply.py | 6 +++--- vta/tutorials/optimize/convolution_opt.py | 6 +++--- vta/tutorials/optimize/matrix_multiply_opt.py | 4 ++-- vta/tutorials/vta_get_started.py | 2 +- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/vta/dev/hardware.rst b/docs/vta/dev/hardware.rst index c8d543330728..1e3c0acdb185 100644 --- a/docs/vta/dev/hardware.rst +++ b/docs/vta/dev/hardware.rst @@ -36,7 +36,7 @@ In addition the design adopts decoupled access-execute to hide memory access lat To a broader extent, VTA can serve as a template deep learning accelerator design for full stack optimization, exposing a generic tensor computation interface to the compiler stack. -.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/blogpost/vta_overview.png +.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/blogpost/vta_overview.png :align: center :width: 80% @@ -175,7 +175,7 @@ Finally, the ``STORE`` instructions are executed by the store module exclusively The fields of each instruction is described in the figure below. The meaning of each field will be further explained in the :ref:`vta-uarch` section. -.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/vta_instructions.png +.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/vta_instructions.png :align: center :width: 100% @@ -191,7 +191,7 @@ VTA relies on dependence FIFO queues between hardware modules to synchronize the The figure below shows how a given hardware module can execute concurrently from its producer and consumer modules in a dataflow fashion through the use of dependence FIFO queues, and single-reader/single-writer SRAM buffers. Each module is connected to its consumer and producer via read-after-write (RAW) and write-after-read (WAR) dependence queues. -.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/dataflow.png +.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/dataflow.png :align: center :width: 100% @@ -258,7 +258,7 @@ There are two types of compute micro-ops: ALU and GEMM operations. To minimize the footprint of micro-op kernels, while avoiding the need for control-flow instructions such as conditional jumps, the compute module executes micro-op sequences inside a two-level nested loop that computes the location of each tensor register location via an affine function. This compression approach helps reduce the micro-kernel instruction footprint, and applies to both matrix multiplication and 2D convolution, commonly found in neural network operators. -.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/gemm_core.png +.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/gemm_core.png :align: center :width: 100% @@ -269,7 +269,7 @@ This tensorization intrinsic is defined by the dimensions of the input, weight a Each data type can have a different integer precision: typically both weight and input types are low-precision (8-bits or less), while the accumulator tensor has a wider type to prevent overflows (32-bits). In order to keep the GEMM core busy, each of the input buffer, weight buffer, and register file have to expose sufficient read/write bandwidth. -.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/alu_core.png +.. 
image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/alu_core.png :align: center :width: 100% @@ -289,7 +289,7 @@ The micro-code in the context of tensor ALU computation only takes care of speci Load and Store Modules ~~~~~~~~~~~~~~~~~~~~~~ -.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/developer/2d_dma.png +.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/2d_dma.png :align: center :width: 100% diff --git a/docs/vta/dev/index.rst b/docs/vta/dev/index.rst index d95f6e23d90d..2b715740ed29 100644 --- a/docs/vta/dev/index.rst +++ b/docs/vta/dev/index.rst @@ -20,7 +20,7 @@ VTA Design and Developer Guide This developer guide details the complete VTA-TVM hardware-software stack. -.. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/blogpost/vta_stack.png +.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/blogpost/vta_stack.png :align: center :width: 60% diff --git a/docs/vta/install.rst b/docs/vta/install.rst index 4cd1ee93a6e6..bb5c1c9c9669 100644 --- a/docs/vta/install.rst +++ b/docs/vta/install.rst @@ -202,7 +202,7 @@ This time again, we will run the 2D convolution testbench. Beforehand, we need to program the Pynq board FPGA with a VTA bitstream, and build the VTA runtime via RPC. The following ``test_program_rpc.py`` script will perform two operations: -* FPGA programming, by downloading a pre-compiled bitstream from a `VTA bitstream repository `_ that matches the default ``vta_config.json`` configuration set by the host, and sending it over to the Pynq via RPC to program the Pynq's FPGA. +* FPGA programming, by downloading a pre-compiled bitstream from a `VTA bitstream repository `_ that matches the default ``vta_config.json`` configuration set by the host, and sending it over to the Pynq via RPC to program the Pynq's FPGA. * Runtime building on the Pynq, which needs to be run every time the ``vta_config.json`` configuration is modified. This ensures that the VTA software runtime that generates the accelerator's executable via just-in-time (JIT) compilation matches the specifications of the VTA design that is programmed on the FPGA. The build process takes about 30 seconds to complete so be patient! .. code:: bash diff --git a/vta/python/vta/bitstream.py b/vta/python/vta/bitstream.py index 254243d59543..3f7064061c06 100644 --- a/vta/python/vta/bitstream.py +++ b/vta/python/vta/bitstream.py @@ -29,7 +29,7 @@ import urllib2 # bitstream repo -BITSTREAM_URL = "https://github.com/uwsaml/vta-distro/raw/master/bitstreams/" +BITSTREAM_URL = "https://github.com/uwsampl/vta-distro/raw/master/bitstreams/" def get_bitstream_path(): diff --git a/vta/tutorials/matrix_multiply.py b/vta/tutorials/matrix_multiply.py index 60ea9ca275c1..593ac3c5a0ee 100644 --- a/vta/tutorials/matrix_multiply.py +++ b/vta/tutorials/matrix_multiply.py @@ -86,7 +86,7 @@ # The last operation is a cast and copy back to DRAM, into results tensor # :code:`C`. # -# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/tutorial/gemm_dataflow.png +# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/gemm_dataflow.png # :align: center ###################################################################### @@ -107,7 +107,7 @@ # adding the result matrix to an accumulator matrix, as shown in the # figure below. # -# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/tutorial/tensor_core.png +# .. 
image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/tensor_core.png # :align: center # :width: 480px # @@ -126,7 +126,7 @@ # contiguous. # The resulting tiled tensor has a shape of (2, 4, 2, 2). # -# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/tutorial/data_tiling.png +# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/data_tiling.png # :align: center # :width: 480px # diff --git a/vta/tutorials/optimize/convolution_opt.py b/vta/tutorials/optimize/convolution_opt.py index e991ffe18f79..185b71fdc210 100644 --- a/vta/tutorials/optimize/convolution_opt.py +++ b/vta/tutorials/optimize/convolution_opt.py @@ -93,7 +93,7 @@ # convolution followed by a rectified linear activation. # We describe the TVM dataflow graph of the 2D convolution layer below: # -# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/tutorial/conv2d_dataflow.png +# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/conv2d_dataflow.png # :align: center # # This computation is intentionally too large to fit onto VTA's on-chip @@ -120,7 +120,7 @@ # loaded from DRAM into VTA's SRAM, following a 2D strided and padded memory # read. # -# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/tutorial/padding.png +# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/padding.png # :align: center # :width: 480px @@ -292,7 +292,7 @@ # We show how work is split when computing the 2D convolution in the figure # below. # -# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/tutorial/virtual_threading.png +# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/virtual_threading.png # :align: center # :width: 480px diff --git a/vta/tutorials/optimize/matrix_multiply_opt.py b/vta/tutorials/optimize/matrix_multiply_opt.py index 44552db21688..c9d1c137fbff 100644 --- a/vta/tutorials/optimize/matrix_multiply_opt.py +++ b/vta/tutorials/optimize/matrix_multiply_opt.py @@ -88,7 +88,7 @@ # matrix multiplication followed by a rectified linear activation. # We describe the TVM dataflow graph of the fully connected layer below: # -# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/tutorial/fc_dataflow.png +# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/fc_dataflow.png # :align: center # # This computation is intentionally too large to fit onto VTA's on-chip @@ -183,7 +183,7 @@ # We show the outcome of blocking on the computation schedule in the diagram # below: # -# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/tutorial/blocking.png +# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/blocking.png # :align: center # :width: 480px # diff --git a/vta/tutorials/vta_get_started.py b/vta/tutorials/vta_get_started.py index 16d613581b57..1a097b804a31 100644 --- a/vta/tutorials/vta_get_started.py +++ b/vta/tutorials/vta_get_started.py @@ -115,7 +115,7 @@ # The last operation is a cast and copy back to DRAM, into results tensor # :code:`C`. # -# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/main/vta/tutorial/vadd_dataflow.png +# .. 
image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/vadd_dataflow.png
#    :align: center

######################################################################
From 1df70a1fe11fdf26204713cbf71e8c6f58730e1c Mon Sep 17 00:00:00 2001
From: Adelbert Chang
Date: Mon, 2 Nov 2020 15:35:09 -0800
Subject: [PATCH 105/258] [rust][tvm-graph-rt]: maintain error sources when
 propagating errors, swap Mutex for RwLock (#6815)

---
 rust/tvm-graph-rt/src/errors.rs        | 14 ++++----
 rust/tvm-graph-rt/src/graph.rs         | 49 ++++++++++++++++++--------
 rust/tvm-graph-rt/src/module/syslib.rs | 10 +++---
 3 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/rust/tvm-graph-rt/src/errors.rs b/rust/tvm-graph-rt/src/errors.rs
index 2ca97bdabb6b..c4bddb85b0de 100644
--- a/rust/tvm-graph-rt/src/errors.rs
+++ b/rust/tvm-graph-rt/src/errors.rs
@@ -22,14 +22,14 @@ use tvm_sys::DataType;

 #[derive(Debug, Error)]
 pub enum GraphFormatError {
-    #[error("Could not parse graph json")]
-    Parse(#[from] serde_json::Error),
-    #[error("Could not parse graph params")]
-    Params,
-    #[error("{0} is missing attr: {1}")]
+    #[error("Failed to parse graph with error: {0}")]
+    Parse(#[source] serde_json::Error),
+    #[error("Failed to parse graph parameters with error: {0:?}")]
+    Params(#[source] Option<nom::Err<(Vec<u8>, nom::error::ErrorKind)>>),
+    #[error("{0} is missing attribute: {1}")]
     MissingAttr(String, String),
-    #[error("Graph has invalid attr that can't be parsed: {0}")]
-    InvalidAttr(#[from] std::num::ParseIntError),
+    #[error("Failed to parse graph attribute '{0}' with error: {1}")]
+    InvalidAttr(String, #[source] std::num::ParseIntError),
     #[error("Missing field: {0}")]
     MissingField(&'static str),
     #[error("Invalid DLType: {0}")]
diff --git a/rust/tvm-graph-rt/src/graph.rs b/rust/tvm-graph-rt/src/graph.rs
index 87dd4a76d5e4..646a20daaf5b 100644
--- a/rust/tvm-graph-rt/src/graph.rs
+++ b/rust/tvm-graph-rt/src/graph.rs
@@ -26,7 +26,7 @@ use nom::{
     character::complete::{alpha1, digit1},
     complete, count, do_parse, length_count, map, named,
     number::complete::{le_i32, le_i64, le_u16, le_u32, le_u64, le_u8},
-    opt, tag, take, tuple,
+    opt, tag, take, tuple, Err as NomErr,
 };
 use serde::{Deserialize, Serialize};
 use serde_json;
@@ -121,10 +121,22 @@ impl Node {
             .attrs
             .as_ref()
             .ok_or_else(|| GraphFormatError::MissingAttr(self.name.clone(), "attrs".to_owned()))?;
+
+        let func_name = get_node_attr!(self.name, attrs, "func_name")?.to_owned();
+
+        let num_outputs = get_node_attr!(self.name, attrs, "num_outputs")?
+            .parse::<usize>()
+            .map_err(|error| GraphFormatError::InvalidAttr("num_outputs".to_string(), error))?;
+
+        let flatten_data = get_node_attr!(self.name, attrs, "flatten_data")?
+            .parse::<u8>()
+            .map(|val| val == 1)
+            .map_err(|error| GraphFormatError::InvalidAttr("flatten_data".to_string(), error))?;
+
         Ok(NodeAttrs {
-            func_name: get_node_attr!(self.name, attrs, "func_name")?.to_owned(),
-            num_outputs: get_node_attr!(self.name, attrs, "num_outputs")?.parse::<usize>()?,
-            flatten_data: get_node_attr!(self.name, attrs, "flatten_data")?.parse::<u8>()? == 1,
+            func_name,
+            num_outputs,
+            flatten_data,
         })
     }
 }
@@ -132,16 +144,14 @@ impl Node {
 impl<'a> TryFrom<&'a String> for Graph {
     type Error = GraphFormatError;
     fn try_from(graph_json: &String) -> Result<Self, Self::Error> {
-        let graph = serde_json::from_str(graph_json)?;
-        Ok(graph)
+        serde_json::from_str(graph_json).map_err(|error| GraphFormatError::Parse(error))
     }
 }

 impl<'a> TryFrom<&'a str> for Graph {
     type Error = GraphFormatError;
     fn try_from(graph_json: &'a str) -> Result<Self, Self::Error> {
-        let graph = serde_json::from_str(graph_json)?;
-        Ok(graph)
+        serde_json::from_str(graph_json).map_err(|error| GraphFormatError::Parse(error))
     }
 }

@@ -475,14 +485,23 @@ named! {

 /// Loads a param dict saved using `relay.save_param_dict`.
 pub fn load_param_dict(bytes: &[u8]) -> Result<HashMap<String, Tensor>, GraphFormatError> {
-    if let Ok((remaining_bytes, param_dict)) = parse_param_dict(bytes) {
-        if remaining_bytes.is_empty() {
-            Ok(param_dict)
-        } else {
-            Err(GraphFormatError::Params)
+    match parse_param_dict(bytes) {
+        Ok((remaining_bytes, param_dict)) => {
+            if remaining_bytes.is_empty() {
+                Ok(param_dict)
+            } else {
+                Err(GraphFormatError::Params(None))
+            }
         }
-    } else {
-        Err(GraphFormatError::Params)
+        Err(error) => Err(match error {
+            NomErr::Incomplete(error) => GraphFormatError::Params(Some(NomErr::Incomplete(error))),
+            NomErr::Error((remainder, error_kind)) => {
+                GraphFormatError::Params(Some(NomErr::Error((remainder.into(), error_kind))))
+            }
+            NomErr::Failure((remainder, error_kind)) => {
+                GraphFormatError::Params(Some(NomErr::Failure((remainder.into(), error_kind))))
+            }
+        }),
     }
 }
diff --git a/rust/tvm-graph-rt/src/module/syslib.rs b/rust/tvm-graph-rt/src/module/syslib.rs
index 0279e31be079..efc29a336620 100644
--- a/rust/tvm-graph-rt/src/module/syslib.rs
+++ b/rust/tvm-graph-rt/src/module/syslib.rs
@@ -18,7 +18,7 @@
  */

 use std::{
-    collections::HashMap, convert::AsRef, ffi::CStr, os::raw::c_char, string::String, sync::Mutex,
+    collections::HashMap, convert::AsRef, ffi::CStr, os::raw::c_char, string::String, sync::RwLock,
 };

 use lazy_static::lazy_static;
@@ -35,14 +35,14 @@ extern "C" {
 }

 lazy_static! {
-    static ref SYSTEM_LIB_FUNCTIONS: Mutex<HashMap<String, &'static (dyn PackedFunc)>> =
-        Mutex::new(HashMap::new());
+    static ref SYSTEM_LIB_FUNCTIONS: RwLock<HashMap<String, &'static (dyn PackedFunc)>> =
+        RwLock::new(HashMap::new());
 }

 impl Module for SystemLibModule {
     fn get_function<S: AsRef<str>>(&self, name: S) -> Option<&(dyn PackedFunc)> {
         SYSTEM_LIB_FUNCTIONS
-            .lock()
+            .read()
             .unwrap()
             .get(name.as_ref())
             .copied()
@@ -65,7 +65,7 @@ pub extern "C" fn TVMBackendRegisterSystemLibSymbol(
     func: BackendPackedCFunc,
 ) -> i32 {
     let name = unsafe { CStr::from_ptr(cname).to_str().unwrap() };
-    SYSTEM_LIB_FUNCTIONS.lock().unwrap().insert(
+    SYSTEM_LIB_FUNCTIONS.write().unwrap().insert(
         name.to_string(),
         &*Box::leak(super::wrap_backend_packed_func(name.to_string(), func)),
    );

From 1028daf1c25bfd3ec60394a8168fd2df2a4d659f Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini
Date: Tue, 3 Nov 2020 05:56:33 +0000
Subject: [PATCH 106/258] Improve AArch64 depthwise convolution through
 smlal/smlal2 intrinsic (#6711)

* Improve depthwise convolution through smlal/smlal2 intrinsic

- Added an intrinsic to load a single int16x8 vector and produce two
  int32x4 output vectors through smlal/smlal2 instructions
- Changed the NHWC depthwise schedule to accommodate the aforementioned
  intrinsic

Change-Id: I347c3bf98fa8dd87057304dcda0d78e558424c57

* Address review comments

* Rebasing - 2

* Rebasing - 3

* Rebasing - 3

* Fix linting
---
 python/tvm/topi/arm_cpu/depthwise_conv2d.py   | 65 ++++++++++++--
 python/tvm/topi/arm_cpu/tensor_intrin.py      | 90 +++++++++++++++++++
 .../topi/python/test_topi_depthwise_conv2d.py | 54 +++++++++++
 3 files changed, 200 insertions(+), 9 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index 3c32d3e1f3f2..441b0a5a3688 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -25,6 +25,8 @@
 from .. import nn
 from ..utils import traverse_inline, get_const_tuple, get_const_int
 from ..nn.utils import get_pad_tuple
+from .tensor_intrin import smlal_int16_int32
+from .arm_utils import is_aarch64_arm


 @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
@@ -222,7 +224,6 @@ def compute_depthwise_conv2d_nhwc(_, data, kernel, strides, padding, dilation, o
     output : tvm.te.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
-
     out_dtype = out_dtype or data.dtype

     N, IH, IW, IC = get_const_tuple(data.shape)
@@ -288,10 +289,18 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs):
     ##### space definition begin #####
     n, h, w, c = s[out].op.axis
+    # Split the number of input/output channels
     cfg.define_split("tile_c", c, num_outputs=2)
+    # Split the height of the convolution
     _, hi = cfg.define_split("tile_h", h, num_outputs=2)
+    # Split the width of the convolution
     _, wi = cfg.define_split("tile_w", w, num_outputs=2)
+    # Additional output (e.g., requantization, bias addition, etc.)
+    # 0: locate the output on the second last axis of the main computation
+    # 1: locate the output closest to the main computation
     cfg.define_knob("locate_output", [0, 1])
+    # Determine if we should unroll the computation of the inner tile
+    cfg.define_knob("unroll_tile", [True, False])

     # fallback support
     if cfg.is_fallback:
@@ -299,10 +308,15 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs):
         cfg["tile_h"] = SplitEntity([-1, 2])
         cfg["tile_w"] = SplitEntity([-1, 2])
         cfg["locate_output"] = OtherOptionEntity(1)
+        cfg["unroll_tile"] = OtherOptionEntity(True)
     ##### space definition end #####

     def schedule_conv(conv):
         conv_data = conv.op.input_tensors[0]
+        kernel_data = conv.op.input_tensors[1]
+        in_type = conv_data.dtype
+
+        _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape)

         n, w, h, c = conv.op.axis
         r_h, r_w = conv.op.reduce_axis
@@ -310,24 +324,53 @@ def schedule_conv(conv):
         ho, hi = cfg["tile_h"].apply(s, conv, h)
         wo, wi = cfg["tile_w"].apply(s, conv, w)
         co, ci = cfg["tile_c"].apply(s, conv, c)

+        split_val = cfg["tile_c"].size[-1]
+        use_tensorization = (
+            (in_type == "int16")
+            and (split_val == 8)
+            and (IC % split_val == 0)
+            and (channel_multiplier == 1)
+            and is_aarch64_arm()
+        )
+
+        data_pad_value = -1
         if conv_data.name == "data_pad":
             assert isinstance(conv_data.op, tvm.te.ComputeOp)
-            # Define a policy for padding computation
-            cfg.define_knob("data_pad_inline", [1, 2, 3])
+            # Define a strategy for padding computation
+            cfg.define_knob("data_pad_strategy", [1, 2, 3])
             if cfg.is_fallback:
-                cfg["data_pad_inline"] = OtherOptionEntity(3)
-            if cfg["data_pad_inline"].val == 1:
+                # We cannot inline padding when tensorizing.
+                # So, if we can tensorize, let's compute_at the closest axis
+                cfg["data_pad_strategy"] = (
+                    OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3)
+                )
+            # Compute padding on the third to last axis of the computation
+            if cfg["data_pad_strategy"].val == 1:
                 s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                 s[conv_data].compute_at(s[conv], ho)
-            if cfg["data_pad_inline"].val == 2:
+            # Compute padding on the second to last axis of the computation
+            if cfg["data_pad_strategy"].val == 2:
                 s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                 s[conv_data].compute_at(s[conv], wo)
-            if cfg["data_pad_inline"].val == 3:
+            # Inline padding during computation
+            if cfg["data_pad_strategy"].val == 3:
                 s[conv_data].compute_inline()
+            data_pad_value = cfg["data_pad_strategy"].val
+
+        if use_tensorization and data_pad_value != 3:
+            smlal = smlal_int16_int32()
+            s[conv].tensorize(ci, smlal)
+        else:
+            s[conv].vectorize(ci)
+
+        if cfg["unroll_tile"].val:
+            s[conv].unroll(r_h)
+            s[conv].unroll(r_w)
+            s[conv].unroll(wi)
+            s[conv].unroll(hi)

         s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
         fused_n_ho = s[conv].fuse(n, ho)
-        s[conv].vectorize(ci)
         return fused_n_ho

     def schedule_conv_out(out):
@@ -335,13 +378,17 @@ def schedule_conv(conv):
         co, ci = cfg["tile_c"].apply(s, out, c)
         wo, wi = cfg["tile_w"].apply(s, out, w)
         ho, hi = cfg["tile_h"].apply(s, out, h)
-        s[out].reorder(n, ho, wo, co, hi, wi)
+        s[out].reorder(n, ho, wo, co, hi, wi, ci)
+        if cfg["unroll_tile"].val:
+            s[out].unroll(wi)
+            s[out].unroll(hi)

         if out.dtype in ["int8", "uint8"]:
             # In case of quantized convolution further split the channel in batches of 4 elements
             # so that we can use arm intrinsics to run fixed_point_multiplication
             ci_outer, ci_inner = s[out].split(ci, 4)
             s[out].vectorize(ci_inner)
+            s[out].unroll(ci_outer)

         fused_n_ho = s[out].fuse(n, ho)
         return hi, wi, fused_n_ho
diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py
b/python/tvm/topi/arm_cpu/tensor_intrin.py index 196f788e6b8c..1b999dfe4e80 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -879,6 +879,96 @@ def _instr(index): ) +def smlal_int16_int32(): + """ + Intrinsic to be used in order to load two int16x8 vectors and multiply + them together through a pair of smlal/smlal2 instructions. The pseudo-code + for the algorithm is as follows: + + vec_a = vload(A, "int16x8") + vec_b = vload(B, "int16x8") + + vec_c[0:4] += vec_a[0:4]*vec_b[0:4] // -> smlal instruction + vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction + + So we load a single int16x8 vector and we accumulate its lower (0:4) and + higher part separately. + """ + int16_lanes = 8 + A = te.placeholder((int16_lanes,), dtype="int16", name="A") + B = te.placeholder((int16_lanes, 1), dtype="int16", name="B") + C = te.compute( + (int16_lanes,), + lambda i: A[i].astype("int32") * B[i, 0].astype("int32"), + name="C", + ) + + a_buffer = tvm.tir.decl_buffer( + A.shape, dtype="int16", name="a_buffer", offset_factor=1, strides=[1] + ) + b_buffer = tvm.tir.decl_buffer( + B.shape, + dtype="int16", + name="b_buffer", + offset_factor=1, + strides=[te.var("sb"), 1], + ) + c_buffer = tvm.tir.decl_buffer( + C.shape, + dtype="int32", + name="c_buffer", + offset_factor=1, + strides=[1], + ) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x8"))) + return ib.get() + + vec_a = ins[0].vload([0], "int16x8") + vec_b = ins[1].vload([0, 0], "int16x8") + inst = "llvm.aarch64.neon.smull" + + # Higher part of the vector + vec_c_h = outs[0].vload([4], "int32x4") + vec_a_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_a) + vec_b_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_b) + vmull_h = tvm.tir.call_llvm_pure_intrin( + "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_h, vec_b_h + ) + vec_out_h = vec_c_h + vmull_h + + # Lower part of the vector + vec_c_l = outs[0].vload([0], "int32x4") + vec_a_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_a) + vec_b_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_b) + vmull_l = tvm.tir.call_llvm_pure_intrin( + "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_l, vec_b_l + ) + vec_out_l = vec_c_l + vmull_l + + # Combine higher and lower part in a single int32x8 vector to store + # (this will require two different store instructions, since the + # length of a NEON vector is fixed at 128 + vec_out = tvm.tir.call_intrin("int32x8", "tir.vectorcombine", vec_out_l, vec_out_h) + ib.emit(outs[0].vstore(0, vec_out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, + _intrin_func, + binds={A: a_buffer, B: b_buffer, C: c_buffer}, + default_buffer_params=buffer_params, + ) + + def _q_multiply_shift_arm(op): """ Implementation of q_multiply_shift_arm through arm intrinsics diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py index d5fbef98593c..55d2fe0c4e52 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py @@ -56,6 +56,55 @@ } +def compile_depthwise_NHWC_int8_arm( + batch, + in_channel, + in_size, + kernel, + depth_multiplier, + stride, + padding, + add_bias=False, + dilation=1, +): + pad_top, pad_left, pad_bottom, pad_right = 
get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + + in_height = in_width = in_size + A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int16") + W = te.placeholder((kernel, kernel, in_channel, depth_multiplier), name="W", dtype="int16") + bias = te.placeholder((in_channel * depth_multiplier,), name="bias", dtype="int32") + dtype = "int32" + + device = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu" + compute = topi.arm_cpu.compute_depthwise_conv2d_nhwc + schedule = topi.arm_cpu.schedule_depthwise_conv2d_nhwc + + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + + print("Compiling on arm AArch64 target: %s" % device) + with tvm.target.Target(device): + assert topi.arm_cpu.arm_utils.is_aarch64_arm(), "AArch64 target not recognized" + + C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) + if add_bias: + C += bias + ins_outs = [A, W, bias, C] + else: + ins_outs = [A, W, C] + + s = schedule([C]) + + func = tvm.build( + s, + ins_outs, + device, + name="depthwise_conv2d", + ) + + def depthwise_conv2d_with_workload_nchw( batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1 ): @@ -478,6 +527,7 @@ def test_depthwise_conv2d(): depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "SAME") depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "VALID") depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "VALID") + # dilation = 2 # disabled because it uses too large shared memory on cuda # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2) @@ -487,6 +537,10 @@ def test_depthwise_conv2d(): depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME") depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID") + # Test compilation on arm devices + compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 3, 1, "SAME") + compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 1, 1, "SAME", True) + if __name__ == "__main__": test_depthwise_conv2d() From a20bbd8a3cee8f1f3166c17b73b61f465a592824 Mon Sep 17 00:00:00 2001 From: masahi Date: Tue, 3 Nov 2020 21:27:10 +0900 Subject: [PATCH 107/258] [CI] Torch 1.7 update to mainline (#6828) --- Jenkinsfile | 2 +- docker/install/ubuntu_install_onnx.sh | 2 +- python/tvm/relay/frontend/pytorch.py | 60 +++++++++++-------- tests/python/frontend/pytorch/qnn_test.py | 3 +- tests/python/frontend/pytorch/test_forward.py | 4 +- .../test_auto_scheduler_layout_rewrite.py | 5 +- .../deploy_object_detection_pytorch.py | 6 +- tutorials/frontend/from_pytorch.py | 6 +- 8 files changed, 50 insertions(+), 38 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index c8c2fe342dfa..59d08b5c7ce7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> ci_lint = "tlcpack/ci-lint:v0.62" -ci_gpu = "tlcpack/ci-gpu:v0.71" +ci_gpu = "tlcpack/ci-gpu:v0.72" ci_cpu = "tlcpack/ci-cpu:v0.71" ci_wasm = "tlcpack/ci-wasm:v0.70" ci_i386 = "tlcpack/ci-i386:v0.71" diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh index 2ad601983fa2..a92a0244d707 100755 --- a/docker/install/ubuntu_install_onnx.sh +++ b/docker/install/ubuntu_install_onnx.sh @@ -28,4 +28,4 @@ pip3 install onnxruntime==1.0.0 # not expose that in the wheel!!! 
pip3 install future -pip3 install torch==1.4.0 torchvision==0.5.0 +pip3 install torch==1.7.0 torchvision==0.8.1 diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index d8c0769e24ea..2fd207883dad 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -21,6 +21,7 @@ import itertools import logging import sys +import math import numpy as np @@ -168,7 +169,6 @@ def _min(): def _unary(name): def _impl(inputs, input_types): - input_type = input_types[0] # this is just to ensure tensor input (data,) = _pytorch_promote_types(inputs[:1], input_types[:1]) return get_relay_op(name)(data) @@ -1552,7 +1552,7 @@ def _impl(inputs, input_types): axis = None keepdims = False if len(inputs) > 2: - axis = inputs[1] + axis = inputs[1] if len(inputs[1]) > 0 else None keepdims = bool(inputs[2]) return _op.sqrt(_op.reduce.sum((data * data), axis=axis, keepdims=keepdims)) @@ -1847,18 +1847,33 @@ def _impl(inputs, input_types): return _impl -def _upsample(method, prelude): - def _impl(inputs, input_types): - out_size = [] +def _get_upsample_out_size(inputs, method): + # This assumes a static shape + out_size = [] + if inputs[1] is not None: for size in inputs[1]: if not isinstance(size, int): out_size.append(int(_infer_value(size, {}).asnumpy())) else: out_size.append(size) + else: + scale_index = 3 if method in ["bilinear", "trilinear"] else 2 + scales = inputs[scale_index] + assert scales is not None, "neither out size nor scale provided" + assert isinstance(scales, list) + ishape = _infer_shape(inputs[0]) + for i, scale in enumerate(scales): + out_size.append(int(math.floor(float(ishape[2 + i]) * scale))) + + return out_size + +def _upsample(method, prelude): + def _impl(inputs, input_types): data = inputs[0] + out_size = _get_upsample_out_size(inputs, method) - if len(inputs) > 2: + if len(inputs) > 2 and method == "bilinear": align_corners = inputs[2] else: align_corners = False @@ -1874,17 +1889,13 @@ def func(x): return _op.image.resize(x, out_size, "NCHW", method, coord_trans) if _is_quantized_tensor(data, prelude): - # Torch version > 1.4 changed upsampling API - if is_version_greater_than("1.4.0"): - num_inputs = 7 - else: - num_inputs = 5 - - assert len(inputs) == num_inputs, "Input quant param not found in op inputs" - + # input qparams are manually appended by us + assert isinstance(inputs[-2], float) + assert isinstance(inputs[-1], int) input_scale = _expr.const(inputs[-2]) input_zero_point = _expr.const(inputs[-1]) return qnn_torch.quantized_upsample(data, input_scale, input_zero_point, func) + return func(data) return _impl @@ -1892,17 +1903,10 @@ def func(x): def _upsample3d(method): def _impl(inputs, input_types): - if isinstance(inputs[1], _expr.Var): - out_size = _infer_shape(inputs[1]) - elif _is_int_seq(inputs[1]): - out_size = inputs[1] - elif isinstance(inputs[1], list): - infer_res = [_infer_value(size, {}) for size in inputs[1]] - out_size = [np.asscalar(res.asnumpy().astype(np.int)) for res in infer_res] - data = inputs[0] + out_size = _get_upsample_out_size(inputs, method) - if len(inputs) > 2: + if len(inputs) > 2 and method == "trilinear": align_corners = inputs[2] else: align_corners = False @@ -1983,8 +1987,7 @@ def _impl(inputs, input_types): def _logical_not(): def _impl(inputs, input_types): - data = inputs[0] - + data = _wrap_const(inputs[0]) return _op.logical_not(_op.cast(data, "bool")) return _impl @@ -2732,6 +2735,7 @@ def _get_convert_map(prelude, default_dtype): "aten::empty": _empty(), 
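A quick numeric illustration of the scale-factor branch of `_get_upsample_out_size` above, assuming a static NCHW/NCDHW input shape (the standalone helper name is hypothetical):

```python
import math

# Spatial dimensions start at index 2; each output extent is
# floor(input_extent * scale), exactly as in the frontend code above.
def upsample_out_size(ishape, scales):
    return [int(math.floor(float(ishape[2 + i]) * s)) for i, s in enumerate(scales)]

assert upsample_out_size((1, 3, 10, 10), [2.0, 2.0]) == [20, 20]
assert upsample_out_size((1, 3, 4, 6, 6), [1.5, 2.0, 2.0]) == [6, 12, 12]
```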
"aten::bincount": _bincount(), "aten::scatter_add": _scatter_add(), + "aten::__not__": _logical_not(), } return convert_map @@ -2798,6 +2802,7 @@ def _report_missing_conversion(op_names, convert_map): "prim::ListUnpack", "prim::TupleConstruct", "prim::TupleUnpack", + "prim::RaiseException", "prim::If", "prim::Loop", ] @@ -2903,6 +2908,8 @@ def _get_operator_nodes(nodes): ops = [] # Traverse nodes and add to graph for node in nodes: + if node.outputsSize() == 0: + continue if node.outputsSize() > 1: node_name = "_".join(_get_output_names(node)) else: @@ -3286,6 +3293,9 @@ def convert_operators(operators, outputs, ret_names, convert_map, prelude, defau else: unpacked = _unpack_tuple(inputs[0]) outputs.update(zip(_get_output_names(op_node), unpacked)) + elif operator == "prim::prim::RaiseException": + logging.warning("raising exceptions is ignored") + outputs[node_name] = None elif operator == "prim::If": if_out = convert_if(op_node, outputs, convert_map, prelude, default_dtype=default_dtype) outputs[node_name] = if_out diff --git a/tests/python/frontend/pytorch/qnn_test.py b/tests/python/frontend/pytorch/qnn_test.py index 1851e31e817f..9781eb5d57c4 100644 --- a/tests/python/frontend/pytorch/qnn_test.py +++ b/tests/python/frontend/pytorch/qnn_test.py @@ -367,7 +367,8 @@ def get_imagenet_input(): # disable inception test for now, since loading it takes ~5min on torchvision-0.5 due to scipy bug # See https://discuss.pytorch.org/t/torchvisions-inception-v3-takes-much-longer-to-load-than-other-models/68756 # ("inception_v3", qinception.inception_v3(pretrained=True), per_channel), - ("googlenet", qgooglenet(pretrained=True), per_channel), + # tracing quantized googlenet broken as of v1.6 + # ("googlenet", qgooglenet(pretrained=True), per_channel), ] results = [] diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index e997ebe07a50..4dec5f7e5916 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -2535,7 +2535,7 @@ def test_forward_linspace(): class Linspace1(Module): def forward(self, *args): - return torch.linspace(5, 10) + return torch.linspace(5, 10, steps=100) class Linspace2(Module): def forward(self, *args): @@ -2559,7 +2559,7 @@ def forward(self, *args): class Linspace7(Module): def forward(self, *args): - return torch.linspace(1, 4, dtype=torch.float32) + return torch.linspace(1, 4, steps=100, dtype=torch.float32) class Linspace8(Module): def forward(self, *args): diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py index 4a11d0fb0ca0..e6f9a76fce62 100644 --- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py +++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py @@ -166,5 +166,6 @@ def test_correctness_layout_rewrite_insert_transform_stage(): if __name__ == "__main__": test_apply_steps_with_layout_rewrite() - test_correctness_layout_rewrite_rewrite_for_preTransformed() - test_correctness_layout_rewrite_insert_transform_stage() + # Disable for now due to being flaky on i386 + # test_correctness_layout_rewrite_rewrite_for_preTransformed() + # test_correctness_layout_rewrite_insert_transform_stage() diff --git a/tutorials/frontend/deploy_object_detection_pytorch.py b/tutorials/frontend/deploy_object_detection_pytorch.py index 6408685febfb..2852dd3ad99d 100644 --- a/tutorials/frontend/deploy_object_detection_pytorch.py +++ 
b/tutorials/frontend/deploy_object_detection_pytorch.py @@ -27,8 +27,8 @@ .. code-block:: bash - pip install torch==1.4.0 - pip install torchvision==0.5.0 + pip install torch==1.7.0 + pip install torchvision==0.8.1 or please refer to official site https://pytorch.org/get-started/locally/ @@ -36,7 +36,7 @@ PyTorch versions should be backwards compatible but should be used with the proper TorchVision version. -Currently, TVM supports PyTorch 1.4 and 1.3. Other versions may +Currently, TVM supports PyTorch 1.7 and 1.4. Other versions may be unstable. """ diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py index 33a05884f61d..b5bcdf6792f9 100644 --- a/tutorials/frontend/from_pytorch.py +++ b/tutorials/frontend/from_pytorch.py @@ -28,8 +28,8 @@ .. code-block:: bash - pip install torch==1.4.0 - pip install torchvision==0.5.0 + pip install torch==1.7.0 + pip install torchvision==0.8.1 or please refer to official site https://pytorch.org/get-started/locally/ @@ -37,7 +37,7 @@ PyTorch versions should be backwards compatible but should be used with the proper TorchVision version. -Currently, TVM supports PyTorch 1.4 and 1.3. Other versions may +Currently, TVM supports PyTorch 1.7 and 1.4. Other versions may be unstable. """ From e46a60950f67cb25d1de1500fe138f22e3f8f38a Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Wed, 4 Nov 2020 00:25:24 +0800 Subject: [PATCH 108/258] [TF] Fix a bug in _stridedSlice() (#6829) When stride < 0, the slicing range for whole demension should be [-1, -(dim+1)) --- python/tvm/relay/frontend/tensorflow.py | 8 ++++++-- tests/python/frontend/tensorflow/test_forward.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 2c7adf03bad8..a6fd1db7e7b5 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1616,11 +1616,15 @@ def _transform_mask(stride_dim, ellipsis_mask): if final_index == len(m_begin): break if mask & begin_mask: - m_begin[final_index] = data_shape[final_index] if stride[index] < 0 else 0 + m_begin[final_index] = -1 if stride[index] < 0 else 0 elif begin[index]: m_begin[final_index] = begin[index] if mask & end_mask: - m_end[final_index] = 0 if stride[index] < 0 else data_shape[final_index] + m_end[final_index] = ( + -(data_shape[final_index] + 1) + if stride[index] < 0 + else data_shape[final_index] + ) elif end[index]: m_end[final_index] = end[index] m_stride[final_index] = stride[index] diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 6697cfd0d36f..90e639e36809 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -1895,6 +1895,16 @@ def test_forward_stridedslice(): begin_mask=5, end_mask=8, ) + _test_stridedslice( + (1, 13, 13, 3, 2), + [0, 0], + [1, 1], + [1, -1], + "float32", + ellipsis_mask=1, + begin_mask=2, + end_mask=2, + ) ####################################################################### From b4e4cdf41c574077b900e9a7202b391b054d5e97 Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Tue, 3 Nov 2020 09:31:38 -0800 Subject: [PATCH 109/258] [CI] remove unused environment var (#6824) --- cmake/config.cmake | 4 ---- docker/README.md | 2 +- tests/scripts/task_config_build_cpu.sh | 1 - tests/scripts/task_config_build_gpu.sh | 1 - tests/scripts/task_config_build_gpu_vulkan.sh | 1 - 
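To make the stridedSlice fix in patch 108 above concrete: with a negative stride, covering a whole dimension requires begin = -1 and end = -(dim + 1), since end = 0 would stop one element short of index 0. A small sketch in plain Python index arithmetic (not the TensorFlow frontend itself):

```python
# Range chosen by the fixed _transform_mask code for a masked dimension:
def full_dim_range(dim, stride):
    if stride < 0:
        return -1, -(dim + 1)  # e.g. dim=5 -> begin=-1, end=-6
    return 0, dim

begin, end = full_dim_range(5, -1)
visited = [i % 5 for i in range(begin, end, -1)]  # normalize negative indices
assert visited == [4, 3, 2, 1, 0]  # every element, in reverse order
```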
tests/scripts/task_config_build_i386.sh | 1 - tests/scripts/task_config_build_qemu.sh | 1 - tests/scripts/task_config_build_wasm.sh | 1 - 8 files changed, 1 insertion(+), 11 deletions(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index b220f3b0b9f0..36eeac729969 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -270,7 +270,3 @@ set(USE_HEXAGON_SDK /path/to/sdk) # Whether to use ONNX codegen set(USE_TARGET_ONNX OFF) - -# Whether to compile the standalone C runtime. -set(USE_STANDALONE_CRT ON) - diff --git a/docker/README.md b/docker/README.md index dffaf3a5ba4f..ae972f954668 100644 --- a/docker/README.md +++ b/docker/README.md @@ -52,7 +52,7 @@ Then inside the docker container, you can type the following command to start th jupyter notebook ``` -You can find some un-official prebuilt images in https://hub.docker.com/r/tvmai/ . +You can find some un-official prebuilt images in https://hub.docker.com/r/tlcpack/ . Note that these are convenience images and are not part of the ASF release. diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index 521ab9b8ccdc..6fc64966c0ab 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -26,7 +26,6 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_STANDALONE_CRT ON\) >> config.cmake echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_VM_PROFILER ON\) >> config.cmake echo set\(USE_DNNL_CODEGEN ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index 0072fb59cf11..155bac80533f 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -29,7 +29,6 @@ echo set\(USE_CUDA ON\) >> config.cmake echo set\(USE_OPENGL ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_STANDALONE_CRT ON\) >> config.cmake echo set\(USE_LLVM llvm-config-9\) >> config.cmake echo set\(USE_NNPACK ON\) >> config.cmake echo set\(NNPACK_PATH /NNPACK/build/\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu_vulkan.sh b/tests/scripts/task_config_build_gpu_vulkan.sh index a5742e22110a..74096b1a9760 100755 --- a/tests/scripts/task_config_build_gpu_vulkan.sh +++ b/tests/scripts/task_config_build_gpu_vulkan.sh @@ -27,7 +27,6 @@ echo set\(USE_OPENCL ON\) >> config.cmake echo set\(USE_ROCM ON\) >> config.cmake echo set\(USE_VULKAN ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake -echo set\(USE_STANDALONE_CRT ON\) >> config.cmake echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_VM_PROFILER ON\) >> config.cmake echo set\(CMAKE_CXX_COMPILER clang-7\) >> config.cmake diff --git a/tests/scripts/task_config_build_i386.sh b/tests/scripts/task_config_build_i386.sh index d773985277aa..8ed5f94e30dc 100755 --- a/tests/scripts/task_config_build_i386.sh +++ b/tests/scripts/task_config_build_i386.sh @@ -28,7 +28,6 @@ echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_STANDALONE_CRT ON\) >> config.cmake echo set\(USE_VM_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-4.0\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake diff --git 
a/tests/scripts/task_config_build_qemu.sh b/tests/scripts/task_config_build_qemu.sh index ebabdcab65b7..086ca8034dc9 100755 --- a/tests/scripts/task_config_build_qemu.sh +++ b/tests/scripts/task_config_build_qemu.sh @@ -25,7 +25,6 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake -echo set\(USE_STANDALONE_CRT ON\) >> config.cmake echo set\(USE_LLVM llvm-config-10\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
diff --git a/tests/scripts/task_config_build_wasm.sh b/tests/scripts/task_config_build_wasm.sh index f3157bd54df0..c37a119b0590 100755 --- a/tests/scripts/task_config_build_wasm.sh +++ b/tests/scripts/task_config_build_wasm.sh @@ -26,7 +26,6 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_STANDALONE_CRT ON\) >> config.cmake echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_VM_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-11\) >> config.cmake
From 0f99c6714c23c93ed24358b6f286a924de61e82a Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 3 Nov 2020 18:10:32 +0000 Subject: [PATCH 110/258] [TVMC] 'tvmc tune' --rpc-tracker and --rpc-key fail due to argparse misconfiguration (#6822) Fix an error with `tvmc tune` that causes --rpc-tracker and --rpc-key to be identified as a list of strings, rather than the expected string type. Removing the unnecessary nargs solves the issue. This is a follow-up of https://github.com/apache/incubator-tvm/pull/6762 --- python/tvm/driver/tvmc/autotuner.py | 2 -- 1 file changed, 2 deletions(-)
diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index 90efa50965da..53c8f3bdc43d 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -92,12 +92,10 @@ def add_tune_parser(subparsers): ) parser.add_argument( "--rpc-key", - nargs=1, help="the RPC tracker key of the target device. Required when --rpc-tracker is provided.", ) parser.add_argument( "--rpc-tracker", - nargs=1, help="hostname (required) and port (optional, defaults to 9090) of the RPC tracker, " "e.g.
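The root cause in patch 110 is generic argparse behavior rather than anything TVM-specific: `nargs=1` stores a one-element list, while omitting `nargs` stores the bare string. A minimal sketch using the argument name from the patch:

```python
import argparse

with_nargs = argparse.ArgumentParser()
with_nargs.add_argument("--rpc-key", nargs=1)
assert with_nargs.parse_args(["--rpc-key", "dev-board"]).rpc_key == ["dev-board"]

without_nargs = argparse.ArgumentParser()
without_nargs.add_argument("--rpc-key")
assert without_nargs.parse_args(["--rpc-key", "dev-board"]).rpc_key == "dev-board"
```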
'192.168.0.100:9999'", ) From 3f59a622ec6eede217719b63d0d3e3e1b0198be9 Mon Sep 17 00:00:00 2001 From: Ritwik Das Date: Tue, 3 Nov 2020 10:19:16 -0800 Subject: [PATCH 111/258] Fix Annotate Target to support freevars(relay.zeros, relay.ones etc) of any size (including zero) (#6826) * Fix Annotate Target * Add Test Cases * Formatting * Comments C++ * Remove Unnecesssary test cases * typo * annotate_target Co-authored-by: Ubuntu --- src/relay/transforms/annotate_target.cc | 14 ++- .../python/relay/test_pass_annotate_target.py | 87 +++++++++++++++++++ 2 files changed, 98 insertions(+), 3 deletions(-) diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc index 7a083304515b..9d160b26f1ad 100644 --- a/src/relay/transforms/annotate_target.cc +++ b/src/relay/transforms/annotate_target.cc @@ -77,7 +77,13 @@ class AnnotateTargetRewriter : public ExprRewriter { compiler_ends.push_back(call->args[0]); } else if (op_expr_to_target_.find(arg) != op_expr_to_target_.end()) { arg_target = op_expr_to_target_[arg]; - compiler_ends.push_back(InsertAnnotation(arg, arg_target, make_end_op)); + // If an argument is a call node and has no argument, then it should be tensor ops such as + // zeros, so we treat it as input vars. + if (call && call->args.size() == 0) { + compiler_ends.push_back(arg); + } else { + compiler_ends.push_back(InsertAnnotation(arg, arg_target, make_end_op)); + } } else { // Input vars. compiler_ends.push_back(arg); @@ -113,14 +119,16 @@ class AnnotateTargetRewriter : public ExprRewriter { * \brief This function inserts compiler end to expr and maps the corresponding target to the * new expression. * - * This function checks for expr existence within the map and inserts the annotation + * This function checks for expr existence within the map and inserts the annotation. + * If the expression has a free variable (e.g: relay.zeros, relay.ones) we do not insert + * compiler end, since there are no compiler begins for it. * Further, it propagates the target to the new expression and returns it * * \param expr A relay expression * \return An annotated and target-propagated relay expression. */ Expr new_expr = expr; - if (op_expr_to_target_.find(expr) != op_expr_to_target_.end()) { + if (op_expr_to_target_.find(expr) != op_expr_to_target_.end() && FreeVars(expr).size() != 0) { new_expr = InsertAnnotation(expr, op_expr_to_target_[expr], make_end_op); op_expr_to_target_[new_expr] = op_expr_to_target_[expr]; } diff --git a/tests/python/relay/test_pass_annotate_target.py b/tests/python/relay/test_pass_annotate_target.py index b99e3bc02ba4..106909e16fa7 100644 --- a/tests/python/relay/test_pass_annotate_target.py +++ b/tests/python/relay/test_pass_annotate_target.py @@ -510,6 +510,91 @@ def after(): assert tvm.ir.structural_equal(expected, result) +def test_if_free_vars(): + target = "test_if_free_vars" + + @tvm.ir.register_op_attr("equal", "target." + target) + def equal(attrs, args): # pylint: disable=unused-variable + return True + + @tvm.ir.register_op_attr("sigmoid", "target." + target) + def sigmoid(attrs, args): # pylint: disable=unused-variable + return True + + @tvm.ir.register_op_attr("erf", "target." 
+ target) + def erf(attrs, args): # pylint: disable=unused-variable + return True + + """Test that If-else nodes compiles correctly when surrounded by free variables""" + + def before(): + data = relay.var("data", shape=(1, 32)) + eq1 = relay.var("e1", shape=[], dtype="float32") + eq2 = relay.var("e2", shape=[], dtype="float32") + eq = relay.equal(eq1, eq2) + + true_branch = relay.zeros(shape=(1, 32), dtype="float32") + false_branch = relay.sigmoid(data) + ife = relay.If(eq, true_branch, false_branch) + out = relay.erf(ife) + + func = relay.Function([data, eq1, eq2], out) + mod = tvm.IRModule.from_expr(func) + + return mod + + def after(): + data = relay.var("data", shape=(1, 32)) + eq1 = relay.var("e1", shape=[], dtype="float32") + eq2 = relay.var("e2", shape=[], dtype="float32") + + cb_1 = relay.annotation.compiler_begin(eq1, target) + cb_2 = relay.annotation.compiler_begin(eq2, target) + + equality_condition = relay.equal(cb_1, cb_2) + ce_1 = relay.annotation.compiler_end(equality_condition, target) + + # if condition + true_branch = relay.zeros(shape=(1, 32), dtype="float32") + + # else condition + cb_3 = relay.annotation.compiler_begin(data, target) + false_branch = relay.sigmoid(cb_3) + ce_2 = relay.annotation.compiler_end(false_branch, target) + + if_condition = relay.If(ce_1, true_branch, ce_2) + cb_4 = relay.annotation.compiler_begin(if_condition, target) + erf_out = relay.erf(cb_4) + ce_3 = relay.annotation.compiler_end(erf_out, target) + func = relay.Function([data, eq1, eq2], ce_3) + mod = tvm.IRModule.from_expr(func) + return mod + + result = transform.AnnotateTarget(target)(before()) + expected = transform.InferType()(after()) + assert tvm.ir.structural_equal(expected, result) + + +def test_free_vars_zeros(): + target = "test_free_vars_zeros" + + """Test that free variables compile correctly on their own""" + + def before(): + func = relay.Function([], relay.zeros(shape=(0), dtype="float32")) + mod = tvm.IRModule.from_expr(func) + return mod + + def after(): + func = relay.Function([], relay.zeros(shape=(0), dtype="float32")) + mod = tvm.IRModule.from_expr(func) + return mod + + result = transform.AnnotateTarget(target)(before()) + expected = transform.InferType()(after()) + assert tvm.ir.structural_equal(expected, result) + + if __name__ == "__main__": test_extern_dnnl() test_composite_function() @@ -520,3 +605,5 @@ def after(): test_multiple_runs() test_if_else() test_while_let() + test_if_free_vars() + test_free_vars_zeros() From c69392164d5a0555a59fbbd8f15750b88778c810 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 3 Nov 2020 18:19:07 -0500 Subject: [PATCH 112/258] [DOCS] Enable theme with header and footer. (#6834) Also fixed a sphinx warning in pytorch. --- docs/README.txt | 2 +- docs/conf.py | 56 ++++++++++++++++++++++++++- python/tvm/relay/frontend/pytorch.py | 12 +++--- tests/scripts/task_python_docs.sh | 4 ++ tests/scripts/task_sphinx_precheck.sh | 4 ++ 5 files changed, 70 insertions(+), 8 deletions(-) diff --git a/docs/README.txt b/docs/README.txt index 09c8e9b7e557..eeec6d972d68 100644 --- a/docs/README.txt +++ b/docs/README.txt @@ -3,7 +3,7 @@ TVM Documentations This folder contains the source of TVM documents - A hosted version of doc is at https://tvm.apache.org/docs -- pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark "Pillow<7" autodocsumm +- pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark "Pillow<7" autodocsumm tlcpack-sphinx-addon - Build tvm first in the root folder. 
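Looking back at the AnnotateTarget fix in patch 111 for a moment: zero-argument calls such as relay.zeros behave like inputs because there is no upstream producer to wrap with a compiler_end. A minimal way to observe the zero-argument shape of such calls, assuming a TVM build of this vintage:

```python
from tvm import relay

# relay.zeros with a constant shape is a CallNode with an empty args list:
# the shape and dtype live entirely in the call attributes.
z = relay.zeros(shape=(1, 32), dtype="float32")
print(type(z).__name__, len(z.args))  # expected: Call 0
```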
- Run the following command ```bash diff --git a/docs/conf.py b/docs/conf.py index 259d9c3fa0e2..9ed10d710df2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -234,6 +234,61 @@ "tvm.relay": ["tvm.ir", "tvm.tir"], } +## Setup header and other configs +import tlcpack_sphinx_addon + +footer_copyright = "© 2020 Apache Software Foundation | All right reserved" +footer_note = " ".join( + """ +Apache TVM is an effort undergoing incubation at The Apache Software Foundation (ASF), +sponsored by the Apache Incubator. Incubation is required of all newly accepted projects +until a further review indicates that the infrastructure, communications, and decision making +process have stabilized in a manner consistent with other successful ASF projects. While +incubation status is not necessarily a reflection of the completeness or stability of the code, +it does indicate that the project has yet to be fully endorsed by the ASF. +Copyright © 2020 The Apache Software Foundation. Apache TVM, Apache, the Apache feather, +and the Apache TVM project logo are either trademarks or registered trademarks of +the Apache Software Foundation.""".split( + "\n" + ) +).strip() + +header_logo = "https://tvm.apache.org/assets/images/logo.svg" + +header_links = [ + ("Community", "https://tvm.apache.org/community"), + ("Download", "https://tvm.apache.org/download"), + ("VTA", "https://tvm.apache.org/vta"), + ("Blog", "https://tvm.apache.org/blog"), + ("Docs", "https://tvm.apache.org/docs"), + ("Conference", "https://tvmconf.org"), + ("Github", "https://github.com/apache/incubator-tvm/"), +] + +header_dropdown = { + "name": "ASF", + "items": [ + ("Apache Homepage", "https://apache.org/"), + ("License", "https://www.apache.org/licenses/"), + ("Sponsorship", "https://www.apache.org/foundation/sponsorship.html"), + ("Security", "https://www.apache.org/security/"), + ("Thanks", "https://www.apache.org/foundation/thanks.html"), + ("Events", "https://www.apache.org/events/current-event"), + ], +} + +html_context = { + "footer_copyright": footer_copyright, + "footer_note": footer_note, + "header_links": header_links, + "header_dropdown": header_dropdown, + "header_logo": header_logo, +} + +# add additional overrides +templates_path += [tlcpack_sphinx_addon.get_templates_path()] +html_static_path += [tlcpack_sphinx_addon.get_static_path()] + def update_alias_docstring(name, obj, lines): """Update the docstring of alias functions. @@ -282,4 +337,3 @@ def process_docstring(app, what, name, obj, options, lines): def setup(app): app.connect("autodoc-process-docstring", process_docstring) - app.add_css_file("css/tvm_theme.css") diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 2fd207883dad..38478e27ff92 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -3344,17 +3344,17 @@ def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dt TorchScripted PyTorch graph Note: We currently only support traces (ie: torch.jit.trace(model, input)) - input_infos: List of tuples of (input name, input shape) - or (input name, (input shape, input types)) + input_infos : List of tuples + Can be (input name, input shape) or (input name, (input shape, input types)) Graph level input shape and type list The same input names need to be used for deployment, so choose easy to remember names (such as: input0, input1) e.g. 
[('input0', (1, 2)), ('input1', (3, 4))] - or - [('input0', ((1, 2), 'int')), ('input1', ((3, 4), 'float'))] + [('input0', (1, 2)), ('input1', (3, 4))] + or + [('input0', ((1, 2), 'int')), ('input1', ((3, 4), 'float'))] - custom_convert_map: Dictionary of str to Relay op + custom_convert_map : Dictionary of str to Relay op A custom op conversion map in the same format as _convert_map above Returns
diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index 3d229651cb4f..efb7a998014f 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -51,6 +51,10 @@ find . -type f -path "*.log" | xargs rm -f find . -type f -path "*.pyc" | xargs rm -f make cython3 +# install theme addon for to local if does not exists +# avoid docker for now before we stablize on the choice of style +python3 -m pip install --user --upgrade -q tlcpack-sphinx-addon==0.1.0 + cd docs PYTHONPATH=`pwd`/../python make html |& tee /tmp/$$.log.txt if grep -E "failed to execute|Segmentation fault" < /tmp/$$.log.txt; then
diff --git a/tests/scripts/task_sphinx_precheck.sh b/tests/scripts/task_sphinx_precheck.sh index fd67b0ab539b..08184d9ff333 100755 --- a/tests/scripts/task_sphinx_precheck.sh +++ b/tests/scripts/task_sphinx_precheck.sh @@ -33,6 +33,10 @@ rm -rf docs/vta/tutorials find . -type f -path "*.pyc" | xargs rm -f make cython3 +# install theme addon for to local if does not exists +# avoid docker for now before we stablize on the choice of style +python3 -m pip install --user --upgrade -q tlcpack-sphinx-addon==0.1.0 + echo "PreCheck sphinx doc generation WARNINGS.." cd docs make clean
From fa99c396390eaa232baad4dac436ddbbdbad5371 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 3 Nov 2020 20:43:49 -0500 Subject: [PATCH 113/258] Update link (#6838) --- python/tvm/autotvm/tophub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index e3170ba98f8a..b1be488a220a 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -36,7 +36,7 @@ AUTOTVM_TOPHUB_LOC_VAR = "TOPHUB_LOCATION" # default location of TopHub -AUTOTVM_TOPHUB_DEFAULT_LOC = "https://raw.githubusercontent.com/uwsampl/tvm-distro/master/tophub" +AUTOTVM_TOPHUB_DEFAULT_LOC = "https://raw.githubusercontent.com/tlc-pack/tophub/main/tophub" # value of AUTOTVM_TOPHUB_LOC_VAR to specify to not read from TopHub AUTOTVM_TOPHUB_NONE_LOC = "NONE"
From f6b8067c757cd7aac4cc76cf0638f95db2301978 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Wed, 4 Nov 2020 02:28:16 +0000 Subject: [PATCH 114/258] [BYOC] FTVMAnnotateTarget method signature update (#6786) The signature of FTVMAnnotateTarget changed to runtime::TypedPackedFunc, which allows annotators to use extra information from the passed expr argument.
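Before the diff, a side-by-side sketch of what the signature change means for an out-of-tree annotator; the target name example_codegen is hypothetical:

```python
import tvm

# Old style, which no longer matches FTVMAnnotateTarget:
#     def relu_checker(attrs, args): ...
# New style: the annotator receives the whole call expression and unpacks
# what it needs, so checks can also use checked_type, spans, and so on.
@tvm.ir.register_op_attr("nn.relu", "target.example_codegen")
def relu_checker(expr):  # pylint: disable=unused-variable
    args = expr.args
    return args[0].checked_type.dtype == "float32"
```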
--- include/tvm/relay/op_attr_types.h | 8 +- .../tvm/relay/op/contrib/arm_compute_lib.py | 47 ++++---- python/tvm/relay/op/contrib/coreml.py | 3 +- python/tvm/relay/op/contrib/dnnl.py | 2 +- python/tvm/relay/op/contrib/ethosn.py | 21 ++-- python/tvm/relay/op/contrib/tensorrt.py | 102 +++++++++++++----- src/relay/transforms/annotate_target.cc | 3 +- .../python/relay/test_pass_annotate_target.py | 34 +++--- .../python/relay/test_pass_partition_graph.py | 12 +-- 9 files changed, 149 insertions(+), 83 deletions(-) diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index acd4a03aed03..1e9b86d9e0bc 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -175,15 +175,11 @@ using FTVMLegalize = runtime::TypedPackedFunc& args)>; +using FTVMAnnotateTarget = runtime::TypedPackedFunc; /*! * \brief Forward rewriting rule for a specific op. diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index 8dfb3b7e0bf4..80d64db693ce 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -167,7 +167,7 @@ def check_conv(extract): call = extract while call.op.name != "nn.conv2d": call = call.args[0] - return conv2d(call.attrs, call.args) + return conv2d(call) def check_qnn_conv(extract): """Check qnn conv pattern is supported by ACL.""" @@ -176,14 +176,14 @@ def check_qnn_conv(extract): call = extract while call.op.name != "qnn.conv2d": call = call.args[0] - return qnn_conv2d(call.attrs, call.args) + return qnn_conv2d(call) def check_dense(extract): """Check conv pattern is supported by ACL.""" call = extract while call.op.name != "nn.dense": call = call.args[0] - return dense(call.attrs, call.args) + return dense(call) def check_qnn_dense(extract): """Check qnn conv pattern is supported by ACL.""" @@ -192,7 +192,7 @@ def check_qnn_dense(extract): call = extract while call.op.name != "qnn.dense": call = call.args[0] - return qnn_dense(call.attrs, call.args) + return qnn_dense(call) def check_avg_pool2d(extract): """Check average pool2d pattern is supported by ACL.""" @@ -201,12 +201,12 @@ def check_avg_pool2d(extract): pool = extract.args[0] if pool.args[0].attrs.dtype != "int32": return False - return avg_pool2d(pool.attrs, pool.args, from_quantized_composite=True) + return avg_pool2d(pool, from_quantized_composite=True) def check_l2_pool2d(extract): """Check l2 pool2d pattern is supported by ACL.""" pool = extract.args[0] - return avg_pool2d(pool.attrs, pool.args) + return avg_pool2d(pool) return [ ("arm_compute_lib.conv2d", conv_pattern(), check_conv), @@ -221,7 +221,7 @@ def check_l2_pool2d(extract): def _register_external_op_helper(op_name, supported=True): @tvm.ir.register_op_attr(op_name, "target.arm_compute_lib") - def _func_wrapper(attrs, args): + def _func_wrapper(expr): return supported return _func_wrapper @@ -231,8 +231,9 @@ def _func_wrapper(attrs, args): @tvm.ir.register_op_attr("nn.conv2d", "target.arm_compute_lib") -def conv2d(attrs, args): +def conv2d(expr): """Check if the external ACL codegen for conv2d should be used.""" + attrs, args = expr.attrs, expr.args if attrs.groups != 1: return False if attrs.data_layout != "NHWC": @@ -248,8 +249,9 @@ def conv2d(attrs, args): return True -def qnn_conv2d(attrs, args): +def qnn_conv2d(expr): """Check if the external ACL codegen for qnn.conv2d should be used.""" + attrs, args = expr.attrs, expr.args if attrs.groups != 1: return False if attrs.data_layout != "NHWC": @@ -266,8 
+268,9 @@ def qnn_conv2d(attrs, args): @tvm.ir.register_op_attr("nn.dense", "target.arm_compute_lib") -def dense(attrs, args): +def dense(expr): """Check if the external ACL codegen for dense should be used.""" + attrs, args = expr.attrs, expr.args data_typ = args[0].checked_type if data_typ.dtype != "float32": return False @@ -279,8 +282,9 @@ def dense(attrs, args): return True -def qnn_dense(attrs, args): +def qnn_dense(expr): """Check if the external ACL codegen for qnn.dense should be used.""" + attrs, args = expr.attrs, expr.args data_typ = args[0].checked_type if data_typ.dtype != "uint8": return False @@ -293,8 +297,9 @@ def qnn_dense(attrs, args): @tvm.ir.register_op_attr("nn.max_pool2d", "target.arm_compute_lib") -def max_pool2d(attrs, args): +def max_pool2d(expr): """Check if the external ACL codegen for maxpool2d should be used.""" + attrs, args = expr.attrs, expr.args if attrs.layout != "NHWC": return False typ = args[0].checked_type @@ -304,8 +309,9 @@ def max_pool2d(attrs, args): @tvm.ir.register_op_attr("nn.avg_pool2d", "target.arm_compute_lib") -def avg_pool2d(attrs, args, from_quantized_composite=False): +def avg_pool2d(expr, from_quantized_composite=False): """Check if the external ACL codegen for avgpool2d should be used.""" + attrs, args = expr.attrs, expr.args typ = args[0].checked_type if from_quantized_composite: if typ.dtype != "int32": @@ -319,8 +325,9 @@ def avg_pool2d(attrs, args, from_quantized_composite=False): @tvm.ir.register_op_attr("nn.global_max_pool2d", "target.arm_compute_lib") -def global_max_pool2d(attrs, args): +def global_max_pool2d(expr): """Check if the external ACL codegen for gloval_maxpool2d should be used.""" + attrs, args = expr.attrs, expr.args typ = args[0].checked_type if typ.dtype not in ["float32", "uint8"]: return False @@ -330,8 +337,9 @@ def global_max_pool2d(attrs, args): @tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.arm_compute_lib") -def global_avg_pool2d(attrs, args): +def global_avg_pool2d(expr): """Check if the external ACL codegen for global_avgpool2d should be used.""" + attrs, args = expr.attrs, expr.args typ = args[0].checked_type if typ.dtype not in ["float32"]: return False @@ -341,16 +349,18 @@ def global_avg_pool2d(attrs, args): @tvm.ir.register_op_attr("maximum", "target.arm_compute_lib") -def maximum(attrs, args): +def maximum(expr): """Check if the external ACL codegen for maximum should be used.""" + args = expr.args type_a = args[0].checked_type type_b = args[0].checked_type return (type_a.dtype == "float32") and (type_b.dtype == "float32") @tvm.ir.register_op_attr("add", "target.arm_compute_lib") -def add(attrs, args): +def add(expr): """Check if the external ACL codegen for add should be used.""" + args = expr.args for typ in [args[0].checked_type, args[1].checked_type]: if typ.dtype != "float32": return False @@ -359,8 +369,9 @@ def add(attrs, args): @tvm.ir.register_op_attr("qnn.add", "target.arm_compute_lib") -def qnn_add(attrs, args): +def qnn_add(expr): """Check if the external ACL codegen for add should be used.""" + args = expr.args for typ in [args[0].checked_type, args[1].checked_type]: if typ.dtype != "uint8": return False diff --git a/python/tvm/relay/op/contrib/coreml.py b/python/tvm/relay/op/contrib/coreml.py index 105009a9f9b0..c1c012199cec 100644 --- a/python/tvm/relay/op/contrib/coreml.py +++ b/python/tvm/relay/op/contrib/coreml.py @@ -31,7 +31,8 @@ def _register_coreml_op(op_name): """ - def _check_supported(attrs, args): + def _check_supported(expr): + attrs, args = expr.attrs, 
expr.args if op_name == "nn.conv2d": if not isinstance(args[1], Constant): return False diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py index 816cb3818409..79bd02db164b 100644 --- a/python/tvm/relay/op/contrib/dnnl.py +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -53,7 +53,7 @@ def _register_external_op_helper(op_name, supported=True): """ @tvm.ir.register_op_attr(op_name, "target.dnnl") - def _func_wrapper(attrs, args): + def _func_wrapper(expr): return supported return _func_wrapper diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index 3c676f4d9623..3a05011242e7 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -128,21 +128,23 @@ def _is_ethosn_composite(node): @tvm.ir.register_op_attr("nn.max_pool2d", "target.ethos-n") -def max_pool2d(attrs, args): +def max_pool2d(expr): """Check if a max pool2d is supported by Ethos-N.""" if not ethosn_available(): return False + attrs, args = expr.attrs, expr.args pool = tvm.relay.nn.max_pool2d(*args, **attrs) return support.max_pool2d(pool) @tvm.ir.register_op_attr("reshape", "target.ethos-n") -def reshape(attrs, args): +def reshape(expr): """Check if a reshape is supported by Ethos-N.""" if not ethosn_available(): return False + attrs, args = expr.attrs, expr.args if not _is_ethosn_composite(args[0]): return False @@ -151,21 +153,23 @@ def reshape(attrs, args): @tvm.ir.register_op_attr("qnn.add", "target.ethos-n") -def qnn_add(attrs, args): +def qnn_add(expr): """Check if an addition is supported by Ethos-N.""" if not ethosn_available(): return False + args = expr.args add = _qnn.op.add(*args) return support.addition(add) @tvm.ir.register_op_attr("qnn.concatenate", "target.ethos-n") -def qnn_concatenate(attrs, args): +def qnn_concatenate(expr): """Check if a concatenate is supported by Ethos-N.""" if not ethosn_available(): return False + attrs, args = expr.attrs, expr.args conc = _qnn.op.concatenate(*args, **attrs) if not support.concatenate(conc): return False @@ -190,11 +194,12 @@ def qnn_concatenate(attrs, args): @tvm.ir.register_op_attr("split", "target.ethos-n") -def split(attrs, args): +def split(expr): """Check if a split is supported by Ethos-N.""" if not ethosn_available(): return False + attrs, args = expr.attrs, expr.args if isinstance(attrs["indices_or_sections"], tvm.tir.IntImm): sp = tvm.relay.split( *args, indices_or_sections=attrs["indices_or_sections"].value, axis=attrs["axis"] @@ -210,11 +215,12 @@ def split(attrs, args): @tvm.ir.register_op_attr("nn.depth_to_space", "target.ethos-n") -def depth_to_space(attrs, args): +def depth_to_space(expr): """Check if a depth_to_space is supported by Ethos-N.""" if not ethosn_available(): return False + attrs, args = expr.attrs, expr.args depth = tvm.relay.nn.depth_to_space(*args, **attrs) if not support.depth_to_space(depth): return False @@ -223,11 +229,12 @@ def depth_to_space(attrs, args): @tvm.ir.register_op_attr("clip", "target.ethos-n") -def clip(attrs, args): +def clip(expr): """Check if a clip is supported by Ethos-N.""" if not ethosn_available(): return False + attrs, args = expr.attrs, expr.args c = tvm.relay.clip(*args, **attrs) if not support.relu(c): return False diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index a0e23a043a72..24c468fee0fe 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -157,7 +157,8 @@ def partition_for_tensorrt( def 
_register_external_op_helper_with_checker(op_name, checker): @tvm.ir.register_op_attr(op_name, "target.tensorrt") - def _func_wrapper(attrs, args): + def _func_wrapper(expr): + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -192,9 +193,10 @@ def _register_external_op_helper(op_name, supported=True): @tvm.ir.register_op_attr("add", "target.tensorrt") -def add_annotate_fn(attrs, args): # pylint: disable=unused-variable +def add_annotate_fn(expr): # pylint: disable=unused-variable """Check if add is supported by TensorRT.""" + args = expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -211,8 +213,10 @@ def add_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.batch_norm", "target.tensorrt") -def batch_norm_annotate_fn(attrs, args): # pylint: disable=unused-variable +def batch_norm_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.batch_norm is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -223,8 +227,10 @@ def batch_norm_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.softmax", "target.tensorrt") -def softmax_annotate_fn(attrs, args): # pylint: disable=unused-variable +def softmax_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.softmax is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -235,8 +241,10 @@ def softmax_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.conv2d", "target.tensorrt") -def conv2d_annotate_fn(attrs, args): # pylint: disable=unused-variable +def conv2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv2d is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -253,8 +261,10 @@ def conv2d_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.dense", "target.tensorrt") -def dense_annotate_fn(attrs, args): # pylint: disable=unused-variable +def dense_annotate_fn(expr): # pylint: disable=unused-variable """Check if dense is supported by TensorRT.""" + + args = expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -270,8 +280,10 @@ def dense_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.bias_add", "target.tensorrt") -def bias_add_annotate_fn(attrs, args): # pylint: disable=unused-variable +def bias_add_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.bias_add is supported by TensorRT.""" + + args = expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -283,8 +295,10 @@ def bias_add_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.max_pool2d", "target.tensorrt") -def max_pool_2d_annotate_fn(attrs, args): # 
pylint: disable=unused-variable +def max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.max_pool2d is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -298,8 +312,10 @@ def max_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.avg_pool2d", "target.tensorrt") -def avg_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-variable +def avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.avg_pool2d is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -326,8 +342,10 @@ def avg_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.global_max_pool2d", "target.tensorrt") -def global_max_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-variable +def global_max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.global_max_pool2d is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -338,8 +356,10 @@ def global_max_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-varia @tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.tensorrt") -def global_avg_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-variable +def global_avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.global_avg_pool2d is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -350,8 +370,10 @@ def global_avg_pool_2d_annotate_fn(attrs, args): # pylint: disable=unused-varia @tvm.ir.register_op_attr("expand_dims", "target.tensorrt") -def expand_dims_annotate_fn(attrs, args): # pylint: disable=unused-variable +def expand_dims_annotate_fn(expr): # pylint: disable=unused-variable """Check if expand_dims is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -362,8 +384,10 @@ def expand_dims_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("squeeze", "target.tensorrt") -def squeeze_annotate_fn(attrs, args): # pylint: disable=unused-variable +def squeeze_annotate_fn(expr): # pylint: disable=unused-variable """Check if squeeze is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -377,8 +401,10 @@ def squeeze_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("concatenate", "target.tensorrt") -def concatenate_annotate_fn(attrs, args): # pylint: disable=unused-variable +def concatenate_annotate_fn(expr): # pylint: disable=unused-variable """Check if concatenate is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.dtype != "float32" for x in args[0].checked_type.fields]): logger.info("Only float32 inputs are supported 
for TensorRT.") return False @@ -396,8 +422,10 @@ def concatenate_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.conv2d_transpose", "target.tensorrt") -def conv2d_transpose_annotate_fn(attrs, args): # pylint: disable=unused-variable +def conv2d_transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv2d_transpose is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -419,8 +447,10 @@ def conv2d_transpose_annotate_fn(attrs, args): # pylint: disable=unused-variabl @tvm.ir.register_op_attr("transpose", "target.tensorrt") -def transpose_annotate_fn(attrs, args): # pylint: disable=unused-variable +def transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if transpose is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -431,8 +461,10 @@ def transpose_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("layout_transform", "target.tensorrt") -def layout_transform_annotate_fn(attrs, args): # pylint: disable=unused-variable +def layout_transform_annotate_fn(expr): # pylint: disable=unused-variable """Check if layout_transform is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -450,8 +482,10 @@ def layout_transform_annotate_fn(attrs, args): # pylint: disable=unused-variabl @tvm.ir.register_op_attr("reshape", "target.tensorrt") -def reshape_annotate_fn(attrs, args): # pylint: disable=unused-variable +def reshape_annotate_fn(expr): # pylint: disable=unused-variable """Check if reshape is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if args[0].checked_type.dtype != "float32": logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -481,8 +515,10 @@ def reshape_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.pad", "target.tensorrt") -def pad_annotate_fn(attrs, args): # pylint: disable=unused-variable +def pad_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.pad is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -543,8 +579,10 @@ def _func_wrapper(attrs, args, op_name): @tvm.ir.register_op_attr("strided_slice", "target.tensorrt") -def strided_slice_annotate_fn(attrs, args): # pylint: disable=unused-variable +def strided_slice_annotate_fn(expr): # pylint: disable=unused-variable """Check if strided_slice is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if args[0].checked_type.dtype != "float32": logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -567,8 +605,10 @@ def strided_slice_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.adaptive_max_pool2d", "target.tensorrt") -def adapative_max_pool2d_annotate_fn(attrs, args): # pylint: disable=unused-variable +def adapative_max_pool2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.adaptive_max_pool2d is supported by TensorRT.""" 
+ + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -579,8 +619,10 @@ def adapative_max_pool2d_annotate_fn(attrs, args): # pylint: disable=unused-var @tvm.ir.register_op_attr("nn.adaptive_avg_pool2d", "target.tensorrt") -def adapative_avg_pool2d_annotate_fn(attrs, args): # pylint: disable=unused-variable +def adapative_avg_pool2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.adaptive_avg_pool2d is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -591,8 +633,10 @@ def adapative_avg_pool2d_annotate_fn(attrs, args): # pylint: disable=unused-var @tvm.ir.register_op_attr("nn.conv3d", "target.tensorrt") -def conv3d_annotate_fn(attrs, args): # pylint: disable=unused-variable +def conv3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv3d is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -611,8 +655,10 @@ def conv3d_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.max_pool3d", "target.tensorrt") -def max_pool_3d_annotate_fn(attrs, args): # pylint: disable=unused-variable +def max_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.max_pool3d is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -625,8 +671,10 @@ def max_pool_3d_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.avg_pool3d", "target.tensorrt") -def avg_pool_3d_annotate_fn(attrs, args): # pylint: disable=unused-variable +def avg_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.avg_pool3d is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -639,8 +687,10 @@ def avg_pool_3d_annotate_fn(attrs, args): # pylint: disable=unused-variable @tvm.ir.register_op_attr("nn.conv3d_transpose", "target.tensorrt") -def conv3d_transpose_annotate_fn(attrs, args): # pylint: disable=unused-variable +def conv3d_transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv3d_transpose is supported by TensorRT.""" + + attrs, args = expr.attrs, expr.args if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc index 9d160b26f1ad..d5f1e4cc1752 100644 --- a/src/relay/transforms/annotate_target.cc +++ b/src/relay/transforms/annotate_target.cc @@ -178,7 +178,8 @@ class AnnotateTargetRewriter : public ExprRewriter { continue; } auto fannotate = Op::GetAttrMap("target." 
+ std::string(target)); - if (fannotate.count(op) && fannotate[op](pre->attrs, pre->args)) { + const Expr& ex = GetRef<Expr>(pre); + if (fannotate.count(op) && fannotate[op](ex)) { supported_targets.push_back(target); } } diff --git a/tests/python/relay/test_pass_annotate_target.py b/tests/python/relay/test_pass_annotate_target.py index 106909e16fa7..325826d183da 100644 --- a/tests/python/relay/test_pass_annotate_target.py +++ b/tests/python/relay/test_pass_annotate_target.py @@ -179,7 +179,7 @@ def test_extern_dnnl_mobilenet(): def test_multiple_ends(): @tvm.ir.register_op_attr("nn.relu", "target.test") - def relu(attrs, args): # pylint: disable=unused-variable + def relu(expr): # pylint: disable=unused-variable return True def before(): @@ -221,8 +221,8 @@ def test_type_propagation(): target = "test_type_propagation" @tvm.ir.register_op_attr("nn.relu", "target." + target) - def relu(attrs, args): # pylint: disable=unused-variable - return args[0].checked_type.dtype == "float32" + def relu(expr): # pylint: disable=unused-variable + return expr.args[0].checked_type.dtype == "float32" def before(): x = relay.var("x", shape=(10, 10)) @@ -240,11 +240,11 @@ def test_tuple(): target = "test_tuple" @tvm.ir.register_op_attr("nn.relu", "target." + target) - def relu(attrs, args): # pylint: disable=unused-variable + def relu(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("concatenate", "target." + target) - def concatenate(attrs, args): # pylint: disable=unused-variable + def concatenate(expr): # pylint: disable=unused-variable return True """Test that TupleNode is included in annotation when surrounded by supported nodes.""" @@ -331,11 +331,11 @@ def after(): def test_multiple_runs(): @tvm.ir.register_op_attr("nn.relu", "target.A") - def relu(attrs, args): # pylint: disable=unused-variable + def relu(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("add", "target.B") - def add(attrs, args): # pylint: disable=unused-variable + def add(expr): # pylint: disable=unused-variable return True def before(): @@ -359,19 +359,19 @@ def test_if_else(): target = "test_if_else" @tvm.ir.register_op_attr("equal", "target." + target) - def relu(attrs, args): # pylint: disable=unused-variable + def relu(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("tanh", "target." + target) - def tanh(attrs, args): # pylint: disable=unused-variable + def tanh(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("sigmoid", "target." + target) - def sigmoid(attrs, args): # pylint: disable=unused-variable + def sigmoid(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("erf", "target." + target) - def erf(attrs, args): # pylint: disable=unused-variable + def erf(expr): # pylint: disable=unused-variable return True """Test that If-else nodes compiles correctly when surrounded by supported nodes.""" @@ -430,15 +430,15 @@ def test_while_let(): target = "test_while_let" @tvm.ir.register_op_attr("less", "target." + target) - def less(attrs, args): # pylint: disable=unused-variable + def less(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("add", "target." + target) - def add(attrs, args): # pylint: disable=unused-variable + def add(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("zeros_like", "target."
+ target) - def zeros_like(attrs, args): # pylint: disable=unused-variable + def zeros_like(expr): # pylint: disable=unused-variable return True """Test that let nodes compiles correctly when surrounded by other nodes.""" @@ -514,15 +514,15 @@ def test_if_free_vars(): target = "test_if_free_vars" @tvm.ir.register_op_attr("equal", "target." + target) - def equal(attrs, args): # pylint: disable=unused-variable + def equal(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("sigmoid", "target." + target) - def sigmoid(attrs, args): # pylint: disable=unused-variable + def sigmoid(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("erf", "target." + target) - def erf(attrs, args): # pylint: disable=unused-variable + def erf(expr): # pylint: disable=unused-variable return True """Test that If-else nodes compiles correctly when surrounded by free variables""" diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 8d0e2d5e22e0..059d0b4c8af8 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -1035,7 +1035,7 @@ def test_duplicate_outputs(): target = "test_duplicate_outputs" @tvm.ir.register_op_attr("abs", "target." + target) - def abs(attrs, args): # pylint: disable=unused-variable + def abs(expr): # pylint: disable=unused-variable return True def create_graph(): @@ -1096,11 +1096,11 @@ def test_duplicate_merge_and_tuplegetitem(): target = "test_duplicate_merge_and_tuplegetitem" @tvm.ir.register_op_attr("nn.batch_norm", "target." + target) - def batch_norm(attrs, args): # pylint: disable=unused-variable + def batch_norm(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("nn.relu", "target." + target) - def relu(attrs, args): # pylint: disable=unused-variable + def relu(expr): # pylint: disable=unused-variable return True def create_graph(): @@ -1177,7 +1177,7 @@ def expected(): def test_constant_tuples(): @tvm.ir.register_op_attr("qnn.concatenate", "target.const_tuples") - def add(attrs, args): # pylint: disable=unused-variable + def add(expr): # pylint: disable=unused-variable return True def create_graph(): @@ -1223,11 +1223,11 @@ def test_flatten_tuple_output(): target = "test_flatten_tuple_output" @tvm.ir.register_op_attr("split", "target." + target) - def split(attrs, args): # pylint: disable=unused-variable + def split(expr): # pylint: disable=unused-variable return True @tvm.ir.register_op_attr("abs", "target." 
+ target) - def abs(attrs, args): # pylint: disable=unused-variable + def abs(expr): # pylint: disable=unused-variable return True def create_graph(): From 85c2c0eb5afe8d6ee41d640e4e52129d6899aaf1 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 3 Nov 2020 19:55:21 -0800 Subject: [PATCH 115/258] [CI] Disable flaky tests (#6841) * [CI] Disable flaky tests * format --- tests/python/unittest/test_auto_scheduler_layout_rewrite.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py index e6f9a76fce62..9e4f1bf27735 100644 --- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py +++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py @@ -19,6 +19,8 @@ import tempfile import numpy as np +import pytest + import tvm from tvm import topi from tvm import auto_scheduler, te @@ -46,6 +48,7 @@ def test_apply_steps_with_layout_rewrite(): assert bufs[1].shape[1] == 512 +@pytest.mark.skip("skip due to flaky") @tvm.testing.requires_llvm def test_correctness_layout_rewrite_rewrite_for_preTransformed(): N = 128 @@ -116,6 +119,7 @@ def test_correctness_layout_rewrite_rewrite_for_preTransformed(): del measure_ctx +@pytest.mark.skip("skip due to flaky") @tvm.testing.requires_llvm def test_correctness_layout_rewrite_insert_transform_stage(): N = 128 From 8b1ccc96d4f59a32ba2ce93eb2439c72291a76a3 Mon Sep 17 00:00:00 2001 From: ANSHUMAN TRIPATHY Date: Wed, 4 Nov 2020 09:58:16 +0530 Subject: [PATCH 116/258] [Relay][Frontend] SparseTensorDenseMatMul support for Tensorflow (#6685) * [Relay][Frontend] SparseTensorDenseMatMul support for Tensorflow * Lint error resolved * [1] Review comments handled * [2] Review comments handled --- python/tvm/relay/frontend/tensorflow.py | 46 +++++++++++++++ python/tvm/relay/op/nn/nn.py | 6 +- python/tvm/topi/cuda/sparse.py | 13 +++-- .../frontend/tensorflow/test_forward.py | 56 +++++++++++++++++++ 4 files changed, 115 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index a6fd1db7e7b5..c6079b4535c4 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -903,6 +903,51 @@ def _impl(inputs, attr, params, mod): return _impl +def _sparse_tensor_dense_matmul(): + # Sparse utility from scipy + from scipy.sparse import csr_matrix + + def _impl(inputs, attr, params, mod): + assert len(inputs) == 4, "There should be 4 input tensors" + + indices_tensor = _infer_value(inputs[0], params, mod).asnumpy() + values_tensor = _infer_value(inputs[1], params, mod).asnumpy() + dense_shape_tensor = _infer_value(inputs[2], params, mod).asnumpy() + + data = inputs[3] + + rows = [x[0] for x in indices_tensor] + cols = [x[1] for x in indices_tensor] + + # Create scipy sparse Tensor(CSR) + weight_sp = csr_matrix( + (values_tensor, (rows, cols)), shape=tuple(dense_shape_tensor.tolist()) + ) + weight_sp = csr_matrix(weight_sp.transpose()) + + weight_data = _expr.const(weight_sp.data, weight_sp.data.dtype) + weight_indptrs = _expr.const(weight_sp.indptr, weight_sp.indptr.dtype) + weight_indices = _expr.const(weight_sp.indices, weight_sp.indices.dtype) + + ret = _op.nn.sparse_dense(data, [weight_data, weight_indices, weight_indptrs]) + + # If both are true means First input was dense and second was sparse + # TODO(ANSHUMAN87): Support other adjoint option too + if attr.get("adjoint_a") and attr.get("adjoint_b"): + ret = _op.transpose(ret) + else: + raise 
tvm.error.OpAttributeUnImplemented( + "Only tf.sparse.sparse_dense_matmul() with adjoint_a=True and adjoint_b=True" + " is supported, but adjoint_a={} and adjoint_b={} was supplied.".format( + attr.get("adjoint_a"), attr.get("adjoint_b") + ) + ) + + return ret + + return _impl + + def _identity(): def _impl(inputs, attr, params, mod): return inputs[0] @@ -2411,6 +2456,7 @@ def _impl(inputs, attr, params, mod): "SpaceToBatchND": _space_to_batch_nd(), "SpaceToDepth": _space_to_depth(), "SparseToDense": _sparse_to_dense(), + "SparseTensorDenseMatMul": _sparse_tensor_dense_matmul(), "Split": _split(False), "SplitV": _split(True), "Sqrt": AttrCvt("sqrt"), diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 0d012540343f..4810bdc35bbd 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -2046,7 +2046,7 @@ def sparse_transpose(x): Parameters ---------- - x : namedtuple. + x : Union[namedtuple, Tuple[ndarray, ndarray, ndarray]]. The sparse weight matrix for the fast matrix transpose. Returns @@ -2055,7 +2055,9 @@ def sparse_transpose(x): Tuple of output sparse tensor (same shape and format as input), i.e. if CSR then output is in ([data, indices, indptr]) form """ - return expr.TupleWrapper(_make.sparse_transpose(x.data, x.indices, x.indptr), 3) + if hasattr(x, "indices"): + return expr.TupleWrapper(_make.sparse_transpose(x.data, x.indices, x.indptr), 3) + return expr.TupleWrapper(_make.sparse_transpose(x[0], x[1], x[2]), 3) def contrib_conv2d_winograd_without_weight_transform( diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py index d125423968a9..ebac5517d46c 100644 --- a/python/tvm/topi/cuda/sparse.py +++ b/python/tvm/topi/cuda/sparse.py @@ -180,7 +180,7 @@ def gen_ir(data, w_data, w_indices, w_indptr, out): assert ( mb >= mi ), "Number of block rows in dense matrix must be larger than warp size: {} vs {}.".format( - warp_size, m + warp_size, mb ) mo = ceil_div(mb, mi) ni = 1 # TODO(tkonolige): how do I compute the number of warps per block? 
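As an aside on the SparseTensorDenseMatMul converter above: the CSR plumbing it performs can be reproduced standalone with scipy. Below is a minimal sketch, assuming only scipy is installed; the sample indices, values, and dense shape mirror the SparseTensor example in the tests further below, and the weight is transposed before extracting its data/indices/indptr arrays because relay.nn.sparse_dense multiplies by the transposed sparse weight.

from scipy.sparse import csr_matrix

# TensorFlow's SparseTensor is an (indices, values, dense_shape) triple.
indices = [[0, 0], [1, 2]]
values = [4.0, 8.0]
dense_shape = (3, 4)

rows = [x[0] for x in indices]
cols = [x[1] for x in indices]

# Build the CSR weight and transpose it, as the converter does.
weight_sp = csr_matrix((values, (rows, cols)), shape=dense_shape)
weight_sp = csr_matrix(weight_sp.transpose())

# These three arrays become the relay constants fed to nn.sparse_dense.
print(weight_sp.data, weight_sp.indices, weight_sp.indptr)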
@@ -367,9 +367,14 @@ def _alter_sparse_dense_layout(_attrs, inputs, _tinfos, _out_type): and isinstance(inputs[2], relay.Constant) and isinstance(inputs[3], relay.Constant) ): - sparse_matrix = sp.bsr_matrix( - (inputs[1].data.asnumpy(), inputs[2].data.asnumpy(), inputs[3].data.asnumpy()) - ) + if len(inputs[1].data.asnumpy().shape) == 1: + sparse_matrix = sp.csr_matrix( + (inputs[1].data.asnumpy(), inputs[2].data.asnumpy(), inputs[3].data.asnumpy()) + ).tobsr() + else: + sparse_matrix = sp.bsr_matrix( + (inputs[1].data.asnumpy(), inputs[2].data.asnumpy(), inputs[3].data.asnumpy()) + ) warp_size = int(tvm.target.Target.current(allow_none=False).thread_warp_size) sparse_matrix = pad_sparse_matrix(sparse_matrix, warp_size) return relay.nn._make.sparse_dense_padded( diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 90e639e36809..5f849ac9ac93 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -1765,6 +1765,62 @@ def test_forward_batch_matmul(): _test_batch_matmul((2, 3, 4, 2, 3, 4, 5, 6), (2, 3, 4, 2, 3, 4, 5, 6), "float32", False, True) +####################################################################### +# SparseTensorDenseMatMul +# ---------------------------------- + + +def _test_sparse_dense_matmul(indices, values, A_shape, B_shape, dtype, flip=False): + """ One iteration of sparse_dense_matmul """ + + # TODO(ANSHUMAN87): Support adjoint options too + for adjoint_a in [False]: + for adjoint_b in [False]: + with tf.Graph().as_default(): + A_sp = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=A_shape) + B = tf.placeholder(shape=B_shape, dtype=dtype, name="B") + + if flip: + result = tf.sparse.sparse_dense_matmul( + B, A_sp, adjoint_a=adjoint_a, adjoint_b=adjoint_b + ) + else: + result = tf.sparse.sparse_dense_matmul( + A_sp, B, adjoint_a=adjoint_a, adjoint_b=adjoint_b + ) + + B_np = np.random.uniform(high=5.0, size=B_shape).astype(dtype) + + # TODO(ANSHUMAN87): There is an issue in cuda scheduling for csr, work in progress + compare_tf_with_tvm([B_np], [B.name], result.name, no_gpu=True) + + +def test_forward_sparse_dense_matmul(): + """ sparse_dense_matmul op test""" + ################################################################### + # + # In order to create a SparseTensor, it requires 3 input as below: + # SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) + # + # Above Sparse can be represented in Dense as below : + # [[1, 0, 0, 0] + # [0, 0, 2, 0] + # [0, 0, 0, 0]] + # + # ------------------------------------------------------------------ + + # TODO(ANSHUMAN87): False case for flip need to be supported + # _test_sparse_dense_matmul([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], [4, 3], "float32") + _test_sparse_dense_matmul([[0, 0], [1, 2]], [4.0, 8.0], [3, 5], [4, 3], "float32", True) + _test_sparse_dense_matmul([[0, 0], [1, 2]], [4.0, 8.0], [3, 3], [3, 3], "float32", True) + _test_sparse_dense_matmul( + [[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], [5, 5], "float32", True + ) + _test_sparse_dense_matmul( + [[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [9, 5], [7, 9], "float32", True + ) + + ####################################################################### # StridedSlice # ------------ From 925d0577146e45675cd6f0ca924e6fb29e05e6da Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 3 Nov 2020 20:57:50 -0800 Subject: [PATCH 117/258] Register shape functions for some image related ops (#6373) * 
debugging * added three shape funcs * fix lint * address comment * resolve conflicts * resolve conflicts * resolve conflicts * resolve conflicts * resolve conflicts --- python/tvm/relay/op/image/_image.py | 76 +++++++++++++++++++++++++ tests/python/relay/test_any.py | 88 +++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) diff --git a/python/tvm/relay/op/image/_image.py b/python/tvm/relay/op/image/_image.py index c0cdf64c621a..ee8a5b3883b1 100644 --- a/python/tvm/relay/op/image/_image.py +++ b/python/tvm/relay/op/image/_image.py @@ -42,6 +42,45 @@ def compute_resize(attrs, inputs, out_type): reg.register_injective_schedule("image.resize") +@script +def _resize_shape_func(image_shape, size, batch_axis, height_axis, width_axis, channel_axis): + out = output_tensor((4,), "int64") + out[batch_axis] = int64(image_shape[0]) + out[height_axis] = int64(size[0]) + out[width_axis] = int64(size[1]) + out[channel_axis] = image_shape[channel_axis] + return out + + +@reg.register_shape_func("image.resize", False) +def resize_shape_func(attrs, inputs, _): + """ + Shape function for resize op. + """ + layout = attrs.layout + height_axis = width_axis = channel_axis = 1 + for i, letter in enumerate(layout): + if letter == "N": + batch_axis = i + if letter == "H": + height_axis = i + if letter == "W": + width_axis = i + if letter == "C": + channel_axis = i + size = get_const_tuple(attrs.size) + return [ + _resize_shape_func( + inputs[0], + convert(size), + convert(batch_axis), + convert(height_axis), + convert(width_axis), + convert(channel_axis), + ) + ] + + @reg.register_compute("image.resize3d") def compute_resize3d(attrs, inputs, out_type): size = attrs.size @@ -134,6 +173,25 @@ def compute_affine_grid(attrs, inputs, out_dtype): reg.register_injective_schedule("image.affine_grid") +@script +def _affine_grid_func(data, target_shape): + out = output_tensor((4,), "int64") + out[0] = int64(data[0]) + out[1] = int64(2) + out[2] = int64(target_shape[0]) + out[3] = int64(target_shape[1]) + return out + + +@reg.register_shape_func("image.affine_grid", False) +def affine_grid_func(attrs, inputs, _): + """ + Shape function for affine_grid op. + """ + target_shape = get_const_tuple(attrs.target_shape) + return [_affine_grid_func(inputs[0], convert(target_shape))] + + # grid_sample @reg.register_compute("image.grid_sample") def compute_grid_sample(attrs, inputs, out_dtype): @@ -143,3 +201,21 @@ def compute_grid_sample(attrs, inputs, out_dtype): reg.register_injective_schedule("image.grid_sample") + + +@script +def _grid_sample_func(data, grid): + out = output_tensor((4,), "int64") + out[0] = int64(data[0]) + out[1] = int64(data[1]) + out[2] = int64(grid[2]) + out[3] = int64(grid[3]) + return out + + +@reg.register_shape_func("image.grid_sample", False) +def grid_sample_func(attrs, inputs, _): + """ + Shape function for grid_sample op. 
+ """ + return [_grid_sample_func(inputs[0], inputs[1])] diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 8784b97a31fa..546973704fea 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -1121,6 +1121,94 @@ def test_any_ndarray_size(): verify_any_ndarray_size((1, 2, 3, 4)) +def verify_any_resize(data_shape, scale, layout, static_data_shape, ref_out_shape): + mod = tvm.IRModule() + dtype = "float32" + data = relay.var("data", shape=data_shape, dtype=dtype) + if layout == "NHWC": + size = (data_shape[1] * scale, data_shape[2] * scale) + else: + size = (data_shape[2] * scale, data_shape[3] * scale) + y = relay.image.resize(data, size, layout) + mod["main"] = relay.Function([data], y) + data_np = np.random.uniform(size=static_data_shape).astype(dtype) + check_result([data_np], mod, ref_out_shape, assert_shape=True) + + +@tvm.testing.uses_gpu +def test_any_resize(): + verify_any_resize( + data_shape=(relay.Any(), 4, 4, 4), + scale=2, + layout="NHWC", + static_data_shape=(1, 4, 4, 4), + ref_out_shape=(1, 8, 8, 4), + ) + verify_any_resize( + data_shape=(relay.Any(), 8, 17, 20), + scale=3, + layout="NCHW", + static_data_shape=(2, 8, 17, 20), + ref_out_shape=(2, 8, 51, 60), + ) + + +def verify_any_grid_sample(data_shape, grid_shape, static_data_shape, ref_out_shape): + mod = tvm.IRModule() + dtype = "float32" + data = relay.var("data", shape=data_shape, dtype=dtype) + grid = relay.var("grid", shape=grid_shape, dtype=dtype) + y = relay.image.grid_sample(data, grid) + mod["main"] = relay.Function([data, grid], y) + data_np = np.random.uniform(size=static_data_shape).astype(dtype) + grid_np = np.random.uniform(size=grid_shape).astype(dtype) + check_result([data_np, grid_np], mod, ref_out_shape, assert_shape=True) + + +@tvm.testing.uses_gpu +def test_any_grid_sample(): + verify_any_grid_sample( + data_shape=(relay.Any(), 4, 16, 32), + grid_shape=(4, 2, 8, 8), + static_data_shape=(4, 4, 16, 32), + ref_out_shape=(4, 4, 8, 8), + ) + verify_any_grid_sample( + data_shape=(relay.Any(), 4, 16, 32), + grid_shape=(4, 2, 32, 32), + static_data_shape=(4, 4, 16, 32), + ref_out_shape=(4, 4, 32, 32), + ) + + +def verify_any_affine_grid(num_batch, static_num_batch, target_shape, ref_out_shape): + mod = tvm.IRModule() + dtype = "float32" + data_shape = (num_batch, 2, 3) + static_data_shape = (static_num_batch, 2, 3) + data = relay.var("data", shape=data_shape, dtype=dtype) + y = relay.image.affine_grid(data, target_shape) + mod["main"] = relay.Function([data], y) + data_np = np.random.uniform(size=static_data_shape).astype(dtype) + check_result([data_np], mod, ref_out_shape, assert_shape=True) + + +@tvm.testing.uses_gpu +def test_any_affine_grid(): + verify_any_affine_grid( + num_batch=relay.Any(), + static_num_batch=1, + target_shape=(16, 32), + ref_out_shape=(1, 2, 16, 32), + ) + verify_any_affine_grid( + num_batch=relay.Any(), + static_num_batch=8, + target_shape=(32, 32), + ref_out_shape=(8, 2, 32, 32), + ) + + def test_any_consecutive_broadcast(): dtype = "float32" data0 = relay.var("data0", shape=any_dims(2), dtype=dtype) From 68a2867422601279b65bd4fd1fc06704aae5a2a1 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 3 Nov 2020 22:00:58 -0800 Subject: [PATCH 118/258] [TopHub] Bump the versions (#6837) * [TopHub] Update version * trigger ci --- python/tvm/autotvm/tophub.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index b1be488a220a..c17c611f5499 100644 --- 
a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -46,14 +46,14 @@ # the version of each package PACKAGE_VERSION = { - "arm_cpu": "v0.07", + "arm_cpu": "v0.08", "llvm": "v0.04", "cuda": "v0.09", "rocm": "v0.05", "opencl": "v0.04", "mali": "v0.06", "intel_graphics": "v0.02", - "vta": "v0.09", + "vta": "v0.10", "amd_apu": "v0.01", } From 8e8d73d1d9087edf9eb7a8f83e970237a9286a80 Mon Sep 17 00:00:00 2001 From: masahi Date: Wed, 4 Nov 2020 18:04:11 +0900 Subject: [PATCH 119/258] [Graph memory plan] Support nested tuples (#6809) * add test * test working * uncomment other tests * remove redundant visit * test double nesting * support nested tuple in CallNode's return type * Revert "support nested tuple in CallNode's return type" This reverts commit 66225eda33f37647cfc11ceb8caa2125dfe88d0d. --- src/relay/backend/graph_plan_memory.cc | 5 ++-- .../relay/test_backend_graph_runtime.py | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index bf58c8d5be41..15173c2c79db 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -82,9 +82,8 @@ class StorageAllocaBaseVisitor : public ExprVisitor { void VisitExpr_(const TupleNode* op) final { std::vector<StorageToken*> fields; for (Expr field : op->fields) { - auto tok = GetToken(field); - ICHECK_EQ(tok.size(), 1U); - fields.push_back(tok[0]); + auto tokens = GetToken(field); + fields.insert(fields.end(), tokens.begin(), tokens.end()); } token_map_[op] = fields; } diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py index 1bd551004ad7..3c42b7b4196f 100644 --- a/tests/python/relay/test_backend_graph_runtime.py +++ b/tests/python/relay/test_backend_graph_runtime.py @@ -184,6 +184,31 @@ def unit_numpy(X, W): tvm.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5) +def test_compile_nested_tuples(): + x = relay.var("x", shape=(10,)) + x1 = x + relay.const(1.0) + x2 = x1 + relay.const(1.0) + x3 = x2 + relay.const(1.0) + x4 = x3 + relay.const(1.0) + out = relay.Tuple([x1, relay.Tuple([relay.Tuple([x2, x3]), x4])]) + func = relay.Function([x], out) + + graph, lib, _ = relay.build(tvm.IRModule.from_expr(func), "llvm") + mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + + x_data = np.random.uniform(size=(10,)).astype(np.float32) + mod.set_input(x=x_data) + mod.run() + + assert mod.get_num_outputs() == 4 + + ref = x_data + 1 + for i in range(mod.get_num_outputs()): + out = mod.get_output(i).asnumpy() + tvm.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5) + ref = ref + 1 + + if __name__ == "__main__": test_plan_memory() test_with_params() @@ -191,3 +216,4 @@ def unit_numpy(X, W): test_add_op_tensor() test_add_op_broadcast() test_gru_like() + test_compile_nested_tuples() From 275d3deb073d6fc59028f0cad460d09fa1fd0402 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 4 Nov 2020 09:02:01 -0500 Subject: [PATCH 120/258] [CI] Add python setup script (#6844) --- Jenkinsfile | 11 ++++++++++- tests/scripts/task_ci_python_setup.sh | 23 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100755 tests/scripts/task_ci_python_setup.sh diff --git a/Jenkinsfile b/Jenkinsfile index 59d08b5c7ce7..272abb4c4ec9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -181,6 +181,7 @@ stage('Build') { make(ci_cpu, 'build', '-j2') pack_lib('cpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run}
${ci_cpu} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_unittest.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_fsim.sh" @@ -198,6 +199,7 @@ stage('Build') { sh "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh" make(ci_wasm, 'build', '-j2') timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_wasm} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh" } } @@ -230,6 +232,7 @@ stage('Build') { sh "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh" make(ci_qemu, 'build', '-j2') timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_qemu} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh" } } @@ -244,6 +247,7 @@ stage('Unit Test') { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_sphinx_precheck.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh" @@ -257,6 +261,7 @@ stage('Unit Test') { init_git() unpack_lib('i386', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_i386} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_unittest.sh" sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration.sh" sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_vta_fsim.sh" @@ -282,6 +287,7 @@ stage('Unit Test') { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh" } } @@ -296,6 +302,7 @@ stage('Integration Test') { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh" } } @@ -307,6 +314,7 @@ stage('Integration Test') { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh" } } @@ -318,6 +326,7 @@ stage('Integration Test') { init_git() unpack_lib('cpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh" } } @@ -330,12 +339,12 @@ stage('Integration Test') { // init_git() // unpack_lib('gpu', tvm_multilib) // timeout(time: max_time, unit: 'MINUTES') { + // sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" // sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh" // } // pack_lib('mydocs', 'docs.tgz') // } // } - // } } /* diff --git a/tests/scripts/task_ci_python_setup.sh b/tests/scripts/task_ci_python_setup.sh new file mode 100755 index 000000000000..b12452c59528 --- /dev/null +++ b/tests/scripts/task_ci_python_setup.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +# Script to setup additional python env. From 9e458a2051c2a2b4a8359f9f07a9176939d1e828 Mon Sep 17 00:00:00 2001 From: m1k3 Date: Wed, 4 Nov 2020 06:14:52 -0800 Subject: [PATCH 121/258] Syntax error String::fromwe() should be String::from() (#6846) Co-authored-by: Mikael Sevenier --- rust/tvm/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/tvm/src/lib.rs b/rust/tvm/src/lib.rs index 7e0682b86b33..27e794984094 100644 --- a/rust/tvm/src/lib.rs +++ b/rust/tvm/src/lib.rs @@ -53,7 +53,7 @@ macro_rules! export { ($($fn_name:expr),*) => { pub fn tvm_export(ns: &str) -> Result<(), tvm::Error> { $( - let name = String::fromwe(ns) + ::std::stringify!($fn_name); + let name = String::from(ns) + ::std::stringify!($fn_name); tvm::runtime::function::register_override($fn_name, name, true)?; )* Ok(()) From acee1555ca2cc83bb62a5302b009971845959cd7 Mon Sep 17 00:00:00 2001 From: Chenfan Date: Wed, 4 Nov 2020 22:16:22 +0800 Subject: [PATCH 122/258] [AutoScheduler] Bug fix for layout rewrite CI error in i386 (#6830) --- .../test_auto_scheduler_layout_rewrite.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py index 9e4f1bf27735..9d9704df0524 100644 --- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py +++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py @@ -22,6 +22,7 @@ import pytest import tvm +import tvm.testing from tvm import topi from tvm import auto_scheduler, te @@ -48,7 +49,6 @@ def test_apply_steps_with_layout_rewrite(): assert bufs[1].shape[1] == 512 -@pytest.mark.skip("skip due to flaky") @tvm.testing.requires_llvm def test_correctness_layout_rewrite_rewrite_for_preTransformed(): N = 128 @@ -114,12 +114,11 @@ def test_correctness_layout_rewrite_rewrite_for_preTransformed(): func_ref(*args_ref) ctx.sync() - tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), rtol=1e-4) - tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), rtol=1e-4) + tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3) + tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), atol=1e-3, rtol=1e-3) del measure_ctx -@pytest.mark.skip("skip due to flaky") @tvm.testing.requires_llvm def test_correctness_layout_rewrite_insert_transform_stage(): N = 128 @@ -162,14 +161,13 @@ def test_correctness_layout_rewrite_insert_transform_stage(): func_ref(*args_ref) ctx.sync() - tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), rtol=1e-4) - tvm.testing.assert_allclose(args[1].asnumpy(), args_ref[1].asnumpy(), rtol=1e-4) - tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), rtol=1e-4) + 
tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3) + tvm.testing.assert_allclose(args[1].asnumpy(), args_ref[1].asnumpy(), atol=1e-3, rtol=1e-3) + tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), atol=1e-3, rtol=1e-3) del measure_ctx if __name__ == "__main__": test_apply_steps_with_layout_rewrite() - # Disable for now due to being flaky on i386 - # test_correctness_layout_rewrite_rewrite_for_preTransformed() - # test_correctness_layout_rewrite_insert_transform_stage() + test_correctness_layout_rewrite_rewrite_for_preTransformed() + test_correctness_layout_rewrite_insert_transform_stage() From 4b319882ca13ec692d914885aec6e1106d166d66 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 4 Nov 2020 16:38:25 -0500 Subject: [PATCH 123/258] [CI] Add more guidelines about local setup (#6848) --- docker/bash.sh | 7 ++++--- docker/build.sh | 1 + docker/with_the_same_user | 1 + docs/conf.py | 2 ++ tests/scripts/task_ci_python_setup.sh | 10 ++++++++++ tests/scripts/task_python_docs.sh | 4 ---- tests/scripts/task_sphinx_precheck.sh | 4 ---- 7 files changed, 18 insertions(+), 11 deletions(-) diff --git a/docker/bash.sh b/docker/bash.sh index a87701afb918..7420e6f9024c 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -83,9 +83,9 @@ else fi if [[ "${DOCKER_IMAGE_NAME}" == *"ci"* ]]; then - CI_PY_ENV="-e PYTHONPATH=/workspace/python" + CI_ADDON_ENV="-e PYTHONPATH=/workspace/python" else - CI_PY_ENV="" + CI_ADDON_ENV="" fi # If the Vitis-AI docker image is selected, expose the Xilinx FPGA devices and required volumes containing e.g. DSA's and overlays @@ -143,7 +143,8 @@ ${DOCKER_BINARY} run --rm --pid=host\ -e "CI_BUILD_GROUP=$(id -g -n)" \ -e "CI_BUILD_GID=$(id -g)" \ -e "CI_PYTEST_ADD_OPTIONS=$CI_PYTEST_ADD_OPTIONS" \ - ${CI_PY_ENV} \ + -e "CI_IMAGE_NAME=${DOCKER_IMAGE_NAME}" \ + ${CI_ADDON_ENV} \ ${CUDA_ENV} \ "${CI_DOCKER_EXTRA_PARAMS[@]}" \ ${DOCKER_IMAGE_NAME} \ diff --git a/docker/build.sh b/docker/build.sh index 7d9145832000..bd13937b2571 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -164,6 +164,7 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_BUILD_GROUP=$(id -g -n)" \ -e "CI_BUILD_GID=$(id -g)" \ -e "CI_PYTEST_ADD_OPTIONS=$CI_PYTEST_ADD_OPTIONS" \ + -e "CI_IMAGE_NAME=${DOCKER_IMAGE_NAME}" \ ${CUDA_ENV}\ ${CI_DOCKER_EXTRA_PARAMS[@]} \ ${DOCKER_IMG_NAME} \ diff --git a/docker/with_the_same_user b/docker/with_the_same_user index 2bcbb6f49201..459978409be5 100644 --- a/docker/with_the_same_user +++ b/docker/with_the_same_user @@ -56,5 +56,6 @@ PATH=${PATH} \ JAVA_HOME=${JAVA_HOME} \ LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \ PYTHONPATH=${PYTHONPATH} \ +CI_IMAGE_NAME=${CI_IMAGE_NAME} \ HOME=${CI_BUILD_HOME} \ "${COMMAND[@]}" diff --git a/docs/conf.py b/docs/conf.py index 9ed10d710df2..5bf2d6bbb75e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -254,6 +254,7 @@ ).strip() header_logo = "https://tvm.apache.org/assets/images/logo.svg" +header_logo_link = "https://tvm.apache.org/" header_links = [ ("Community", "https://tvm.apache.org/community"), @@ -283,6 +284,7 @@ "header_links": header_links, "header_dropdown": header_dropdown, "header_logo": header_logo, + "header_logo_link": header_logo_link, } # add additional overrides diff --git a/tests/scripts/task_ci_python_setup.sh b/tests/scripts/task_ci_python_setup.sh index b12452c59528..6463142a28c0 100755 --- a/tests/scripts/task_ci_python_setup.sh +++ b/tests/scripts/task_ci_python_setup.sh @@ -21,3 +21,13 @@ set -u set -o pipefail # Script to setup additional python env. 
+# +# Use the following command to install the +# package to /workspace/.local, these additional +# packages will have precedence over the system packages. +# +# command: python3 -m pip install --user == +# +echo "Addtiional setup in" ${CI_IMAGE_NAME} + +python3 -m pip install --user tlcpack-sphinx-addon==0.1.2 diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index efb7a998014f..3d229651cb4f 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -51,10 +51,6 @@ find . -type f -path "*.log" | xargs rm -f find . -type f -path "*.pyc" | xargs rm -f make cython3 -# install theme addon for to local if does not exists -# avoid docker for now before we stablize on the choice of style -python3 -m pip install --user --upgrade -q tlcpack-sphinx-addon==0.1.0 - cd docs PYTHONPATH=`pwd`/../python make html |& tee /tmp/$$.log.txt if grep -E "failed to execute|Segmentation fault" < /tmp/$$.log.txt; then diff --git a/tests/scripts/task_sphinx_precheck.sh b/tests/scripts/task_sphinx_precheck.sh index 08184d9ff333..fd67b0ab539b 100755 --- a/tests/scripts/task_sphinx_precheck.sh +++ b/tests/scripts/task_sphinx_precheck.sh @@ -33,10 +33,6 @@ rm -rf docs/vta/tutorials find . -type f -path "*.pyc" | xargs rm -f make cython3 -# install theme addon for to local if does not exists -# avoid docker for now before we stablize on the choice of style -python3 -m pip install --user --upgrade -q tlcpack-sphinx-addon==0.1.0 - echo "PreCheck sphinx doc generation WARNINGS.." cd docs make clean From 5d1a4e9e1135b63fead9b07e1e072d782952a7a6 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 5 Nov 2020 07:34:01 -0800 Subject: [PATCH 124/258] [FIX] Add task_ci_python_setup.sh to the arm CI (#6850) --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index 272abb4c4ec9..b9e191d5c80e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -275,6 +275,7 @@ stage('Unit Test') { init_git() unpack_lib('arm', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} ${ci_arm} ./tests/scripts/task_ci_python_setup.sh" sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_unittest.sh" // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh" } From eb8be762f0e18c3d729e3ca0337f09781b5072dc Mon Sep 17 00:00:00 2001 From: Gus Smith Date: Thu, 5 Nov 2020 07:46:39 -0800 Subject: [PATCH 125/258] Update SimplifyInference documentation (#6853) --- include/tvm/relay/transform.h | 5 +++-- python/tvm/relay/transform/transform.py | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index cbd6a88e584e..a9a45b5f101a 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -194,8 +194,9 @@ TVM_DLL Pass ToGraphNormalForm(); TVM_DLL Pass PartialEval(); /*! - * \brief Simplify certain operators during inference. For example, batch norm - * will be unpacked into a number of simplified operators. + * \brief Simplify certain operators during inference. For example, the result + * of a batch norm which is indexed at tuple index 0 will be unpacked into a + * number of simplified operators. * * \return The Pass. 
*/ diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index f0f55f60d0e3..4907a0bf2bd4 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -173,10 +173,14 @@ def SimplifyInference(): """Simplify the data-flow graph for inference phase. An simplified expression which is semantically equal to the input expression will be returned. + Note that batch norms will only be simplified if their result is indexed at + tuple index 0. + Returns ------- ret: tvm.transform.Pass The registered pass to perform operator simplification. + """ return _ffi_api.SimplifyInference() From 1e58427fb9c2127aa221e6f6256868d79136e03b Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 5 Nov 2020 08:14:38 -0800 Subject: [PATCH 126/258] =?UTF-8?q?[=C2=B5TVM]=20Add=20virtual=20machine,?= =?UTF-8?q?=20test=20zephyr=20runtime=20on=20real=20hardware=20(#6703)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Split transport classes into transport package. * Introduce transport timeouts. * black format * Add metadata-only artifacts * Simplify utvm rpc server API and ease handling of short packets. * add zephyr test against qemu * Add qemu build config * fix typo * cleanup zephyr main * fix nonblocking piping on some linux kernels * don't double-open transport * validate FD are in non-blocking mode * gitignore test debug files * cleanup zephyr compiler * re-comment serial until added * remove logging * add zephyr exclusions to check_file_type * add asf header * lint * black format * more pylint * kill utvm rpc_server bindings, which don't work anymore and fail pylint * fix compiler warning * fixes related to pylint * clang-format again * more black format * add qemu regression * Fix paths for qemu/ dir * fix typo * fix SETFL logic * export SessionTerminatedError and update except after moving * fix test_micro_artifact * retrigger staging CI * fix jenkins syntax hopefully * one last syntax error * Add microTVM VM setup scripts * obliterate USE_ANTLR from cmake.config * add poetry deps to pyproject.toml - mainly taken from output of `pip freeze` in ci-gpu and ci-lint * initial attempt at setup.py + autodetect libtvm_runtime SO path * hack to hardcode in build * make pyproject lock * Add ci_qemu to Jenkinsfile * build in qemu * checkpoint * create diff for jared * add missing stuff * address liangfu comments * fix new bug with list passing * release v0.0.2 * works on hardware * switch to pytest for zephyr tests * add missing import * fix option parsing * remove extraneous changes * lint * asf lint, somehow local pass didn't work * file type lint * black-format * try to fix ARMTargetParser.h #include in LLVM < 8.0 * rm misspelled deamon lines * move to apps/microtvm-vm * fetch keys from kitware server * fix path exclusions in check_file_type * retrigger CI * reorganize vm, add tutorial * fixes for reorganization - enable vagrant ssh * update ssh instructions * rm commented code * standardize reference VM release process, add prerelease test * remove -mfpu from this change * fix exit code of test_zephyr * rm unneeded files, update check_file_type * add asf header * git-black * git-black against main * git-black with docker * fixes for virtualbox * black format * install python3.8, for zephyr gdb * timestamp zephyr vm name, permits launching multiple VMs * log warning when initial vagrant destroy fails * revert changes moved into #6789 * address leandron@ comments * black format * black format 
* add --skip-build to test subcommand, detach device from other VMs * black format * address leandron@ comments * don't rm release test when building only 1 provider * revert pyproject.toml * remove need to copy pyproject.toml to root * this often contributes to erroneous changes to that file --- apps/microtvm/README.md | 28 ++ apps/microtvm/reference-vm/.gitignore | 1 + apps/microtvm/reference-vm/README.md | 67 +++ apps/microtvm/reference-vm/base-box-tool.py | 407 ++++++++++++++++++ apps/microtvm/reference-vm/zephyr/.gitignore | 1 + apps/microtvm/reference-vm/zephyr/Vagrantfile | 56 +++ .../reference-vm/zephyr/base-box/.gitignore | 4 + .../base-box/Vagrantfile.packer-template | 40 ++ .../reference-vm/zephyr/base-box/setup.sh | 102 +++++ .../reference-vm/zephyr/pyproject.toml | 140 ++++++ .../reference-vm/zephyr/rebuild-tvm.sh | 33 ++ apps/microtvm/reference-vm/zephyr/setup.sh | 41 ++ src/target/source/codegen_c_host.cc | 5 +- src/target/source/codegen_c_host.h | 2 +- tests/lint/check_file_type.py | 3 + tutorials/micro/micro_reference_vm.py | 139 ++++++ 16 files changed, 1066 insertions(+), 3 deletions(-) create mode 100644 apps/microtvm/README.md create mode 100644 apps/microtvm/reference-vm/.gitignore create mode 100644 apps/microtvm/reference-vm/README.md create mode 100755 apps/microtvm/reference-vm/base-box-tool.py create mode 100644 apps/microtvm/reference-vm/zephyr/.gitignore create mode 100644 apps/microtvm/reference-vm/zephyr/Vagrantfile create mode 100644 apps/microtvm/reference-vm/zephyr/base-box/.gitignore create mode 100644 apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template create mode 100644 apps/microtvm/reference-vm/zephyr/base-box/setup.sh create mode 100644 apps/microtvm/reference-vm/zephyr/pyproject.toml create mode 100755 apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh create mode 100644 apps/microtvm/reference-vm/zephyr/setup.sh create mode 100644 tutorials/micro/micro_reference_vm.py diff --git a/apps/microtvm/README.md b/apps/microtvm/README.md new file mode 100644 index 000000000000..97b844a4c01b --- /dev/null +++ b/apps/microtvm/README.md @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + +# microTVM Reference Virtual Machines + + +microTVM is the effort to allow TVM to build and execute models on bare-metal microcontrollers. +These Virtual Machines are used to reproduce results and bugs when using microTVM with real +physical hardware. Note that they are not used to run Continuous Integration regression tests-- +those are instead run by the QEMU container (they run against an emulator, rather than real +hardware). + + +See the "microTVM Reference Virtual Machines" tutorial for information on how to use these. diff --git a/apps/microtvm/reference-vm/.gitignore b/apps/microtvm/reference-vm/.gitignore new file mode 100644 index 000000000000..d918f5e13cc5 --- /dev/null +++ b/apps/microtvm/reference-vm/.gitignore @@ -0,0 +1 @@ +/release-test \ No newline at end of file diff --git a/apps/microtvm/reference-vm/README.md b/apps/microtvm/reference-vm/README.md new file mode 100644 index 000000000000..7ef7900c3e05 --- /dev/null +++ b/apps/microtvm/reference-vm/README.md @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + +# microTVM Reference Virtual Machines + +This directory contains Vagrant specifications that create reference Virtual Machines for use with +microTVM. These machines help microTVM users collaborate by providing a stable reference test +environment. 
+ +For more information on how to use them, see the microTVM Reference Virtual Machines tutorial. + + +## Reference VM Developer Information + +Each RTOS or platform that integrates with microTVM can check-in a Reference VM in this directory to +help the community collaborate. You should use the tools provided here to ensure a uniform release +process across all platforms. Typically, releases need to be created by TVM committers. + +Generally speaking, it's expected that any integrated platform with a regression test checked-in to +the tvm repository should also define a reference VM. If you want to integrate a new platform, +please raise a discussion on [the forum](https://discuss.tvm.ai). + +### Organization + +Reference VMs are organized as follows: + +* `base-box-tool.py` - Reference VM build, test, and release tool +* `<platform>/` +** `Vagrantfile` Vagrantfile that end-users will invoke. Should be based off a base box + which contains dependencies other than the TVM python dependencies. +** `base-box` - Top-level directory which defines the base box. +*** `Vagrantfile.packer-template` - Packer template Vagrantfile which will be used to build the + base box. +*** `test-config.json` - JSON file explaining how to perform release tests to `base-box-tool.py` + +## Creating Releases + +1. Build the base box for the given platform: `$ ./base-box-tool.py build <platform>` +2. Run release tests for each platform: + 1. Connect any needed hardware to the VM host machine. + 2. Run tests: `$ ./base-box-tool.py test <platform> [--test-device-serial=<serial>]`. This + command does the following for each provider: + 1. Copies all files inside `./<platform>` except `.vagrant` and `base-box` to + `./release-test`. This is done to avoid reusing any VM the developer may have started. + 2. Executes `$ vagrant up --provider=<provider>`. + 3. Finds an attached USB device matching the VID and PID specified in `test-config.json`, + and if `--test-device-serial` was given, that serial number (as reported to USB). Creates + a rule to autoconnect this device to the VM, and also attaches it to the VM. + 4. SSHs to the VM, `cd` to the TVM root directory, and runs `test_cmd` from + `test-config.json`. Nonzero status means failure. +3. If release tests fail, fix them and restart from step 1. +4. If release tests pass: `$ ./base-box-tool.py release <platform>`. Be sure you've logged + in to Vagrant Cloud using the `vagrant` tool. diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py new file mode 100755 index 000000000000..12aded6e63c6 --- /dev/null +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -0,0 +1,407 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
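To make the release workflow above concrete, a typical session with this tool might look like the sketch below. The `zephyr` platform name matches the directory added by this patch; the provider, USB serial number, and version string are illustrative placeholders rather than values required by the tool.

$ ./base-box-tool.py build zephyr
$ ./base-box-tool.py test zephyr --provider=virtualbox --test-device-serial=0672FF8693256BA4
$ ./base-box-tool.py release zephyr --release-version=0.0.2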
+ + +import argparse +import json +import logging +import os +import re +import shlex +import shutil +import subprocess +import sys + + +_LOG = logging.getLogger(__name__) + + +THIS_DIR = os.path.realpath(os.path.dirname(__file__) or ".") + + +# List of vagrant providers supported by this tool +ALL_PROVIDERS = ( + "parallels", + "virtualbox", +) + + +def parse_virtualbox_devices(): + output = subprocess.check_output(["VBoxManage", "list", "usbhost"], encoding="utf-8") + devices = [] + current_dev = {} + for line in output.split("\n"): + if not line.strip(): + if current_dev: + if "VendorId" in current_dev and "ProductId" in current_dev: + devices.append(current_dev) + current_dev = {} + + continue + + key, value = line.split(":", 1) + value = value.lstrip(" ") + current_dev[key] = value + + if current_dev: + devices.append(current_dev) + return devices + + +VIRTUALBOX_VID_PID_RE = re.compile(r"0x([0-9A-Fa-f]{4}).*") + + +def attach_virtualbox(uuid, vid_hex=None, pid_hex=None, serial=None): + usb_devices = parse_virtualbox_devices() + for dev in usb_devices: + m = VIRTUALBOX_VID_PID_RE.match(dev["VendorId"]) + if not m: + _LOG.warning("Malformed VendorId: %s", dev["VendorId"]) + continue + + dev_vid_hex = m.group(1).lower() + + m = VIRTUALBOX_VID_PID_RE.match(dev["ProductId"]) + if not m: + _LOG.warning("Malformed ProductId: %s", dev["ProductId"]) + continue + + dev_pid_hex = m.group(1).lower() + + if ( + vid_hex == dev_vid_hex + and pid_hex == dev_pid_hex + and (serial is None or serial == dev["SerialNumber"]) + ): + rule_args = [ + "VBoxManage", + "usbfilter", + "add", + "0", + "--action", + "hold", + "--name", + "test device", + "--target", + uuid, + "--vendorid", + vid_hex, + "--productid", + pid_hex, + ] + if serial is not None: + rule_args.extend(["--serialnumber", serial]) + subprocess.check_call(rule_args) + subprocess.check_call(["VBoxManage", "controlvm", uuid, "usbattach", dev["UUID"]]) + return + + raise Exception( + f"Device with vid={vid_hex}, pid={pid_hex}, serial={serial!r} not found:\n{usb_devices!r}" + ) + + +def attach_parallels(uuid, vid_hex=None, pid_hex=None, serial=None): + usb_devices = json.loads( + subprocess.check_output(["prlsrvctl", "usb", "list", "-j"], encoding="utf-8") + ) + for dev in usb_devices: + _, dev_vid_hex, dev_pid_hex, _, _, dev_serial = dev["System name"].split("|") + dev_vid_hex = dev_vid_hex.lower() + dev_pid_hex = dev_pid_hex.lower() + if ( + vid_hex == dev_vid_hex + and pid_hex == dev_pid_hex + and (serial is None or serial == dev_serial) + ): + subprocess.check_call(["prlsrvctl", "usb", "set", dev["Name"], uuid]) + if "Used-By-Vm-Name" in dev: + subprocess.check_call( + ["prlctl", "set", dev["Used-By-Vm-Name"], "--device-disconnect", dev["Name"]] + ) + subprocess.check_call(["prlctl", "set", uuid, "--device-connect", dev["Name"]]) + return + + raise Exception( + f"Device with vid={vid_hex}, pid={pid_hex}, serial={serial!r} not found:\n{usb_devices!r}" + ) + + +ATTACH_USB_DEVICE = { + "parallels": attach_parallels, + "virtualbox": attach_virtualbox, +} + + +def generate_packer_config(file_path, providers): + builders = [] + for provider_name in providers: + builders.append( + { + "type": "vagrant", + "output_dir": f"output-packer-{provider_name}", + "communicator": "ssh", + "source_path": "generic/ubuntu1804", + "provider": provider_name, + "template": "Vagrantfile.packer-template", + } + ) + + with open(file_path, "w") as f: + json.dump( + { + "builders": builders, + }, + f, + sort_keys=True, + indent=2, + ) + + +def build_command(args): + 
generate_packer_config( + os.path.join(THIS_DIR, args.platform, "base-box", "packer.json"), + args.provider.split(",") or ALL_PROVIDERS, + ) + subprocess.check_call( + ["packer", "build", "packer.json"], cwd=os.path.join(THIS_DIR, args.platform, "base-box") + ) + + +REQUIRED_TEST_CONFIG_KEYS = { + "vid_hex": str, + "pid_hex": str, + "test_cmd": list, +} + + +VM_BOX_RE = re.compile(r'(.*\.vm\.box) = "(.*)"') + + +# Paths, relative to the platform box directory, which will not be copied to release-test dir. +SKIP_COPY_PATHS = [".vagrant", "base-box"] + + +def do_build_release_test_vm(release_test_dir, user_box_dir, base_box_dir, provider_name): + if os.path.exists(release_test_dir): + try: + subprocess.check_call(["vagrant", "destroy", "-f"], cwd=release_test_dir) + except subprocess.CalledProcessError: + _LOG.warning("vagrant destroy failed--removing dirtree anyhow", exc_info=True) + + shutil.rmtree(release_test_dir) + + for dirpath, _, filenames in os.walk(user_box_dir): + rel_path = os.path.relpath(dirpath, user_box_dir) + if any( + rel_path == scp or rel_path.startswith(f"{scp}{os.path.sep}") for scp in SKIP_COPY_PATHS + ): + continue + + dest_dir = os.path.join(release_test_dir, rel_path) + os.makedirs(dest_dir) + for filename in filenames: + shutil.copy2(os.path.join(dirpath, filename), os.path.join(dest_dir, filename)) + + release_test_vagrantfile = os.path.join(release_test_dir, "Vagrantfile") + with open(release_test_vagrantfile) as f: + lines = list(f) + + found_box_line = False + with open(release_test_vagrantfile, "w") as f: + for line in lines: + m = VM_BOX_RE.match(line) + if not m: + f.write(line) + continue + + box_package = os.path.join( + base_box_dir, f"output-packer-{provider_name}", "package.box" + ) + f.write(f'{m.group(1)} = "{os.path.relpath(box_package, release_test_dir)}"\n') + found_box_line = True + + if not found_box_line: + _LOG.error( + "testing provider %s: couldn't find config.box.vm = line in Vagrantfile; unable to test", + provider_name, + ) + return False + + subprocess.check_call(["vagrant", "up", f"--provider={provider_name}"], cwd=release_test_dir) + + return True + + +def do_run_release_test(release_test_dir, provider_name, test_config, test_device_serial): + with open( + os.path.join(release_test_dir, ".vagrant", "machines", "default", provider_name, "id") + ) as f: + machine_uuid = f.read() + ATTACH_USB_DEVICE[provider_name]( + machine_uuid, + vid_hex=test_config["vid_hex"], + pid_hex=test_config["pid_hex"], + serial=test_device_serial, + ) + tvm_home = os.path.realpath(os.path.join(THIS_DIR, "..", "..", "..")) + + def _quote_cmd(cmd): + return " ".join(shlex.quote(a) for a in cmd) + + test_cmd = _quote_cmd(["cd", tvm_home]) + " && " + _quote_cmd(test_config["test_cmd"]) + subprocess.check_call(["vagrant", "ssh", "-c", f"bash -ec '{test_cmd}'"], cwd=release_test_dir) + + +def test_command(args): + user_box_dir = os.path.join(THIS_DIR, args.platform) + base_box_dir = os.path.join(THIS_DIR, args.platform, "base-box") + test_config_file = os.path.join(base_box_dir, "test-config.json") + with open(test_config_file) as f: + test_config = json.load(f) + for key, expected_type in REQUIRED_TEST_CONFIG_KEYS.items(): + assert key in test_config and isinstance( + test_config[key], expected_type + ), f"Expected key {key} of type {expected_type} in {test_config_file}: {test_config!r}" + + test_config["vid_hex"] = test_config["vid_hex"].lower() + test_config["pid_hex"] = test_config["pid_hex"].lower() + + providers = args.provider.split(",") + provider_passed = 
{p: False for p in providers} + + release_test_dir = os.path.join(THIS_DIR, "release-test") + + if args.skip_build: + assert len(providers) == 1, "--skip-build was given, but >1 provider specified" + + for provider_name in providers: + try: + if not args.skip_build: + do_build_release_test_vm( + release_test_dir, user_box_dir, base_box_dir, provider_name + ) + do_run_release_test( + release_test_dir, provider_name, test_config, args.test_device_serial + ) + provider_passed[provider_name] = True + + finally: + if not args.skip_build and len(providers) > 1: + subprocess.check_call(["vagrant", "destroy", "-f"], cwd=release_test_dir) + shutil.rmtree(release_test_dir) + + if not all(provider_passed[p] for p in provider_passed.keys()): + sys.exit( + "some providers failed release test: " + + ",".join(name for name, passed in provider_passed if not passed) + ) + + +def release_command(args): + # subprocess.check_call(["vagrant", "cloud", "version", "create", f"tlcpack/microtvm-{args.platform}", args.version]) + if not args.version: + sys.exit(f"--version must be specified") + + for provider_name in args.provider.split(","): + subprocess.check_call( + [ + "vagrant", + "cloud", + "publish", + "-f", + f"tlcpack/microtvm-{args.platform}", + args.version, + provider_name, + os.path.join( + THIS_DIR, + args.platform, + "base-box", + f"output-packer-{provider_name}/package.box", + ), + ] + ) + + +ALL_COMMANDS = { + "build": build_command, + "test": test_command, + "release": release_command, +} + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Automates building, testing, and releasing a base box" + ) + parser.add_argument( + "command", + default=",".join(ALL_COMMANDS), + choices=ALL_COMMANDS, + help="Action or actions (comma-separated) to perform.", + ) + parser.add_argument( + "platform", + help="Name of the platform VM to act on. Must be a sub-directory of this directory.", + ) + parser.add_argument( + "--provider", + choices=ALL_PROVIDERS, + help="Name of the provider or providers to act on; if not specified, act on all", + ) + parser.add_argument( + "--skip-build", + action="store_true", + help=( + "For use with the 'test' command. If given, assume a box has already been built in " + "the release-test subdirectory. Attach a USB device to this box and execute the " + "release test script--do not delete it." + ), + ) + parser.add_argument( + "--test-device-serial", + help=( + "If given, attach the test device with this USB serial number. Corresponds to the " + "iSerial field from `lsusb -v` output." + ), + ) + parser.add_argument( + "--release-version", + help="Version to release, in the form 'x.y.z'. 
diff --git a/apps/microtvm/reference-vm/zephyr/.gitignore b/apps/microtvm/reference-vm/zephyr/.gitignore
new file mode 100644
index 000000000000..dace7081e3f2
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/.gitignore
@@ -0,0 +1 @@
+/.vagrant
diff --git a/apps/microtvm/reference-vm/zephyr/Vagrantfile b/apps/microtvm/reference-vm/zephyr/Vagrantfile
new file mode 100644
index 000000000000..bb0c4eaac2c8
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/Vagrantfile
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+Vagrant.configure("2") do |config|
+  config.vm.box = "tlcpack/microtvm-zephyr"
+
+  tvm_home = "../../../.."
+  dirs_to_mount = [Pathname.new(Pathname.new(tvm_home).expand_path())]
+
+  git_file = Pathname.new(tvm_home + "/.git")
+  if git_file.ftype() == "file" then
+    gitdir_match = Regexp.new('^gitdir: (?<gitdir>.*/.git).*\n$', Regexp::MULTILINE).match(git_file.read())
+    if !gitdir_match.nil?
then + dirs_to_mount.append(Pathname.new(gitdir_match.named_captures["gitdir"])) + puts "NOTE: also configuring git-worktree gitdir: %s" % [dirs_to_mount[-1]] + end + end + + config.vm.provision "shell", path: "setup.sh", env: {"TVM_HOME": dirs_to_mount[0]}, privileged: false + + # Enable USB Controller on VirtualBox + vm_name = "microtvm-#{Time.now.tv_sec}" + config.vm.provider "virtualbox" do |vb, overrides| + vb.name = vm_name + vb.customize ["modifyvm", :id, "--usb", "on"] + vb.customize ["modifyvm", :id, "--usbehci", "on"] + vb.customize ["modifyvm", :id, "--usbxhci", "on"] + dirs_to_mount.each do |d| + overrides.vm.synced_folder d.to_s, d.to_s + end + end + + config.vm.provider "parallels" do |prl, overrides| + prl.name = vm_name + prl.update_guest_tools = true + prl.customize ["set", :id, "--support-usb30", "on"] + dirs_to_mount.each do |d| + overrides.vm.synced_folder d.to_s, d.to_s, mount_options: ["share", "nosuid", "host_inodes"] + end + end + +end diff --git a/apps/microtvm/reference-vm/zephyr/base-box/.gitignore b/apps/microtvm/reference-vm/zephyr/base-box/.gitignore new file mode 100644 index 000000000000..e4406c4f61e2 --- /dev/null +++ b/apps/microtvm/reference-vm/zephyr/base-box/.gitignore @@ -0,0 +1,4 @@ +*.box +.vagrant +/output-packer-* +/packer.json diff --git a/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template b/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template new file mode 100644 index 000000000000..b1fff9c63806 --- /dev/null +++ b/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +Vagrant.configure("2") do |config| + # From hashicorp default template: + # https://github.com/hashicorp/packer/blob/master/builder/vagrant/step_create_vagrantfile.go#L23-L37 + + config.vm.define "source" do |source| + source.vm.box = "{{.SourceBox}}" + config.ssh.insert_key = {{.InsertKey}} + end + + config.vm.define "output" do |output| + output.vm.box = "{{.BoxName}}" + output.vm.box_url = "file://package.box" + config.ssh.insert_key = {{.InsertKey}} + end + + {{ if ne .SyncedFolder "" -}} + config.vm.synced_folder "{{.SyncedFolder}}", "/vagrant" + {{- else -}} + config.vm.synced_folder ".", "/vagrant", disabled: true + {{- end}} + + config.vm.provision "shell", path: "../setup.sh", privileged: false +end diff --git a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh new file mode 100644 index 000000000000..a89d650995bc --- /dev/null +++ b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh @@ -0,0 +1,102 @@ +#!/bin/bash -e +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +sudo apt update +sudo apt install -y build-essential +sudo apt-get --purge remove modemmanager # required to access serial ports. + +# Zephyr +wget --no-verbose https://apt.kitware.com/keys/kitware-archive-latest.asc +sudo apt-key add kitware-archive-latest.asc +sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' +sudo apt update +sudo apt install -y --no-install-recommends git cmake ninja-build gperf \ + ccache dfu-util device-tree-compiler wget \ + python3-dev python3-pip python3-setuptools python3-tk python3-wheel xz-utils file \ + make gcc gcc-multilib g++-multilib libsdl2-dev + +# Avahi, so that ssh microtvm works. +# apt install -y avahi-daemon + +OLD_HOSTNAME=$(hostname) +sudo hostnamectl set-hostname microtvm +sudo sed -i.bak "s/${OLD_HOSTNAME}/microtvm.localdomain/g" /etc/hosts + +# Poetry deps +sudo apt install -y python3-venv + +# TVM deps +sudo apt install -y llvm + +# ONNX deps +sudo apt install -y protobuf-compiler libprotoc-dev + +# nrfjprog +cd ~ +mkdir -p nrfjprog +wget --no-verbose -O nRFCommandLineTools1090Linuxamd64.tar.gz https://www.nordicsemi.com/-/media/Software-and-other-downloads/Desktop-software/nRF-command-line-tools/sw/Versions-10-x-x/10-9-0/nRFCommandLineTools1090Linuxamd64tar.gz +cd nrfjprog +tar -xzvf ../nRFCommandLineTools1090Linuxamd64.tar.gz +sudo apt install -y ./JLink_Linux_V680a_x86_64.deb +sudo apt install -y ./nRF-Command-Line-Tools_10_9_0_Linux-amd64.deb +source ~/.profile +nrfjprog --help +cd .. +rm -rf nrfjprog nRFCommandLineTools1090Linuxamd64.tar.gz + +# Zephyr +pip3 install --user -U west +echo 'export PATH=$HOME/.local/bin:"$PATH"' >> ~/.profile +source ~/.profile +echo PATH=$PATH +west init --mr v2.4.0 ~/zephyr +cd ~/zephyr +west update +west zephyr-export + +cd ~ +echo "Downloading zephyr SDK..." +wget --no-verbose https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v0.11.3/zephyr-sdk-0.11.3-setup.run +chmod +x zephyr-sdk-0.11.3-setup.run +./zephyr-sdk-0.11.3-setup.run -- -d ~/zephyr-sdk -y +rm -rf zephyr-sdk-0.11.3-setup.run + +# GDB for Zephyr SDK depends on python3.8 +sudo add-apt-repository ppa:deadsnakes/ppa +sudo apt install -y python3.8-dev + +sudo find ~/zephyr-sdk -name '*.rules' -exec cp {} /etc/udev/rules.d \; +sudo udevadm control --reload + +# Poetry +curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python3 +sed -i "/^# If not running interactively,/ i source \$HOME/.poetry/env" ~/.bashrc +sed -i "/^# If not running interactively,/ i export ZEPHYR_BASE=$HOME/zephyr/zephyr" ~/.bashrc +sed -i "/^# If not running interactively,/ i\\ " ~/.bashrc + +# Clean box for packaging as a base box +sudo apt-get clean +EMPTY_FILE="$HOME/EMPTY" +dd if=/dev/zero "of=${EMPTY_FILE}" bs=1M || /bin/true +if [ ! 
-e "${EMPTY_FILE}" ]; then + echo "failed to zero empty sectors on disk" + exit 2 +fi +rm -f "${EMPTY_FILE}" diff --git a/apps/microtvm/reference-vm/zephyr/pyproject.toml b/apps/microtvm/reference-vm/zephyr/pyproject.toml new file mode 100644 index 000000000000..d273b25eb3cd --- /dev/null +++ b/apps/microtvm/reference-vm/zephyr/pyproject.toml @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[tool.black] +line-length = 100 +target-version = ['py36'] +include = '(\.pyi?$)' +exclude = ''' + +( + /( + \.github + | \.tvm + | \.tvm_test_data + | \.vscode + | \.venv + | 3rdparty + | build\/ + | cmake\/ + | conda\/ + | docker\/ + | docs\/ + | golang\/ + | include\/ + | jvm\/ + | licenses\/ + | nnvm\/ + | rust\/ + | src\/ + | vta\/ + | web\/ + )/ +) +''' +[tool.poetry] +name = "incubator-tvm" +version = "0.1.0" +description = "" +authors = ["Your Name "] +packages = [ + { include = "tvm", from = "../../../../python" }, +] + +[tool.poetry.dependencies] +attrs = "^19" +decorator = "^4.4" +numpy = "~1.19" +psutil = "^5" +scipy = "^1.4" +python = "^3.6" +tornado = "^6" +typed_ast = "^1.4" + +# AutoTVM +xgboost = {version = "^1.1", optional = true} + +############# +# Importers # +############# + +# NOTE: Caffe frontend dependency is from torch package. + +# CoreML +coremltools = {version = "^3.3", optional = true} + +# Darknet +opencv-python = {version = "^4.2", optional = true} +cffi = {version = "^1.14", optional = true} + +# NOTE: Keras provided by tensorflow package. +# If TF version conflict, maybe try: keras = "2.3.1" + +# MXNet frontend +mxnet = {version = "^1.6.0", optional = true} + +# ONNX frontend +onnx = {version = "1.6.0", optional = true} +onnxruntime = {version = "1.0.0", optional = true} + +# Pytorch (also used by ONNX) +torch = {version = "1.4.0", optional = true} +torchvision = {version = "0.5.0", optional = true} +# NOTE: torch depends on a number of other packages, but unhelpfully, does not expose that in the +# wheel!!! 
+future = {version = "*", optional = true}
+
+# Tensorflow frontend
+tensorflow = {version = "^2.1", optional = true}
+tensorflow-estimator = {version = "^2.1", optional = true}
+
+# TFLite frontend
+tflite = {version = "2.1.0", optional = true}
+wheel = "*"
+
+
+[tool.poetry.extras]
+xgboost = ["xgboost"]
+importer-caffe2 = ["torch"]
+importer-coreml = ["coremltools"]
+importer-darknet = ["opencv-python"]
+importer-keras = ["tensorflow", "tensorflow-estimator"]
+importer-onnx = ["onnx", "onnxruntime", "torch", "torchvision", "future"]
+importer-pytorch = ["torch", "torchvision", "future"]
+importer-tensorflow = ["tensorflow", "tensorflow-estimator"]
+importer-tflite = ["tflite", "tensorflow", "tensorflow-estimator"]
+
+[tool.poetry.dev-dependencies]
+autodocsumm = "^0.1"
+black = "^19.10b0"
+sphinx = "^3.0"
+sphinx-gallery = "^0.4"
+sphinx-rtd-theme = "^0.4"
+matplotlib = "^3.2"
+Image = "^1.5"
+recommonmark = "^0.6"
+pillow = "< 7"
+pyformat = "^0.7"
+pylint = "^2.4"
+pytest = "^5.4"
+
+[build-system]
+requires = ["poetry>=0.12"]
+build-backend = "poetry.masonry.api"
+
+[tool.autopep8]
+max_line_length = 100
diff --git a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh
new file mode 100755
index 000000000000..9442947438a9
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh
@@ -0,0 +1,33 @@
+#!/bin/bash -e
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+cd "$(dirname $0)"
+cd "$(git rev-parse --show-toplevel)"
+BUILD_DIR=build-microtvm
+
+if [ ! -e "${BUILD_DIR}" ]; then
+    mkdir "${BUILD_DIR}"
+fi
+cp cmake/config.cmake "${BUILD_DIR}"
+cd "${BUILD_DIR}"
+sed -i 's/USE_MICRO OFF/USE_MICRO ON/' config.cmake
+sed -i 's/USE_LLVM OFF/USE_LLVM ON/' config.cmake
+cmake ..
+make -j4
diff --git a/apps/microtvm/reference-vm/zephyr/setup.sh b/apps/microtvm/reference-vm/zephyr/setup.sh
new file mode 100644
index 000000000000..6e87c1fa4eb9
--- /dev/null
+++ b/apps/microtvm/reference-vm/zephyr/setup.sh
@@ -0,0 +1,41 @@
+#!/bin/bash -e
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+# TVM
+# NOTE: TVM is presumed to be mounted already by Vagrantfile.
+cd "${TVM_HOME}"
+
+apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh
+
+cd apps/microtvm/reference-vm/zephyr
+
+echo "------------------------------[ TVM Message ]------------------------------"
+echo "WARNING: running 'poetry lock', which could take several minutes (depending"
+echo "on your network connection and the state of PyPI) as dependencies are"
+echo "downloaded and cached for future use."
+echo "------------------------------[ TVM Message ]------------------------------"
+
+poetry lock
+poetry install
+poetry run pip3 install -r ~/zephyr/zephyr/scripts/requirements.txt
+
+echo "export TVM_LIBRARY_PATH=\"$TVM_HOME\"/build-microtvm" >>~/.profile
+echo "VENV_PATH=\$((cd \"$TVM_HOME\"/apps/microtvm/reference-vm/zephyr && poetry env list --full-path) | sed -E 's/^(.*)[[:space:]]\(Activated\)\$/\1/g')" >>~/.profile
+echo "source \$VENV_PATH/bin/activate" >>~/.profile
diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc
index 310dab41215b..6ae11f4f9af8 100644
--- a/src/target/source/codegen_c_host.cc
+++ b/src/target/source/codegen_c_host.cc
@@ -37,9 +37,10 @@ namespace codegen {
 
 CodeGenCHost::CodeGenCHost() { module_name_ = GetUniqueName("__tvm_module_ctx"); }
 
-void CodeGenCHost::Init(bool output_ssa, bool emit_asserts) {
+void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_str) {
   emit_asserts_ = emit_asserts;
   declared_globals_.clear();
+  decl_stream << "// tvm target: " << target_str << "\n";
   decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n";
   decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n";
   decl_stream << "#include <math.h>\n";
@@ -304,7 +305,7 @@ runtime::Module BuildCHost(IRModule mod, Target target) {
   bool output_ssa = false;
   bool emit_asserts = false;
   CodeGenCHost cg;
-  cg.Init(output_ssa, emit_asserts);
+  cg.Init(output_ssa, emit_asserts, target->str());
 
   for (auto kv : mod->functions) {
     ICHECK(kv.second->IsInstance<PrimFuncNode>()) << "CodegenCHost: Can only take PrimFunc";
diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h
index 66ac4ddd99d3..1bf378be1422 100644
--- a/src/target/source/codegen_c_host.h
+++ b/src/target/source/codegen_c_host.h
@@ -38,7 +38,7 @@ namespace codegen {
 class CodeGenCHost final : public CodeGenC {
  public:
   CodeGenCHost();
-  void Init(bool output_ssa, bool emit_asserts);
+  void Init(bool output_ssa, bool emit_asserts, std::string target_str);
 
   void AddFunction(const PrimFunc& f);
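The two hunks above thread the target string through to the emitted C source, so every
generated file now opens with a ``// tvm target: ...`` banner comment. A small way to observe
this from Python (a sketch; the trivial compute and the exact banner contents are assumptions
that depend on your target and TVM build):

    import tvm
    from tvm import te

    # Build a one-op schedule to the C source target and inspect the header.
    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    mod = tvm.build(te.create_schedule(B.op), [A, B], target="c")
    print(mod.get_source().splitlines()[0])  # expected: "// tvm target: c ..."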
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +=================================== +microTVM Reference Virtual Machines +=================================== +**Author**: `Andrew Reusch `_ + +This tutorial explains how to launch microTVM Reference Virtual Machines. You can use these to +develop on real physical hardware without needing to individually install the microTVM +dependencies. These are also particularly useful when trying to reproduce behavior with +microTVM, such as when filing bug reports. + +microTVM is the effort to allow TVM to build and execute models on bare-metal microcontrollers. +microTVM aims to be compatible with a wide variety of SoCs and runtime environments (i.e. bare metal, +RTOS, etc). However, some stable software environment is needed to allow developers to share and +reproduce bugs and results. The microTVM Reference Virtual Machines are intended to provide that +environment. + +How it works +============ + +No Virtual Machines are stored in the TVM repository--instead, the files stored in +``apps/microtvm/reference-vm`` describe how to build VMs to the Vagrant_ VM builder tool. + +The Reference VMs are split into two parts: + +1. A Vagrant Base Box, which contains all of the stable dependencies for that platform. Build + scripts are stored in ``apps/microtvm/reference-vm//base-box``. TVM committers run + these when a platform's "stable" dependencies change, and the generated base boxes are stored in + `Vagrant Cloud`_. +2. A per-workspace VM, which users normally build using the Base Box as a starting point. Build + scripts are stored in ``apps/microtvm/reference-vm/`` (everything except ``base-box``). + +.. _Vagrant: https://vagrantup.com +.. _Vagrant Cloud: https://app.vagrantup.com/tlcpack + +Setting up the VM +================= + +Installing prerequisites +------------------------ + +A minimal set of prerequisites are needed: + + +1. `Vagrant `__ +2. A supported Virtual Machine hypervisor. + `VirtualBox `__ is one suggested free hypervisor, but please note + that the `VirtualBox Extension Pack`_ is required for proper USB forwarding. + +.. _VirtualBox Extension Pack: https://www.virtualbox.org/wiki/Downloads#VirtualBox6.1.16OracleVMVirtualBoxExtensionPack + +First boot +---------- + +The first time you use a reference VM, you need to create the box locally and then provision it. + +.. code-block:: bash + + ~/.../tvm $ cd apps/microtvm-vm + # Replace with the name of the hypervisor you wish to use (i.e. virtualbox). + ~/.../tvm/apps/microtvm/vm $ vagrant up --provider= + + +This command will take a couple of minutes to run and will require 4 to 5GB of storage on your +machine. It does the following: + +1. Downloads the `microTVM base box`_ and clones it to form a new VM specific to this TVM directory. +2. Mounts your TVM directory (and, if using ``git-subtree``, the original ``.git`` repo) into the + VM. +3. 
+
+.. _microTVM base box: https://app.vagrantup.com/tlcpack/boxes/microtvm
+
+
+Next, you need to configure USB passthrough to attach your physical development board to the
+virtual machine (rather than directly to your laptop's host OS).
+
+It's suggested you set up a device filter, rather than doing a one-time forward, because often
+the device may reboot during the programming process and you may, at that time, need to enable
+forwarding again. It may not be obvious to the end user when this occurs. Instructions to do
+that:
+
+ * `VirtualBox `__
+ * `Parallels `__
+ * `VMWare Workstation `__
+
+Future use
+----------
+
+After the first boot, you'll need to ensure you keep the build, in ``$TVM_HOME/build-microtvm``,
+up-to-date when you modify the C++ runtime or checkout a different revision. You can either
+re-provision the machine (``vagrant provision`` in the same directory you ran ``vagrant up``
+before) or manually rebuild TVM yourself.
+
+Remember: the TVM ``.so`` built inside the VM is different from the one you may use on your host
+machine. This is why it's built inside the special directory ``build-microtvm``.
+
+Logging in to the VM
+--------------------
+
+The VM should be available to your host only with the hostname ``microtvm``. You can SSH to the
+VM as follows:
+
+.. code-block:: bash
+
+    $ vagrant ssh
+
+Then ``cd`` to the same path used on your host machine for TVM. For example, on Mac:
+
+.. code-block:: bash
+
+    $ cd /Users/yourusername/path/to/tvm
+
+Running tests
+=============
+
+Once the VM has been provisioned, tests can be executed using ``poetry``:
+
+.. code-block:: bash
+
+    $ poetry run python3 tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx
+
+"""

From 0178a8b9f2e3f135c995356a320ece612cbb8f93 Mon Sep 17 00:00:00 2001
From: Jared Roesch
Date: Thu, 5 Nov 2020 11:57:53 -0800
Subject: [PATCH 127/258] [Rust][IRModule] Flesh out IRModule methods (#6741)

* WIP

* WIP

* WIP

* WIP

* Disable WASM and fix rebase

* Work on finishing tests

* Make entire object system printable

* Write some more tests for IRModule

* All tests pass

* Format

* Restore module.cc

* Bump syn
---
 rust/Cargo.toml                                |   1 -
 .../tvm-graph-rt/tests/test_wasm32/Cargo.toml  |   2 +-
 rust/tvm-macros/Cargo.toml                     |   2 +-
 rust/tvm-macros/src/external.rs                |  51 ++-
 rust/tvm-macros/src/lib.rs                     |   5 +-
 rust/tvm-macros/src/object.rs                  |  32 +-
 rust/tvm-rt/src/array.rs                       |  15 +-
 rust/tvm-rt/src/map.rs                         |   2 -
 rust/tvm-rt/src/ndarray.rs                     |   2 +-
 rust/tvm-rt/src/object/mod.rs                  |  12 +-
 rust/tvm-rt/src/object/object_ptr.rs           |  40 ++-
 rust/tvm-rt/src/string.rs                      |   3 +-
 rust/tvm-rt/src/value.rs                       |   1 -
 rust/tvm-sys/src/datatype.rs                   |   4 +
 rust/tvm/src/ir/arith.rs                       |   2 +-
 rust/tvm/src/ir/attrs.rs                       |   2 +-
 rust/tvm/src/ir/diagnostics/mod.rs             |   7 +-
 rust/tvm/src/ir/expr.rs                        |  14 +-
 rust/tvm/src/ir/function.rs                    |   2 +-
 rust/tvm/src/ir/module.rs                      | 322 +++++++++++++++---
 rust/tvm/src/ir/op.rs                          |   2 +-
 rust/tvm/src/ir/relay/attrs/nn.rs              |  14 +-
 rust/tvm/src/ir/relay/attrs/transform.rs       |   2 +-
 rust/tvm/src/ir/relay/mod.rs                   |  97 +++---
 rust/tvm/src/ir/source_map.rs                  |   4 +-
 rust/tvm/src/ir/span.rs                        |   4 +-
 rust/tvm/src/ir/tir.rs                         |  16 +-
 rust/tvm/src/ir/ty.rs                          | 125 +++++--
 rust/tvm/src/transform.rs                      |   2 +-
 src/ir/module.cc                               |   3 +
 tests/scripts/task_rust.sh                     |   2 +-
 31 files changed, 599 insertions(+), 193 deletions(-)

diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index 7c092d860b50..e75150859f90 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -27,7 +27,6
@@ members = [ "tvm-graph-rt", "tvm-graph-rt/tests/test_tvm_basic", "tvm-graph-rt/tests/test_tvm_dso", - "tvm-graph-rt/tests/test_wasm32", "tvm-graph-rt/tests/test_nn", "compiler-ext", ] diff --git a/rust/tvm-graph-rt/tests/test_wasm32/Cargo.toml b/rust/tvm-graph-rt/tests/test_wasm32/Cargo.toml index aed467f1235d..02e77d106f28 100644 --- a/rust/tvm-graph-rt/tests/test_wasm32/Cargo.toml +++ b/rust/tvm-graph-rt/tests/test_wasm32/Cargo.toml @@ -23,7 +23,7 @@ authors = ["TVM Contributors"] edition = "2018" [dependencies] -ndarray="0.12" +ndarray = "0.12" tvm-graph-rt = { path = "../../" } [build-dependencies] diff --git a/rust/tvm-macros/Cargo.toml b/rust/tvm-macros/Cargo.toml index 63b84727c525..e491177d8599 100644 --- a/rust/tvm-macros/Cargo.toml +++ b/rust/tvm-macros/Cargo.toml @@ -33,5 +33,5 @@ proc-macro = true goblin = "^0.2" proc-macro2 = "^1.0" quote = "^1.0" -syn = { version = "1.0.17", features = ["full", "extra-traits"] } +syn = { version = "1.0.48", features = ["full", "parsing", "extra-traits"] } proc-macro-error = "^1.0" diff --git a/rust/tvm-macros/src/external.rs b/rust/tvm-macros/src/external.rs index 802d7aeb6779..146f9d4d6bc6 100644 --- a/rust/tvm-macros/src/external.rs +++ b/rust/tvm-macros/src/external.rs @@ -17,12 +17,35 @@ * under the License. */ use proc_macro2::Span; +use proc_macro_error::abort; use quote::quote; use syn::parse::{Parse, ParseStream, Result}; -use syn::{FnArg, Generics, Ident, Lit, Meta, NestedMeta, Pat, ReturnType, TraitItemMethod, Type}; +use syn::{ + token::Semi, Attribute, FnArg, Generics, Ident, Lit, Meta, NestedMeta, Pat, ReturnType, + Signature, Type, Visibility, +}; + +struct ExternalItem { + attrs: Vec, + visibility: Visibility, + sig: Signature, +} + +impl Parse for ExternalItem { + fn parse(input: ParseStream) -> Result { + let item = ExternalItem { + attrs: input.call(Attribute::parse_outer)?, + visibility: input.parse()?, + sig: input.parse()?, + }; + let _semi: Semi = input.parse()?; + Ok(item) + } +} struct External { + visibility: Visibility, tvm_name: String, ident: Ident, generics: Generics, @@ -32,7 +55,8 @@ struct External { impl Parse for External { fn parse(input: ParseStream) -> Result { - let method: TraitItemMethod = input.parse()?; + let method: ExternalItem = input.parse()?; + let visibility = method.visibility; assert_eq!(method.attrs.len(), 1); let sig = method.sig; let tvm_name = method.attrs[0].parse_meta()?; @@ -47,8 +71,7 @@ impl Parse for External { } _ => panic!(), }; - assert_eq!(method.default, None); - assert!(method.semi_token != None); + let ident = sig.ident; let generics = sig.generics; let inputs = sig @@ -60,6 +83,7 @@ impl Parse for External { let ret_type = sig.output; Ok(External { + visibility, tvm_name, ident, generics, @@ -98,6 +122,7 @@ pub fn macro_impl(input: proc_macro::TokenStream) -> proc_macro::TokenStream { let mut items = Vec::new(); for external in &ext_input.externs { + let visibility = &external.visibility; let name = &external.ident; let global_name = format!("global_{}", external.ident); let global_name = Ident::new(&global_name, Span::call_site()); @@ -109,7 +134,9 @@ pub fn macro_impl(input: proc_macro::TokenStream) -> proc_macro::TokenStream { .iter() .map(|ty_param| match ty_param { syn::GenericParam::Type(param) => param.clone(), - _ => panic!(), + _ => abort! { ty_param, + "Only supports type parameters." 
+ }, }) .collect(); @@ -124,15 +151,21 @@ pub fn macro_impl(input: proc_macro::TokenStream) -> proc_macro::TokenStream { let ty: Type = *pat_type.ty.clone(); (ident, ty) } - _ => panic!(), + _ => abort! { pat_type, + "Only supports type parameters." + }, + }, + pat => abort! { + pat, "invalid pattern type for function"; + + note = "{:?} is not allowed here", pat; }, - _ => panic!(), }) .unzip(); let ret_type = match &external.ret_type { ReturnType::Type(_, rtype) => *rtype.clone(), - _ => panic!(), + ReturnType::Default => syn::parse_str::("()").unwrap(), }; let global = quote! { @@ -147,7 +180,7 @@ pub fn macro_impl(input: proc_macro::TokenStream) -> proc_macro::TokenStream { items.push(global); let wrapper = quote! { - pub fn #name<#(#ty_params),*>(#(#args : #tys),*) -> #result_type<#ret_type> { + #visibility fn #name<#(#ty_params),*>(#(#args : #tys),*) -> #result_type<#ret_type> { let func_ref: #tvm_rt_crate::Function = #global_name.clone(); let func_ref: Box #result_type<#ret_type>> = func_ref.into(); let res: #ret_type = func_ref(#(#args),*)?; diff --git a/rust/tvm-macros/src/lib.rs b/rust/tvm-macros/src/lib.rs index 603e1ceaafcc..e563a57f149e 100644 --- a/rust/tvm-macros/src/lib.rs +++ b/rust/tvm-macros/src/lib.rs @@ -18,6 +18,7 @@ */ use proc_macro::TokenStream; +use proc_macro_error::proc_macro_error; mod external; mod import_module; @@ -29,12 +30,14 @@ pub fn import_module(input: TokenStream) -> TokenStream { import_module::macro_impl(input) } -#[proc_macro_derive(Object, attributes(base, ref_name, type_key))] +#[proc_macro_error] +#[proc_macro_derive(Object, attributes(base, ref_name, type_key, no_derive))] pub fn macro_impl(input: TokenStream) -> TokenStream { // let input = proc_macro2::TokenStream::from(input); TokenStream::from(object::macro_impl(input)) } +#[proc_macro_error] #[proc_macro] pub fn external(input: TokenStream) -> TokenStream { external::macro_impl(input) diff --git a/rust/tvm-macros/src/object.rs b/rust/tvm-macros/src/object.rs index ff72d6a649be..c84d0aab612f 100644 --- a/rust/tvm-macros/src/object.rs +++ b/rust/tvm-macros/src/object.rs @@ -36,6 +36,10 @@ pub fn macro_impl(input: proc_macro::TokenStream) -> TokenStream { .map(attr_to_str) .expect("Failed to get type_key"); + let derive = get_attr(&derive_input, "no_derive") + .map(|_| false) + .unwrap_or(true); + let ref_id = get_attr(&derive_input, "ref_name") .map(|a| Ident::new(attr_to_str(a).value().as_str(), Span::call_site())) .unwrap_or_else(|| { @@ -75,6 +79,12 @@ pub fn macro_impl(input: proc_macro::TokenStream) -> TokenStream { _ => panic!("derive only works for structs"), }; + let ref_derives = if derive { + quote! { #[derive(Debug, Clone)]} + } else { + quote! { #[derive(Clone)] } + }; + let mut expanded = quote! { unsafe impl #tvm_rt_crate::object::IsObject for #payload_id { const TYPE_KEY: &'static str = #type_key; @@ -87,7 +97,7 @@ pub fn macro_impl(input: proc_macro::TokenStream) -> TokenStream { } } - #[derive(Clone)] + #ref_derives pub struct #ref_id(Option<#tvm_rt_crate::object::ObjectPtr<#payload_id>>); impl #tvm_rt_crate::object::IsObjectRef for #ref_id { @@ -185,5 +195,25 @@ pub fn macro_impl(input: proc_macro::TokenStream) -> TokenStream { expanded.extend(base_tokens); + if derive { + let derives = quote! 
{
+            impl std::hash::Hash for #ref_id {
+                fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+                    self.0.hash(state)
+                }
+            }
+
+            impl std::cmp::PartialEq for #ref_id {
+                fn eq(&self, other: &Self) -> bool {
+                    self.0 == other.0
+                }
+            }
+
+            impl std::cmp::Eq for #ref_id {}
+        };
+
+        expanded.extend(derives);
+    }
+
     TokenStream::from(expanded)
 }
diff --git a/rust/tvm-rt/src/array.rs b/rust/tvm-rt/src/array.rs
index 98414f9c5b34..1b0ce8399d1f 100644
--- a/rust/tvm-rt/src/array.rs
+++ b/rust/tvm-rt/src/array.rs
@@ -18,7 +18,7 @@
  */
 
 use std::convert::{TryFrom, TryInto};
-use std::iter::{IntoIterator, Iterator};
+use std::iter::{FromIterator, IntoIterator, Iterator};
 use std::marker::PhantomData;
 
 use crate::errors::Error;
@@ -82,6 +82,13 @@ impl<T: IsObjectRef> Array<T> {
     }
 }
 
+impl<T: IsObjectRef> std::fmt::Debug for Array<T> {
+    fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let as_vec: Vec<T> = self.clone().into_iter().collect();
+        write!(formatter, "{:?}", as_vec)
+    }
+}
+
 pub struct IntoIter<T: IsObjectRef> {
     array: Array<T>,
     pos: isize,
@@ -118,6 +125,12 @@ impl<T: IsObjectRef> IntoIterator for Array<T> {
     }
 }
 
+impl<T: IsObjectRef> FromIterator<T> for Array<T> {
+    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
+        Array::from_vec(iter.into_iter().collect()).unwrap()
+    }
+}
+
 impl<T: IsObjectRef> From<Array<T>> for ArgValue<'static> {
     fn from(array: Array<T>) -> ArgValue<'static> {
         array.object.into()
diff --git a/rust/tvm-rt/src/map.rs b/rust/tvm-rt/src/map.rs
index 721fb1ec4588..b8bfb4e5e644 100644
--- a/rust/tvm-rt/src/map.rs
+++ b/rust/tvm-rt/src/map.rs
@@ -48,8 +48,6 @@ where
 // TODO(@jroesch): convert to use generics instead of casting inside
 // the implementation.
 external! {
-    #[name("node.ArrayGetItem")]
-    fn array_get_item(array: ObjectRef, index: isize) -> ObjectRef;
     #[name("node.MapSize")]
     fn map_size(map: ObjectRef) -> i64;
     #[name("node.MapGetItem")]
diff --git a/rust/tvm-rt/src/ndarray.rs b/rust/tvm-rt/src/ndarray.rs
index ed280ccc2d80..07f783f0ef43 100644
--- a/rust/tvm-rt/src/ndarray.rs
+++ b/rust/tvm-rt/src/ndarray.rs
@@ -65,7 +65,7 @@ use crate::object::{Object, ObjectPtr};
 
 /// See the [`module-level documentation`](../ndarray/index.html) for more details.
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "NDArray"]
 #[type_key = "runtime.NDArray"]
 pub struct NDArrayContainer {
diff --git a/rust/tvm-rt/src/object/mod.rs b/rust/tvm-rt/src/object/mod.rs
index 46e034232a63..8c07ed9f0853 100644
--- a/rust/tvm-rt/src/object/mod.rs
+++ b/rust/tvm-rt/src/object/mod.rs
@@ -40,6 +40,7 @@ pub trait IsObjectRef:
     + TryFrom<RetValue, Error = Error>
     + for<'a> Into<ArgValue<'a>>
     + for<'a> TryFrom<ArgValue<'a>, Error = Error>
+    + std::fmt::Debug
 {
     type Object: IsObject;
     fn as_ptr(&self) -> Option<&ObjectPtr<Self::Object>>;
@@ -88,14 +89,9 @@ pub trait IsObjectRef:
 
 external! {
     #[name("ir.DebugPrint")]
-    fn debug_print(object: ObjectRef) -> CString;
+    pub fn debug_print(object: ObjectRef) -> CString;
     #[name("node.StructuralHash")]
-    fn structural_hash(object: ObjectRef, map_free_vars: bool) -> ObjectRef;
+    fn structural_hash(object: ObjectRef, map_free_vars: bool) -> i64;
     #[name("node.StructuralEqual")]
-    fn structural_equal(lhs: ObjectRef, rhs: ObjectRef, assert_mode: bool, map_free_vars: bool) -> ObjectRef;
+    fn structural_equal(lhs: ObjectRef, rhs: ObjectRef, assert_mode: bool, map_free_vars: bool) -> bool;
 }
-
-// external!
{ -// #[name("ir.TextPrinter")] -// fn as_text(object: ObjectRef) -> CString; -// } diff --git a/rust/tvm-rt/src/object/object_ptr.rs b/rust/tvm-rt/src/object/object_ptr.rs index 8d535368c352..8df6041956b8 100644 --- a/rust/tvm-rt/src/object/object_ptr.rs +++ b/rust/tvm-rt/src/object/object_ptr.rs @@ -19,6 +19,7 @@ use std::convert::TryFrom; use std::ffi::CString; +use std::fmt; use std::ptr::NonNull; use std::sync::atomic::AtomicI32; @@ -147,6 +148,18 @@ impl Object { } } +// impl fmt::Debug for Object { +// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +// let index = +// format!("{} // key: {}", self.type_index, "the_key"); + +// f.debug_struct("Object") +// .field("type_index", &index) +// // TODO(@jroesch: do we expose other fields?) +// .finish() +// } +// } + /// An unsafe trait which should be implemented for an object /// subtype. /// @@ -154,7 +167,7 @@ impl Object { /// index, a method for accessing the base object given the /// subtype, and a typed delete method which is specialized /// to the subtype. -pub unsafe trait IsObject: AsRef { +pub unsafe trait IsObject: AsRef + std::fmt::Debug { const TYPE_KEY: &'static str; unsafe extern "C" fn typed_delete(object: *mut Self) { @@ -264,6 +277,13 @@ impl std::ops::Deref for ObjectPtr { } } +impl fmt::Debug for ObjectPtr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use std::ops::Deref; + write!(f, "{:?}", self.deref()) + } +} + impl<'a, T: IsObject> From> for RetValue { fn from(object_ptr: ObjectPtr) -> RetValue { let raw_object_ptr = ObjectPtr::leak(object_ptr) as *mut T as *mut std::ffi::c_void; @@ -342,6 +362,24 @@ impl<'a, T: IsObject> TryFrom> for ObjectPtr { } } +impl std::hash::Hash for ObjectPtr { + fn hash(&self, state: &mut H) { + state.write_i64( + super::structural_hash(ObjectRef(Some(self.clone().upcast())), false).unwrap(), + ) + } +} + +impl PartialEq for ObjectPtr { + fn eq(&self, other: &Self) -> bool { + let lhs = ObjectRef(Some(self.clone().upcast())); + let rhs = ObjectRef(Some(other.clone().upcast())); + super::structural_equal(lhs, rhs, false, false).unwrap() + } +} + +impl Eq for ObjectPtr {} + #[cfg(test)] mod tests { use super::{Object, ObjectPtr}; diff --git a/rust/tvm-rt/src/string.rs b/rust/tvm-rt/src/string.rs index 3cd33a226d44..e61afaf7399b 100644 --- a/rust/tvm-rt/src/string.rs +++ b/rust/tvm-rt/src/string.rs @@ -25,9 +25,10 @@ use super::Object; use tvm_macros::Object; #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "String"] #[type_key = "runtime.String"] +#[no_derive] pub struct StringObj { base: Object, data: *const u8, diff --git a/rust/tvm-rt/src/value.rs b/rust/tvm-rt/src/value.rs index c49944dc7e33..b8cd190176c4 100644 --- a/rust/tvm-rt/src/value.rs +++ b/rust/tvm-rt/src/value.rs @@ -22,7 +22,6 @@ //! `RetValue` is the owned version of `TVMPODValue`. 
use std::convert::TryFrom; -// use std::ffi::c_void; use crate::{ArgValue, Module, RetValue}; use tvm_sys::{errors::ValueDowncastError, ffi::TVMModuleHandle, try_downcast}; diff --git a/rust/tvm-sys/src/datatype.rs b/rust/tvm-sys/src/datatype.rs index 8050d932e5c1..5f7e0c3a3b60 100644 --- a/rust/tvm-sys/src/datatype.rs +++ b/rust/tvm-sys/src/datatype.rs @@ -83,6 +83,10 @@ impl DataType { DataType::new(DL_FLOAT_CODE, bits, lanes) } + pub const fn float32() -> DataType { + Self::float(32, 1) + } + pub const fn uint(bits: u8, lanes: u16) -> DataType { DataType::new(DL_UINT_CODE, bits, lanes) } diff --git a/rust/tvm/src/ir/arith.rs b/rust/tvm/src/ir/arith.rs index 92a1de69ff78..672e6e6113a0 100644 --- a/rust/tvm/src/ir/arith.rs +++ b/rust/tvm/src/ir/arith.rs @@ -24,7 +24,7 @@ use tvm_macros::Object; macro_rules! define_node { ($name:ident, $ref:expr, $typekey:expr; $node:ident { $($id:ident : $t:ty),*}) => { #[repr(C)] - #[derive(Object)] + #[derive(Object, Debug)] #[ref_name = $ref] #[type_key = $typekey] pub struct $node { diff --git a/rust/tvm/src/ir/attrs.rs b/rust/tvm/src/ir/attrs.rs index 5bd027ab4b4c..739ed405c906 100644 --- a/rust/tvm/src/ir/attrs.rs +++ b/rust/tvm/src/ir/attrs.rs @@ -21,7 +21,7 @@ use crate::runtime::Object; use tvm_macros::Object; #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Attrs"] #[type_key = "Attrs"] pub struct BaseAttrsNode { diff --git a/rust/tvm/src/ir/diagnostics/mod.rs b/rust/tvm/src/ir/diagnostics/mod.rs index 051bb9eb16c4..8bcdf8f51e60 100644 --- a/rust/tvm/src/ir/diagnostics/mod.rs +++ b/rust/tvm/src/ir/diagnostics/mod.rs @@ -59,6 +59,7 @@ external! { /// The diagnostic level, controls the printing of the message. #[repr(C)] +#[derive(PartialEq, Eq, Debug)] pub enum DiagnosticLevel { Bug = 10, Error = 20, @@ -69,7 +70,7 @@ pub enum DiagnosticLevel { /// A compiler diagnostic. #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Diagnostic"] #[type_key = "Diagnostic"] pub struct DiagnosticNode { @@ -145,7 +146,7 @@ impl DiagnosticBuilder { /// of compiler diagnostics to std::out and std::err in /// a human readable form. #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "DiagnosticRenderer"] #[type_key = "DiagnosticRenderer"] /// A diagnostic renderer, which given a diagnostic context produces a "rendered" @@ -166,7 +167,7 @@ impl DiagnosticRenderer { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "DiagnosticContext"] #[type_key = "DiagnosticContext"] /// A diagnostic context for recording errors against a source file. diff --git a/rust/tvm/src/ir/expr.rs b/rust/tvm/src/ir/expr.rs index f74522d91c70..653169def3a4 100644 --- a/rust/tvm/src/ir/expr.rs +++ b/rust/tvm/src/ir/expr.rs @@ -17,15 +17,17 @@ * under the License. 
*/ -use super::relay; +use tvm_macros::Object; + use crate::runtime::String as TString; use crate::runtime::{self, external, IsObject, IsObjectRef, Object, ObjectPtr, ObjectRef}; use crate::DataType; -use tvm_macros::Object; +use super::relay; +use super::span::Span; #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "BaseExpr"] #[type_key = "Expr"] pub struct BaseExprNode { @@ -41,7 +43,7 @@ impl BaseExprNode { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "PrimExpr"] #[type_key = "PrimExpr"] pub struct PrimExprNode { @@ -59,7 +61,7 @@ impl PrimExprNode { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "GlobalVar"] #[type_key = "GlobalVar"] pub struct GlobalVarNode { @@ -68,7 +70,7 @@ pub struct GlobalVarNode { } impl GlobalVar { - pub fn new(name_hint: String, _span: ObjectRef) -> GlobalVar { + pub fn new(name_hint: String, _span: Span) -> GlobalVar { let node = GlobalVarNode { base: relay::ExprNode::base::(), name_hint: name_hint.into(), diff --git a/rust/tvm/src/ir/function.rs b/rust/tvm/src/ir/function.rs index 3043bf9e7cff..14c00ea02bf6 100644 --- a/rust/tvm/src/ir/function.rs +++ b/rust/tvm/src/ir/function.rs @@ -28,7 +28,7 @@ use tvm_macros::Object; pub type DictAttrs = ObjectRef; #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "BaseFunc"] #[type_key = "BaseFunc"] pub struct BaseFuncNode { diff --git a/rust/tvm/src/ir/module.rs b/rust/tvm/src/ir/module.rs index 190b477b98f2..a09f70dc25b9 100644 --- a/rust/tvm/src/ir/module.rs +++ b/rust/tvm/src/ir/module.rs @@ -16,6 +16,9 @@ * specific language governing permissions and limitations * under the License. */ + +use std::collections::HashMap; +use std::iter::FromIterator; use std::path::Path; use thiserror::Error; @@ -25,15 +28,12 @@ use crate::runtime::array::Array; use crate::runtime::function::Result; use crate::runtime::map::Map; use crate::runtime::string::String as TVMString; -use crate::runtime::{external, Object, ObjectRef}; +use crate::runtime::{external, IsObjectRef, Object}; use super::expr::GlobalVar; use super::function::BaseFunc; use super::source_map::SourceMap; - -// TODO(@jroesch): define type -type TypeData = ObjectRef; -type GlobalTypeVar = ObjectRef; +use super::{relay, ty::GlobalTypeVar, ty::TypeData}; #[derive(Error, Debug)] pub enum Error { @@ -44,7 +44,7 @@ pub enum Error { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "IRModule"] #[type_key = "IRModule"] pub struct IRModuleNode { @@ -61,7 +61,11 @@ external! { fn parse_module(file_name: TVMString, source: TVMString) -> IRModule; #[name("parser.ParseExpr")] fn parse_expression(file_name: TVMString, source: TVMString) -> IRModule; + #[name("ir.IRModule")] + fn module_new(funcs: Map, types: Map) -> IRModule; // Module methods + #[name("ir.Module_Add")] + fn module_add(module: IRModule, type_name: GlobalVar, expr: BaseFunc, update: bool) -> IRModule; #[name("ir.Module_AddDef")] fn module_add_def(module: IRModule, type_name: GlobalTypeVar, type_data: TypeData, update: bool) -> (); #[name("ir.Module_GetGlobalVar")] @@ -72,57 +76,43 @@ external! 
{ fn module_lookup(module: IRModule, var: GlobalVar) -> BaseFunc; #[name("ir.Module_Lookup_str")] fn module_lookup_str(module: IRModule, name: TVMString) -> BaseFunc; + #[name("ir.Module_GetGlobalTypeVars")] + fn module_get_global_type_vars(module: IRModule) -> Array; + #[name("ir.Module_ContainGlobalVar")] + fn module_contains_global_var(module: IRModule, name: TVMString) -> bool; + #[name("ir.Module_ContainGlobalTypeVar")] + fn module_contains_global_type_var(module: IRModule, name: TVMString) -> bool; + #[name("ir.Module_LookupDef")] + fn module_lookup_def(module: IRModule, global: GlobalTypeVar) -> TypeData; + #[name("ir.Module_LookupDef_str")] + fn module_lookup_def_str(module: IRModule, global: TVMString) -> TypeData; + #[name("ir.Module_LookupTag")] + fn module_lookup_tag(module: IRModule, tag: i32) -> relay::Constructor; + #[name("ir.Module_FromExpr")] + fn module_from_expr(expr: relay::Expr, funcs: Map, types: Map) -> IRModule; + #[name("ir.Module_Import")] + fn module_import(module: IRModule, path: TVMString); + #[name("ir.Module_ImportFromStd")] + fn module_import_from_std(module: IRModule, path: TVMString); } -// TVM_REGISTER_GLOBAL("ir.Module_GetGlobalTypeVars") -// .set_body_method(&IRModuleNode::GetGlobalTypeVars); - -// TVM_REGISTER_GLOBAL("ir.Module_ContainGlobalVar") -// .set_body_method(&IRModuleNode::ContainGlobalVar); - -// TVM_REGISTER_GLOBAL("ir.Module_GetGlobalTypeVar") -// .set_body_method(&IRModuleNode::GetGlobalTypeVar); - -// TVM_REGISTER_GLOBAL("ir.Module_LookupDef").set_body_typed([](IRModule mod, GlobalTypeVar var) { -// return mod->LookupTypeDef(var); -// }); - -// TVM_REGISTER_GLOBAL("ir.Module_LookupDef_str").set_body_typed([](IRModule mod, String var) { -// return mod->LookupTypeDef(var); -// }); +// Note: we don't expose update here as update is going to be removed. -// TVM_REGISTER_GLOBAL("ir.Module_LookupTag").set_body_typed([](IRModule mod, int32_t tag) { -// return mod->LookupTag(tag); -// }); - -// TVM_REGISTER_GLOBAL("ir.Module_FromExpr") -// .set_body_typed([](RelayExpr e, tvm::Map funcs, -// tvm::Map type_defs) { -// return IRModule::FromExpr(e, funcs, type_defs); -// }); - -// TVM_REGISTER_GLOBAL("ir.Module_Update").set_body_typed([](IRModule mod, IRModule from) { -// mod->Update(from); -// }); - -// TVM_REGISTER_GLOBAL("ir.Module_UpdateFunction") -// .set_body_typed([](IRModule mod, GlobalVar gv, BaseFunc func) { mod->Update(gv, func); }); - -// TVM_REGISTER_GLOBAL("ir.Module_Import").set_body_typed([](IRModule mod, String path) { -// mod->Import(path); -// }); - -// TVM_REGISTER_GLOBAL("ir.Module_ImportFromStd").set_body_typed([](IRModule mod, String path) { -// mod->ImportFromStd(path); -// }); +impl IRModule { + pub fn new(funcs: F, types: T) -> Result + where + F: IntoIterator, + T: IntoIterator, + { + module_new(Map::from_iter(funcs), Map::from_iter(types)) + } -// TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) -// .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { -// auto* node = static_cast(ref.get()); -// p->stream << "IRModuleNode( " << node->functions << ")"; -// }); + pub fn empty() -> Result { + let funcs = HashMap::::new(); + let types = HashMap::::new(); + IRModule::new(funcs, types) + } -impl IRModule { pub fn parse(file_name: N, source: S) -> Result where N: Into, @@ -141,6 +131,15 @@ impl IRModule { Ok(module) } + pub fn add(&mut self, var: GlobalVar, func: F) -> Result + // todo(@jroesch): can we do better here? why doesn't BaseFunc::Object work? 
+ where + F: IsObjectRef, + F::Object: AsRef<::Object>, + { + module_add(self.clone(), var, func.upcast(), true) + } + pub fn add_def( &mut self, type_name: GlobalTypeVar, @@ -150,8 +149,11 @@ impl IRModule { module_add_def(self.clone(), type_name, type_data, update) } - pub fn get_global_var(&self, name: TVMString) -> Result { - module_get_global_var(self.clone(), name) + pub fn get_global_var(&self, name: S) -> Result + where + S: Into, + { + module_get_global_var(self.clone(), name.into()) } pub fn get_global_vars(&self) -> Result> { @@ -168,4 +170,216 @@ impl IRModule { { module_lookup_str(self.clone(), name.into()) } + + pub fn get_global_type_vars(&self) -> Result> { + module_get_global_type_vars(self.clone()) + } + + pub fn contains_global_var>(&self, name: S) -> Result { + module_contains_global_var(self.clone(), name.into()) + } + + pub fn contains_global_type_var>(&self, name: S) -> Result { + module_contains_global_type_var(self.clone(), name.into()) + } + + pub fn lookup_def(&self, global: GlobalTypeVar) -> Result { + module_lookup_def(self.clone(), global) + } + + pub fn lookup_def_str(&self, global: S) -> Result + where + S: Into, + { + module_lookup_def_str(self.clone(), global.into()) + } + + pub fn lookup_tag(&self, tag: i32) -> Result { + module_lookup_tag(self.clone(), tag) + } + + pub fn from_expr(expr: E) -> Result + where + E: IsObjectRef, + E::Object: AsRef<::Object>, + { + Self::from_expr_with_items(expr, HashMap::new(), HashMap::new()) + } + + pub fn from_expr_with_items(expr: E, funcs: F, types: T) -> Result + where + F: IntoIterator, + T: IntoIterator, + E: IsObjectRef, + E::Object: AsRef<::Object>, + { + module_from_expr(expr.upcast(), Map::from_iter(funcs), Map::from_iter(types)) + } + + pub fn import>(&mut self, path: S) -> Result<()> { + module_import(self.clone(), path.into()) + } + + pub fn import_from_std>(&mut self, path: S) -> Result<()> { + module_import_from_std(self.clone(), path.into()) + } +} + +#[cfg(test)] +mod tests { + use super::relay::*; + use super::*; + use crate::ir::span::Span; + use crate::ir::ty::{GlobalTypeVar, TypeData, TypeKind}; + use tvm_rt::IsObjectRef; + + fn add_dummy_functions(names: Vec<&str>) -> Result { + let mut module = IRModule::empty()?; + let x = Var::static_tensor("x".into(), vec![1, 1], DataType::float32()); + let params = vec![x.clone()]; + let func = relay::Function::simple(params, x); + + for name in names { + let gv = GlobalVar::new(name.into(), Span::null()); + module = module.add(gv, func.clone())?; + } + + Ok(module) + } + + fn add_dummy_types(names: Vec<&str>) -> Result { + let mut module = IRModule::empty()?; + + for name in names { + let name: String = name.into(); + let name = GlobalTypeVar::new(name, TypeKind::Type, Span::null()); + let type_data = TypeData::new(name.clone(), vec![], vec![], Span::null()); + module.add_def(name, type_data, true)?; + } + + Ok(module) + } + + #[test] + fn test_module_add() -> anyhow::Result<()> { + let mut module = IRModule::empty()?; + let x = Var::static_tensor("x".into(), vec![1, 1], DataType::float32()); + let params = vec![x.clone()]; + let func = relay::Function::simple(params, x); + let module = module.add(GlobalVar::new("foo".into(), Span::null()), func)?; + let lfunc = module.lookup_str("foo")?; + let lfunc = lfunc.downcast::()?; + assert_eq!(lfunc.params.len(), 1); + Ok(()) + } + + #[test] + fn test_module_add_def() -> Result<()> { + let mut module = IRModule::empty()?; + let name = GlobalTypeVar::new("my_type", TypeKind::Type, Span::null()); + let type_data = 
TypeData::new(name.clone(), vec![], vec![], Span::null()); + module.add_def(name.clone(), type_data, true)?; + let by_gtv = module.lookup_def(name)?; + let by_gv = module.lookup_def_str("my_type")?; + Ok(()) + } + + #[test] + fn test_get_global_var() -> Result<()> { + let mut module = IRModule::empty()?; + let x = Var::static_tensor("x".into(), vec![1, 1], DataType::float32()); + let params = vec![x.clone()]; + let func = relay::Function::simple(params, x); + let gv_foo = GlobalVar::new("foo".into(), Span::null()); + let module = module.add(gv_foo.clone(), func)?; + let gv = module.get_global_var("foo")?; + assert_eq!(gv_foo, gv); + Ok(()) + } + + #[test] + fn test_get_global_vars() -> Result<()> { + let names = vec!["foo", "bar", "baz"]; + let module = add_dummy_functions(names.clone())?; + let gvars: Vec = module + .get_global_vars()? + .into_iter() + .map(|gv| gv.name_hint.as_str().unwrap().to_string()) + .collect(); + + for name in names { + assert!(gvars.contains(&name.to_string())); + } + + Ok(()) + } + + #[test] + fn test_get_global_type_vars() -> Result<()> { + let names = vec!["foo", "bar", "baz"]; + let module = add_dummy_types(names.clone())?; + let gvars: Vec = module + .get_global_type_vars()? + .into_iter() + .map(|gv| gv.name_hint.as_str().unwrap().to_string()) + .collect(); + + for name in names { + assert!(gvars.contains(&name.to_string())); + } + + Ok(()) + } + + #[test] + fn test_contains_global_var() -> Result<()> { + let module = add_dummy_functions(vec!["foo"])?; + assert!(module.contains_global_var("foo")?); + Ok(()) + } + + #[test] + fn test_contains_global_type_var() -> Result<()> { + let module = add_dummy_types(vec!["foo"])?; + assert!(module.contains_global_type_var("foo")?); + Ok(()) + } + + // TODO(@jroesch): not really sure about this API at all. + // pub fn lookup_tag(&self, tag: i32) -> Result { + // module_lookup_tag(self.clone(), tag) + // } + + #[test] + fn test_from_expr() -> Result<()> { + let x = Var::static_tensor("x".into(), vec![1, 1], DataType::float32()); + let params = vec![x.clone()]; + let func = relay::Function::simple(params, x); + let module = IRModule::from_expr(func.clone())?; + let main_fn = module.lookup_str("main")?; + let main_fn = main_fn.downcast::()?; + assert_eq!(main_fn, func); + Ok(()) + } + + #[test] + fn test_import() -> Result<()> { + let mut std_path: String = env!("CARGO_MANIFEST_DIR").into(); + std_path += "/../../python/tvm/relay/std/prelude.rly"; + + let mut mod1 = IRModule::empty()?; + mod1.import(std_path.clone())?; + mod1.lookup_str("map")?; + + // TODO(@jroesch): this requires another patch of mine to enable. 
+ + // if cfg!(feature = "python") { + // crate::python::load().unwrap(); + // let mut mod2 = IRModule::empty()?; + // mod2.import_from_std("prelude.rly")?; + // mod2.lookup_str("map")?; + // } + + Ok(()) + } } diff --git a/rust/tvm/src/ir/op.rs b/rust/tvm/src/ir/op.rs index d81d6a69c1eb..d222ead0391b 100644 --- a/rust/tvm/src/ir/op.rs +++ b/rust/tvm/src/ir/op.rs @@ -27,7 +27,7 @@ type FuncType = ObjectRef; type AttrFieldInfo = ObjectRef; #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Op"] #[type_key = "Op"] pub struct OpNode { diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index cb96f0fbf588..7ecd92febc22 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -27,7 +27,7 @@ use tvm_macros::Object; type IndexExpr = PrimExpr; #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Conv2DAttrs"] #[type_key = "relay.attrs.Conv2DAttrs"] pub struct Conv2DAttrsNode { @@ -46,7 +46,7 @@ pub struct Conv2DAttrsNode { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "BiasAddAttrs"] #[type_key = "relay.attrs.BiasAddAttrs"] pub struct BiasAddAttrsNode { @@ -55,7 +55,7 @@ pub struct BiasAddAttrsNode { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "DenseAttrs"] #[type_key = "relay.attrs.DenseAttrs"] pub struct DenseAttrsNode { @@ -65,7 +65,7 @@ pub struct DenseAttrsNode { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "GlobalPool2DAttrs"] #[type_key = "relay.attrs.GlobalPool2DAttrs"] pub struct GlobalPool2DAttrsNode { @@ -74,7 +74,7 @@ pub struct GlobalPool2DAttrsNode { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "MaxPool2DAttrs"] #[type_key = "relay.attrs.MaxPool2DAttrs"] pub struct MaxPool2DAttrsNode { @@ -87,7 +87,7 @@ pub struct MaxPool2DAttrsNode { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "SoftmaxAttrs"] #[type_key = "relay.attrs.SoftmaxAttrs"] pub struct SoftmaxAttrsNode { @@ -96,7 +96,7 @@ pub struct SoftmaxAttrsNode { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "BatchNormAttrs"] #[type_key = "relay.attrs.BatchNormAttrs"] pub struct BatchNormAttrsNode { diff --git a/rust/tvm/src/ir/relay/attrs/transform.rs b/rust/tvm/src/ir/relay/attrs/transform.rs index 863f07617778..c459f96b2d2f 100644 --- a/rust/tvm/src/ir/relay/attrs/transform.rs +++ b/rust/tvm/src/ir/relay/attrs/transform.rs @@ -21,7 +21,7 @@ use crate::ir::attrs::BaseAttrsNode; use tvm_macros::Object; #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "ExpandDimsAttrs"] #[type_key = "relay.attrs.ExpandDimsAttrs"] pub struct ExpandDimsAttrsNode { diff --git a/rust/tvm/src/ir/relay/mod.rs b/rust/tvm/src/ir/relay/mod.rs index cc1a76bef7e3..9d2983237acb 100644 --- a/rust/tvm/src/ir/relay/mod.rs +++ b/rust/tvm/src/ir/relay/mod.rs @@ -16,11 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - -pub mod attrs; - -use std::hash::Hash; - use crate::runtime::array::Array; use crate::runtime::{object::*, IsObjectRef, String as TString}; @@ -34,9 +29,12 @@ use tvm_macros::Object; use tvm_rt::NDArray; pub use super::expr::{GlobalVar, GlobalVarNode}; +pub use crate::runtime::DataType; + +pub mod attrs; #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Expr"] #[type_key = "RelayExpr"] pub struct ExprNode { @@ -58,22 +56,8 @@ impl ExprNode { } } -impl Hash for Expr { - fn hash(&self, state: &mut H) { - self.as_ptr().unwrap().ptr.hash(state) - } -} - -impl PartialEq for Expr { - fn eq(&self, other: &Self) -> bool { - self.as_ptr().unwrap().ptr.eq(&other.as_ptr().unwrap().ptr) - } -} - -impl Eq for Expr {} - #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Id"] #[type_key = "relay.Id"] pub struct IdNode { @@ -92,7 +76,7 @@ impl Id { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Constant"] #[type_key = "relay.Constant"] pub struct ConstantNode { @@ -111,7 +95,7 @@ impl Constant { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Tuple"] #[type_key = "relay.Tuple"] pub struct TupleNode { @@ -130,7 +114,7 @@ impl Tuple { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Var"] #[type_key = "relay.Var"] pub struct VarNode { @@ -140,11 +124,11 @@ pub struct VarNode { } impl Var { - pub fn new(name_hint: String, type_annotation: Type, _span: ObjectRef) -> Var { + pub fn new(name_hint: String, type_annotation: Type, _span: Span) -> Var { let node = VarNode { base: ExprNode::base::(), vid: Id::new(name_hint.into()), - type_annotation, + type_annotation: type_annotation, }; Var(Some(ObjectPtr::new(node))) } @@ -153,13 +137,18 @@ impl Var { &self.vid.0.as_ref().unwrap().name_hint } - pub fn to_expr(self) -> Expr { - unsafe { Expr(std::mem::transmute(self.0)) } + pub fn static_tensor(name_hint: String, sh: Vec, dtype: DataType) -> Var { + let sh = Array::from_vec(sh.into_iter().map(Into::into).collect()).unwrap(); + Self::new( + name_hint, + super::ty::TensorType::new(sh, dtype, Span::null()).upcast(), + Span::null(), + ) } } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Call"] #[type_key = "relay.Call"] pub struct CallNode { @@ -190,7 +179,7 @@ impl Call { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Let"] #[type_key = "relay.Let"] pub struct LetNode { @@ -213,7 +202,7 @@ impl Let { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "If"] #[type_key = "relay.If"] pub struct IfNode { @@ -236,7 +225,7 @@ impl If { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "TupleGetItem"] #[type_key = "relay.TupleGetItem"] pub struct TupleGetItemNode { @@ -257,7 +246,7 @@ impl TupleGetItem { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "RefCreate"] #[type_key = "relay.RefCreate"] pub struct RefCreateNode { @@ -276,7 +265,7 @@ impl RefCreate { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "RefRead"] #[type_key = "relay.RefRead"] pub struct RefReadNode { @@ -295,7 +284,7 @@ impl RefRead { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "RefWrite"] #[type_key = "relay.RefWrite"] pub struct RefWriteNode { @@ -316,7 +305,7 @@ impl RefWrite { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Constructor"] #[type_key = "relay.Constructor"] pub struct ConstructorNode { @@ -341,7 +330,7 @@ impl Constructor { // 
TODO(@jroesch): define the type data #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Pattern"] #[type_key = "relay.Pattern"] pub struct PatternNode { @@ -359,7 +348,7 @@ impl PatternNode { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "PatternWildcard"] #[type_key = "relay.PatternWildcard"] pub struct PatternWildcardNode { @@ -376,7 +365,7 @@ impl PatternWildcard { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "PatternVar"] #[type_key = "relay.PatternVar"] pub struct PatternVarNode { @@ -395,7 +384,7 @@ impl PatternVar { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "PatternConstructor"] #[type_key = "relay.PatternConstructor"] pub struct PatternConstructorNode { @@ -420,7 +409,7 @@ impl PatternConstructor { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "PatternTuple"] #[type_key = "relay.PatternTuple"] pub struct PatternTupleNode { @@ -439,7 +428,7 @@ impl PatternTuple { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Clause"] #[type_key = "relay.Clause"] pub struct ClauseNode { @@ -460,7 +449,7 @@ impl Clause { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Match"] #[type_key = "relay.Match"] pub struct MatchNode { @@ -483,7 +472,7 @@ impl Match { } #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[ref_name = "Function"] #[type_key = "relay.Function"] pub struct FunctionNode { @@ -510,6 +499,20 @@ impl Function { }; Function(Some(ObjectPtr::new(node))) } + + pub fn simple(params: Vec, body: E) -> Function + where + E: IsObjectRef, + E::Object: AsRef<::Object>, + { + let params = Array::from_vec(params).unwrap(); + Self::new( + params, + body.upcast(), + Type::null(), + Array::from_vec(vec![]).unwrap(), + ) + } } #[cfg(test)] @@ -530,7 +533,7 @@ mod tests { #[test] fn test_global() -> Result<()> { - let gv = GlobalVar::new("main".to_string(), ObjectRef::null()); + let gv = GlobalVar::new("main".to_string(), Span::null()); let text = as_text(gv.clone()); assert!(text.contains("@main")); Ok(()) @@ -538,7 +541,7 @@ mod tests { #[test] fn test_var() -> Result<()> { - let var = Var::new("local".to_string(), Type::null(), ObjectRef::null()); + let var = Var::new("local".to_string(), Type::null(), Span::null()); let text = as_text(var.clone()); assert!(text.contains("%local")); Ok(()) @@ -557,7 +560,7 @@ def @main() -> float32 { ) .unwrap(); let main = module - .lookup(module.get_global_var("main".to_string().into()).unwrap()) + .lookup(module.get_global_var("main").unwrap()) .unwrap(); let func = main.downcast::().unwrap(); let constant = func diff --git a/rust/tvm/src/ir/source_map.rs b/rust/tvm/src/ir/source_map.rs index 54e16dac62ac..7376f4b74022 100644 --- a/rust/tvm/src/ir/source_map.rs +++ b/rust/tvm/src/ir/source_map.rs @@ -29,7 +29,7 @@ use tvm_macros::Object; /// /// Could represent the source from an ML framework or a source of an IRModule. #[repr(C)] -#[derive(Object)] +#[derive(Object, Debug)] #[type_key = "Source"] #[ref_name = "Source"] pub struct SourceNode { @@ -46,7 +46,7 @@ pub struct SourceNode { /// A mapping from a unique source name to source fragments. 
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[type_key = "SourceMap"]
 #[ref_name = "SourceMap"]
 pub struct SourceMapNode {
diff --git a/rust/tvm/src/ir/span.rs b/rust/tvm/src/ir/span.rs
index eb6821af69dc..be74745b60ca 100644
--- a/rust/tvm/src/ir/span.rs
+++ b/rust/tvm/src/ir/span.rs
@@ -23,7 +23,7 @@ use tvm_macros::Object;

 /// A source file name, contained in a Span.
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[type_key = "SourceName"]
 #[ref_name = "SourceName"]
 pub struct SourceNameNode {
@@ -33,7 +33,7 @@ pub struct SourceNameNode {

 /// Span information for diagnostic purposes.
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[type_key = "Span"]
 #[ref_name = "Span"]
 pub struct SpanNode {
diff --git a/rust/tvm/src/ir/tir.rs b/rust/tvm/src/ir/tir.rs
index 22d4e02054e1..ccbe30c95820 100644
--- a/rust/tvm/src/ir/tir.rs
+++ b/rust/tvm/src/ir/tir.rs
@@ -26,7 +26,7 @@ use tvm_macros::Object;
 macro_rules! define_node {
     ($name:ident, $ref:expr, $typekey:expr; $node:ident { $($id:ident : $t:ty),*}) => {
         #[repr(C)]
-        #[derive(Object)]
+        #[derive(Object, Debug)]
         #[ref_name = $ref]
         #[type_key = $typekey]
         pub struct $node {
@@ -47,6 +47,20 @@ macro_rules! define_node {
 // TODO(@jroesch): should move up to expr.rs to mirror TVM.
 define_node!(IntImm, "IntImm", "IntImm"; IntImmNode { value: i64 });
+
+impl From<i32> for IntImm {
+    fn from(i: i32) -> IntImm {
+        IntImm::new(DataType::int(32, 1), i as i64)
+    }
+}
+
+impl From<i32> for PrimExpr {
+    fn from(i: i32) -> PrimExpr {
+        use crate::runtime::IsObjectRef;
+        IntImm::from(i).upcast()
+    }
+}
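+
+// With these impls in place, plain Rust integers coerce directly into TIR
+// expressions; a minimal sketch of the intended use (not part of this patch):
+//
+//     let one: PrimExpr = 1.into();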
+
 define_node!(Var, "Var", "tir.Var"; VarNode { name_hint: TVMString });
diff --git a/rust/tvm/src/ir/ty.rs b/rust/tvm/src/ir/ty.rs
index d12f094a63ea..f7c52b51f332 100644
--- a/rust/tvm/src/ir/ty.rs
+++ b/rust/tvm/src/ir/ty.rs
@@ -17,15 +17,16 @@
  * under the License.
  */

-use super::span::Span;
-use crate::runtime::{IsObject, Object, ObjectPtr};
 use tvm_macros::Object;
 use tvm_rt::{array::Array, DataType};

-use super::PrimExpr;
+use crate::ir::relay::Constructor;
+use crate::ir::span::Span;
+use crate::ir::PrimExpr;
+use crate::runtime::{string::String as TString, IsObject, Object, ObjectPtr};

 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "Type"]
 #[type_key = "Type"]
 pub struct TypeNode {
@@ -51,7 +52,7 @@ impl TypeNode {
  * \sa PrimType
  */
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "PrimType"]
 #[type_key = "PrimType"]
 pub struct PrimTypeNode {
@@ -73,7 +74,7 @@ pub struct PrimTypeNode {
 */

 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "PointerType"]
 #[type_key = "PointerType"]
 pub struct PointerTypeNode {
@@ -83,6 +84,7 @@ pub struct PointerTypeNode {
 }

 /// Possible kinds of type variables.
+#[derive(PartialEq, Eq, Debug)]
 pub enum TypeKind {
     Type = 0,
     /// Template variable in shape expression.
@@ -92,47 +94,51 @@ pub enum TypeKind {
     TypeData = 6,
 }

-/*
- * \brief Type parameter in functions.
- *
- * A type variable can be viewed as template parameter in c++ template function.
- *
- * For example, in the following pesudo code,
- * the TypeVar of f is TypeVar("n", kind=kShapeVar).
- * This function can take in a Tensor with shape=(3, 3) and
- * returns a Tensor with shape=(9,)
- *
- * \code
- *
- * template<typename n>
- * f(x : Tensor[i32, (n, n)]) -> Tensor[i32, (n * n)]
- *
- * \endcode
- * \sa TypeVar, TypeKind
- */
+/// Type parameter in functions.
+///
+/// A type variable can be viewed as template parameter in c++ template function.
+///
+/// For example, in the following pseudo code,
+/// the TypeVar of f is TypeVar("n", kind=kShapeVar).
+/// This function can take in a Tensor with shape=(3, 3) and
+/// returns a Tensor with shape=(9,)
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "TypeVar"]
 #[type_key = "TypeVar"]
 pub struct TypeVarNode {
     pub base: TypeNode,
-    pub name_hint: String,
+    pub name_hint: TString,
     pub kind: TypeKind,
 }

 /// A global type variable that is used for defining new types or type aliases.
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "GlobalTypeVar"]
 #[type_key = "GlobalTypeVar"]
 pub struct GlobalTypeVarNode {
     pub base: TypeNode,
-    pub name_hint: String,
+    pub name_hint: TString,
     pub kind: TypeKind,
 }

+impl GlobalTypeVar {
+    pub fn new<S>(name_hint: S, kind: TypeKind, span: Span) -> GlobalTypeVar
+    where
+        S: Into<TString>,
+    {
+        let node = GlobalTypeVarNode {
+            base: TypeNode::base::<GlobalTypeVarNode>(span),
+            name_hint: name_hint.into(),
+            kind: kind,
+        };
+        ObjectPtr::new(node).into()
+    }
+}
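+
+// A sketch of how these constructors fit together, mirroring the IRModule
+// test added earlier in this patch (the exact `TypeKind` value here is an
+// assumption, for illustration only):
+//
+//     let name = GlobalTypeVar::new("my_type", TypeKind::TypeData, Span::null());
+//     let type_data = TypeData::new(name.clone(), vec![], vec![], Span::null());
+//     module.add_def(name, type_data, true)?;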
+
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "TupleType"]
 #[type_key = "TupleType"]
 pub struct TupleTypeNode {
@@ -147,7 +153,7 @@ impl TupleType {
 }

 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "TypeConstraint"]
 #[type_key = "TypeConstraint"]
 pub struct TypeConstraintNode {
@@ -156,7 +162,7 @@ pub struct TypeConstraintNode {

 /// The representation of a polymorphic function type.
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "FuncType"]
 #[type_key = "FuncType"]
 pub struct FuncTypeNode {
@@ -181,7 +187,7 @@ pub struct FuncTypeNode {
  * TypeVar represents the input to the graph.
  */
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "IncompleteType"]
 #[type_key = "IncompleteType"]
 pub struct IncompleteTypeNode {
@@ -195,7 +201,7 @@ pub struct IncompleteTypeNode {
  * \sa RelayRefType.
 */
 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "RefType"]
 #[type_key = "relay.RefType"]
 pub struct RelayRefTypeNode {
@@ -204,7 +210,7 @@ pub struct RelayRefTypeNode {
 }

 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "BaseTensorType"]
 #[type_key = "relay.BaseTensorType"]
 pub struct BaseTensorTypeNode {
@@ -212,7 +218,7 @@ pub struct BaseTensorTypeNode {
 }

 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "TensorType"]
 #[type_key = "relay.TensorType"]
 pub struct TensorTypeNode {
@@ -240,3 +246,52 @@ impl TensorType {
 // using TypeRelationFn = tvm::TypeRelationFn;
 // using TypeReporter = tvm::TypeReporter;
 // using TypeReporterNode = tvm::TypeReporterNode;
+
+/* TypeData container node.
+\brief Stores all data for an Algebraic Data Type (ADT).
+
+In particular, it stores the handle (global type var) for an ADT
+and the constructors used to build it and is kept in the module. Note
+that type parameters are also indicated in the type data: this means that
+for any instance of an ADT, the type parameters must be indicated. That is,
+an ADT definition is treated as a type-level function, so an ADT handle
+must be wrapped in a TypeCall node that instantiates the type-level arguments.
+The kind checker enforces this. */
+#[repr(C)]
+#[derive(Object, Debug)]
+#[ref_name = "TypeData"]
+#[type_key = "relay.TypeData"]
+pub struct TypeDataNode {
+    /// The header is simply the name of the ADT.
+    /// We adopt nominal typing for ADT definitions;
+    /// that is, differently-named ADT definitions with same constructors
+    /// have different types.
+    pub base: TypeNode,
+    pub type_name: GlobalTypeVar,
+    /// The type variables (to allow for polymorphism).
+    pub type_vars: Array<TypeVar>,
+    /// The constructors.
+    pub constructors: Array<Constructor>,
+}
+
+impl TypeData {
+    pub fn new<TypeVars, Ctors>(
+        type_name: GlobalTypeVar,
+        type_vars: TypeVars,
+        constructors: Ctors,
+        span: Span,
+    ) -> TypeData
+    where
+        TypeVars: IntoIterator<Item = TypeVar>,
+        Ctors: IntoIterator<Item = Constructor>,
+    {
+        use std::iter::FromIterator;
+        let type_data = TypeDataNode {
+            base: TypeNode::base::<TypeDataNode>(span),
+            type_name,
+            type_vars: Array::from_iter(type_vars),
+            constructors: Array::from_iter(constructors),
+        };
+        TypeData(Some(ObjectPtr::new(type_data)))
+    }
+}
diff --git a/rust/tvm/src/transform.rs b/rust/tvm/src/transform.rs
index c5a65c417c93..b49633777b65 100644
--- a/rust/tvm/src/transform.rs
+++ b/rust/tvm/src/transform.rs
@@ -33,7 +33,7 @@ pub type IRModule = ObjectRef;
 pub type PassContext = ObjectRef;

 #[repr(C)]
-#[derive(Object)]
+#[derive(Object, Debug)]
 #[ref_name = "PassInfo"]
 #[type_key = "transform.PassInfo"]
 pub struct PassInfoNode {
diff --git a/src/ir/module.cc b/src/ir/module.cc
index b011f2d2f664..7990b281fb04 100644
--- a/src/ir/module.cc
+++ b/src/ir/module.cc
@@ -439,6 +439,9 @@ TVM_REGISTER_GLOBAL("ir.Module_GetGlobalTypeVars")
 TVM_REGISTER_GLOBAL("ir.Module_ContainGlobalVar")
     .set_body_method<IRModule>(&IRModuleNode::ContainGlobalVar);

+TVM_REGISTER_GLOBAL("ir.Module_ContainGlobalTypeVar")
+    .set_body_method<IRModule>(&IRModuleNode::ContainGlobalTypeVar);
+
 TVM_REGISTER_GLOBAL("ir.Module_GetGlobalTypeVar")
     .set_body_method<IRModule>(&IRModuleNode::GetGlobalTypeVar);

diff --git a/tests/scripts/task_rust.sh b/tests/scripts/task_rust.sh
index d60999c3f3d0..2c87cceec8bb 100755
--- a/tests/scripts/task_rust.sh
+++ b/tests/scripts/task_rust.sh
@@ -74,7 +74,7 @@ cd tests/test_tvm_dso
 cargo run
 cd -

-# # run wasm32 test
+# run wasm32 test
 # cd tests/test_wasm32
 # cargo build
 # wasmtime $RUST_DIR/target/wasm32-wasi/debug/test-wasm32.wasm

From 334140b7aa7399e0bebbe3566d188c615fc273a1 Mon Sep 17 00:00:00 2001
From: masahi
Date: Fri, 6 Nov 2020 05:48:09 +0900
Subject: [PATCH 128/258] [TOPI] Enable scatter_add on GPU (#6856)

* enable scatter gpu test on cuda

* adding update_func arg

* pytorch scatter_add gpu tests working

* update 3d and 4d scatter

* enable scatter_add gpu test

Co-authored-by: masa
---
 python/tvm/relay/op/_transform.py             |   2 +-
 python/tvm/relay/op/strategy/cuda.py          |  15 +-
 python/tvm/relay/op/strategy/generic.py       |  15 +-
 python/tvm/topi/cuda/scatter.py               | 133 ++++++++++++++----
 tests/python/frontend/pytorch/test_forward.py |  12 +-
 tests/python/relay/test_op_level3.py          |   4 +-
 6 files changed, 139 insertions(+), 42 deletions(-)

diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index a4da896e6111..2fa806c07e11 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -113,7 +113,7 @@ def compute_scatter_add(attrs, inputs, output_type):
     return [topi.scatter_add(inputs[0], inputs[1], inputs[2], attrs.axis)]


-_reg.register_schedule("scatter_add", strategy.schedule_scatter_add)
+_reg.register_strategy("scatter_add", strategy.scatter_add_strategy)

 # interpolate
 @_reg.register_compute("interpolate")
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index b7ceda304639..26e9a0060b66 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -664,7 +664,7 @@ def sparse_dense_padded_strategy_cuda(attrs, inputs, out_type, target):

 @scatter_strategy.register(["cuda", "gpu"])
 def scatter_cuda(attrs,
inputs, out_type, target): - """sparse dense cuda strategy""" + """scatter cuda strategy""" strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_scatter(topi.cuda.scatter), @@ -675,6 +675,19 @@ def scatter_cuda(attrs, inputs, out_type, target): return strategy +@scatter_add_strategy.register(["cuda", "gpu"]) +def scatter_add_cuda(attrs, inputs, out_type, target): + """scatter_add cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scatter(topi.cuda.scatter_add), + wrap_topi_schedule(topi.generic.schedule_extern), + name="scatter_add.cuda", + plevel=10, + ) + return strategy + + @argsort_strategy.register(["cuda", "gpu"]) def argsort_strategy_cuda(attrs, inputs, out_type, target): """argsort cuda strategy""" diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 276bf67bd463..59b2b3489783 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1052,12 +1052,15 @@ def _compute_scatter(attrs, inputs, _): return _compute_scatter -# scatter_add -@generic_func -def schedule_scatter_add(attrs, outs, target): - """schedule scatter_add""" - with target: - return topi.generic.schedule_scatter_add(outs) +@override_native_generic_func("scatter_add_strategy") +def scatter_add_strategy(attrs, outs, out_type, target): + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scatter(topi.scatter_add), + wrap_topi_schedule(topi.generic.schedule_scatter), + name="scatter_add.generic", + ) + return strategy # interpolate diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 6522d74d8bef..0a3e96f4be30 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -24,7 +24,7 @@ def ceil_div(a, b): return (a + b - 1) // b -def gen_ir_1d(data, indices, updates, axis, out): +def gen_ir_1d(data, indices, updates, axis, out, update_func): """Generate scatter ir for 1d inputs Parameters @@ -44,6 +44,9 @@ def gen_ir_1d(data, indices, updates, axis, out): out : tir.Tensor The output tensor. + update_func: function + The function to be applied to a destination and the corresponding update. + Returns ------- ret : tir @@ -73,14 +76,14 @@ def gen_ir_1d(data, indices, updates, axis, out): with ib.for_range(0, ni, name="i") as i: index = indices_ptr[i] with ib.if_scope(index < 0): - out_ptr[index + n] = updates_ptr[i] + update_func(out_ptr, index + n, updates_ptr[i]) with ib.else_scope(): - out_ptr[index] = updates_ptr[i] + update_func(out_ptr, index, updates_ptr[i]) return ib.get() -def gen_ir_2d(data, indices, updates, axis, out): +def gen_ir_2d(data, indices, updates, axis, out, update_func): """Generate scatter ir for 2d inputs Parameters @@ -100,6 +103,9 @@ def gen_ir_2d(data, indices, updates, axis, out): out : tir.Tensor The output tensor. 
+ update_func: function + The function to be applied to a destination and the corresponding update + Returns ------- ret : tir @@ -140,9 +146,9 @@ def gen_ir_2d(data, indices, updates, axis, out): idx = i * ci + j index = indices_ptr[idx] with ib.if_scope(index < 0): - out_ptr[(index + n) * c + j] = updates_ptr[idx] + update_func(out_ptr, (index + n) * c + j, updates_ptr[idx]) with ib.else_scope(): - out_ptr[index * c + j] = updates_ptr[idx] + update_func(out_ptr, index * c + j, updates_ptr[idx]) else: with ib.new_scope(): i = te.thread_axis("blockIdx.x") @@ -151,13 +157,13 @@ def gen_ir_2d(data, indices, updates, axis, out): idx = i * ci + j index = indices_ptr[idx] with ib.if_scope(index < 0): - out_ptr[i * c + (index + c)] = updates_ptr[idx] + update_func(out_ptr, i * c + (index + c), updates_ptr[idx]) with ib.else_scope(): - out_ptr[i * c + index] = updates_ptr[idx] + update_func(out_ptr, i * c + index, updates_ptr[idx]) return ib.get() -def gen_ir_3d(data, indices, updates, axis, out): +def gen_ir_3d(data, indices, updates, axis, out, update_func): """Generate scatter ir for 3d inputs Parameters @@ -177,6 +183,9 @@ def gen_ir_3d(data, indices, updates, axis, out): out : tir.Tensor The output tensor. + update_func: function + The function to be applied to a destination and the corresponding update + Returns ------- ret : tir @@ -225,9 +234,9 @@ def gen_ir_3d(data, indices, updates, axis, out): idx = (i * ci + j) * hi + k index = indices_ptr[idx] with ib.if_scope(index < 0): - out_ptr[((index + n) * c + j) * h + k] = updates_ptr[idx] + update_func(out_ptr, ((index + n) * c + j) * h + k, updates_ptr[idx]) with ib.else_scope(): - out_ptr[(index * c + j) * h + k] = updates_ptr[idx] + update_func(out_ptr, (index * c + j) * h + k, updates_ptr[idx]) elif axis == 1: with ib.new_scope(): i = te.thread_axis("blockIdx.x") @@ -241,9 +250,9 @@ def gen_ir_3d(data, indices, updates, axis, out): idx = (i * ci + j) * hi + k index = indices_ptr[idx] with ib.if_scope(index < 0): - out_ptr[(i * c + (index + c)) * h + k] = updates_ptr[idx] + update_func(out_ptr, (i * c + (index + c)) * h + k, updates_ptr[idx]) with ib.else_scope(): - out_ptr[(i * c + index) * h + k] = updates_ptr[idx] + update_func(out_ptr, (i * c + index) * h + k, updates_ptr[idx]) else: with ib.new_scope(): i = te.thread_axis("blockIdx.x") @@ -254,13 +263,13 @@ def gen_ir_3d(data, indices, updates, axis, out): idx = (i * ci + j) * hi + k index = indices_ptr[idx] with ib.if_scope(index < 0): - out_ptr[(i * c + j) * h + (index + h)] = updates_ptr[idx] + update_func(out_ptr, (i * c + j) * h + (index + h), updates_ptr[idx]) with ib.else_scope(): - out_ptr[(i * c + j) * h + index] = updates_ptr[idx] + update_func(out_ptr, (i * c + j) * h + index, updates_ptr[idx]) return ib.get() -def gen_ir_4d(data, indices, updates, axis, out): +def gen_ir_4d(data, indices, updates, axis, out, update_func): """Generate scatter ir for 4d inputs Parameters @@ -280,6 +289,9 @@ def gen_ir_4d(data, indices, updates, axis, out): out : tir.Tensor The output tensor. 
+ update_func: function + The function to be applied to a destination and the corresponding update + Returns ------- ret : tir @@ -333,9 +345,13 @@ def gen_ir_4d(data, indices, updates, axis, out): idx = ((i * ci + j) * hi + k) * wi + l index = indices_ptr[idx] with ib.if_scope(index < 0): - out_ptr[(((index + n) * c + j) * h + k) * w + l] = updates_ptr[idx] + update_func( + out_ptr, (((index + n) * c + j) * h + k) * w + l, updates_ptr[idx] + ) with ib.else_scope(): - out_ptr[((index * c + j) * h + k) * w + l] = updates_ptr[idx] + update_func( + out_ptr, ((index * c + j) * h + k) * w + l, updates_ptr[idx] + ) elif axis == 1: with ib.new_scope(): i = te.thread_axis("blockIdx.x") @@ -351,9 +367,13 @@ def gen_ir_4d(data, indices, updates, axis, out): idx = ((i * ci + j) * hi + k) * wi + l index = indices_ptr[idx] with ib.if_scope(index < 0): - out_ptr[((i * c + (index + c)) * h + k) * w + l] = updates_ptr[idx] + update_func( + out_ptr, ((i * c + (index + c)) * h + k) * w + l, updates_ptr[idx] + ) with ib.else_scope(): - out_ptr[((i * c + index) * h + k) * w + l] = updates_ptr[idx] + update_func( + out_ptr, ((i * c + index) * h + k) * w + l, updates_ptr[idx] + ) elif axis == 2: with ib.new_scope(): i = te.thread_axis("blockIdx.x") @@ -369,9 +389,13 @@ def gen_ir_4d(data, indices, updates, axis, out): idx = ((i * ci + j) * hi + k) * wi + l index = indices_ptr[idx] with ib.if_scope(index < 0): - out_ptr[((i * c + j) * h + (index + h)) * w + l] = updates_ptr[idx] + update_func( + out_ptr, ((i * c + j) * h + (index + h)) * w + l, updates_ptr[idx] + ) with ib.else_scope(): - out_ptr[((i * c + j) * h + index) * w + l] = updates_ptr[idx] + update_func( + out_ptr, ((i * c + j) * h + index) * w + l, updates_ptr[idx] + ) else: with ib.new_scope(): i = te.thread_axis("blockIdx.x") @@ -384,10 +408,9 @@ def gen_ir_4d(data, indices, updates, axis, out): idx = ((i * ci + j) * hi + k) * wi + l index = indices_ptr[idx] with ib.if_scope(index < 0): - out_ptr[((i * c + j) * h + k) * w + (index + w)] = updates_ptr[idx] + update_func(out_ptr, ((i * c + j) * h + k) * w + (index + w), updates_ptr[idx]) with ib.else_scope(): - out_ptr[((i * c + j) * h + k) * w + index] = updates_ptr[idx] - + update_func(out_ptr, ((i * c + j) * h + k) * w + index, updates_ptr[idx]) return ib.get() @@ -428,12 +451,15 @@ def scatter(data, indices, updates, axis=0): 4: gen_ir_4d, } + def update_func(dst_ptr, dst_index, update): + dst_ptr[dst_index] = update + out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") out = te.extern( [out_shape], [data, indices, updates], - lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0]), + lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func), dtype=data.dtype, out_buffers=[out_buf], name="scatter_gpu", @@ -441,3 +467,58 @@ def scatter(data, indices, updates, axis=0): ) return out + + +def scatter_add(data, indices, updates, axis=0): + """Update data by adding values in updates at positions defined by indices + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + indices : relay.Expr + The index locations to update. + + updates : relay.Expr + The values to be added. + + axis : int + The axis to scatter on + + Returns + ------- + ret : relay.Expr + The computed result. 
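+
+    Examples
+    --------
+    A sketch of the semantics for ``axis=0`` on 2-D inputs, mirroring the
+    NumPy reference implementation used in the tests below::
+
+        out = np.copy(data)
+        for (i, j), idx in np.ndenumerate(indices):
+            out[idx, j] += updates[i, j]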
+ """ + if axis < 0: + axis += len(data.shape) + assert axis >= 0 + assert axis < len(data.shape) + + rank = len(data.shape) + assert 1 <= rank <= 4, "scatter_add only supports 1-4 dimensions" + + ir_funcs = { + 1: gen_ir_1d, + 2: gen_ir_2d, + 3: gen_ir_3d, + 4: gen_ir_4d, + } + + def update_func(dst_ptr, dst_index, update): + dst_ptr[dst_index] += update + + out_shape = data.shape + out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") + out = te.extern( + [out_shape], + [data, indices, updates], + lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func), + dtype=data.dtype, + out_buffers=[out_buf], + name="scatter_add_gpu", + tag="scatter_add_gpu", + ) + + return out diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 4dec5f7e5916..6250dfff811a 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -3149,17 +3149,17 @@ def test_fn_scatter_add(dim): in_data = torch.zeros(3, 5) in_index = torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]) in_src = torch.rand(2, 5) - # TODO: add scatter gpu schedule to enable gpu test. - verify_trace_model(test_fn_scatter(0), [in_data, in_index, in_src], ["llvm"]) - verify_trace_model(test_fn_scatter_add(0), [in_data, in_index, in_src], ["llvm"]) + + targets = ["llvm", "cuda"] + verify_trace_model(test_fn_scatter(0), [in_data, in_index, in_src], targets) + verify_trace_model(test_fn_scatter_add(0), [in_data, in_index, in_src], targets) in_data = torch.zeros(2, 4) in_index = torch.tensor([[2], [3]]) in_src = torch.rand(2, 1) - # # TODO: add scatter gpu schedule to enable gpu test. - verify_trace_model(test_fn_scatter(1), [in_data, in_index, in_src], ["llvm"]) - verify_trace_model(test_fn_scatter_add(1), [in_data, in_index, in_src], ["llvm"]) + verify_trace_model(test_fn_scatter(1), [in_data, in_index, in_src], targets) + verify_trace_model(test_fn_scatter_add(1), [in_data, in_index, in_src], targets) def test_numel(): diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index e636fe3f0037..f091856f6b7e 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -993,6 +993,7 @@ def verify_dynamic_scatter(dshape, ishape, axis=0): verify_dynamic_scatter((16, 16, 4, 5), (16, 16, 4, 5), 3) +@tvm.testing.uses_gpu def test_scatter_add(): def ref_scatter_add(data, indices, updates, axis=0): output = np.copy(data) @@ -1015,8 +1016,7 @@ def verify_scatter_add(dshape, ishape, axis=0): indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") ref_res = ref_scatter_add(data_np, indices_np, updates_np, axis) - # TODO(mbrookhart): expand testing when adding more backend schedules - for target, ctx in [("llvm", tvm.cpu())]: + for target, ctx in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(data_np, indices_np, updates_np) From 90ddf106b4955d9234cc80cf4152eeba6df8a7e5 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Thu, 5 Nov 2020 16:18:49 -0800 Subject: [PATCH 129/258] [Relay][Frontend][Onnx] If Operator Support (#6730) * If operator support in ONNX. * Small tweak. * Added uses_gpu tag. * Disable test on GPU until onnxruntime version is updated. * Use parametrize_target to specify CPU only. * Just dont use onnxruntime for now i guess. 
--- python/tvm/relay/frontend/onnx.py | 44 ++++++++++++++++++- tests/python/frontend/onnx/test_forward.py | 49 +++++++++++++++++++++- 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index fa404efc39cf..7ebad7297471 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -2146,7 +2146,9 @@ def body_fn(*loop_inputs): # Get the output of the current loop using the updated inputs. with subgraph_scope: - loop_outputs = subgraph_scope.from_onnx(body, 11, get_output_expr=True) + loop_outputs = subgraph_scope.from_onnx( + body, graph_scope.opset, get_output_expr=True + ) # Unpack the body outputs and prepare variables for next iteration. new_cond = loop_outputs[0] new_loop_vars = [loop_outputs[i] for i in range(1, 1 + num_deps)] @@ -2197,6 +2199,43 @@ def body_fn(*loop_inputs): return outputs +class If(OnnxOpConverter): + """Operator converter for If""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + cond = inputs[0] + then_branch = attr.get("then_branch", None) + else_branch = attr.get("else_branch", None) + assert then_branch is not None and else_branch is not None + + # Create graph converters for both branches. + graph_scope = GraphProto.current + then_graph = GraphProto(graph_scope._shape, graph_scope._dtype) + then_graph._nodes = graph_scope._nodes.copy() + else_graph = GraphProto(graph_scope._shape, graph_scope._dtype) + else_graph._nodes = graph_scope._nodes.copy() + + # Convert each branch to a relay expression. + with then_graph: + then_expr = then_graph.from_onnx(then_branch, graph_scope.opset, get_output_expr=True) + with else_graph: + else_expr = else_graph.from_onnx(else_branch, graph_scope.opset, get_output_expr=True) + + # Add constants from both branches to parent graph. + graph_scope._params.update(then_graph._params) + then_free_vars = analysis.free_vars(then_expr) + for var in then_free_vars: + graph_scope._nodes.update({var.name_hint: var}) + graph_scope._params.update(else_graph._params) + else_free_vars = analysis.free_vars(else_expr) + for var in else_free_vars: + graph_scope._nodes.update({var.name_hint: var}) + + # Now we can construct the relay if statement and return. + return _expr.If(cond, then_expr, else_expr) + + # compatible operators that do NOT require any conversion. 
_identity_list = [] @@ -2354,6 +2393,7 @@ def _get_convert_map(opset): "Range": Range.get_converter(opset), # defs/control_flow "Loop": Loop.get_converter(opset), + "If": If.get_converter(opset), } @@ -2381,6 +2421,7 @@ def __init__(self, shape, dtype): self._num_param = 0 self._shape = shape if shape else {} self._dtype = dtype + self.opset = None def __enter__(self): self._old_manager = GraphProto.current @@ -2436,6 +2477,7 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False): params : dict A dict of name: tvm.nd.array pairs, used as pretrained weights """ + self.opset = opset # parse network inputs to relay, aka parameters for init_tensor in graph.initializer: if not init_tensor.name.strip(): diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index bf27ba5ddcd9..b84e55ac800c 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -17,7 +17,7 @@ import numpy as np import math import onnx -from onnx import helper, TensorProto, mapping +from onnx import helper, TensorProto, mapping, numpy_helper import torch import torchvision import tvm.topi.testing @@ -3841,6 +3841,53 @@ def test_loop(): verify_count_loop() +@tvm.testing.uses_gpu +def test_if(): + # Given a bool scalar input cond. + # return constant tensor x if cond is True, otherwise return constant tensor y. + then_out = onnx.helper.make_tensor_value_info("then_out", onnx.TensorProto.FLOAT, [5]) + else_out = onnx.helper.make_tensor_value_info("else_out", onnx.TensorProto.FLOAT, [5]) + + x = np.array([1, 2, 3, 4, 5]).astype(np.float32) + y = np.array([5, 4, 3, 2, 1]).astype(np.float32) + + then_const_node = onnx.helper.make_node( + "Constant", inputs=[], outputs=["then_out"], value=onnx.numpy_helper.from_array(x) + ) + + else_const_node = onnx.helper.make_node( + "Constant", inputs=[], outputs=["else_out"], value=onnx.numpy_helper.from_array(y) + ) + + then_body = onnx.helper.make_graph([then_const_node], "then_body", [], [then_out]) + + else_body = onnx.helper.make_graph([else_const_node], "else_body", [], [else_out]) + + if_node = onnx.helper.make_node( + "If", inputs=["cond"], outputs=["res"], then_branch=then_body, else_branch=else_body + ) + + if_graph = onnx.helper.make_graph( + [if_node], + "if_outer", + inputs=[ + onnx.helper.make_tensor_value_info("cond", onnx.TensorProto.BOOL, []), + ], + outputs=[ + onnx.helper.make_tensor_value_info("res", onnx.TensorProto.FLOAT, [5]), + ], + ) + + if_model = onnx.helper.make_model(if_graph) + cond = np.array(1).astype("bool") + correct_out = x if cond else y + + for target, ctx in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output_with_vm(if_model, [cond], target, ctx, freeze_params=True) + for i in range(len(tvm_out)): + tvm.testing.assert_allclose(correct_out[i], tvm_out[i], rtol=1e-05, atol=1e-05) + + if __name__ == "__main__": test_flatten() test_reshape() From b9c2ed132913316847529908b573d76a99e2e205 Mon Sep 17 00:00:00 2001 From: Lily Orth-Smith Date: Thu, 5 Nov 2020 16:20:55 -0800 Subject: [PATCH 130/258] [QNN] Dynamic scale, zero point in qnn.op.dequantize (#6849) * add dynamic dequantize * register quantize and dequantize as opaque * make tests better * black * remove main fn * fix black again * move tests * fix import * fix import again * try again * fix import --- python/tvm/relay/qnn/op/qnn.py | 7 +++++ src/relay/qnn/op/dequantize.cc | 23 ++++++++-------- tests/python/relay/test_op_qnn_dequantize.py | 28 ++++++++++++++++++++ 
 tests/python/relay/test_op_qnn_quantize.py   | 28 ++++++++++++++++++++
 4 files changed, 74 insertions(+), 12 deletions(-)

diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
index 3f23d6895b43..9a8f22bfb9bc 100644
--- a/python/tvm/relay/qnn/op/qnn.py
+++ b/python/tvm/relay/qnn/op/qnn.py
@@ -21,6 +21,8 @@ from tvm.relay.expr import Tuple, TupleWrapper
 from tvm.relay.op.nn.utils import get_pad_tuple2d
 from . import _make
+from ... import op as reg
+from ...op import OpPattern


 def requantize(
@@ -496,3 +498,8 @@ def subtract(
         output_scale,
         output_zero_point,
     )
+
+
+# register fuse pattern for qnn ops
+reg.register_pattern("qnn.quantize", OpPattern.OPAQUE)
+reg.register_pattern("qnn.dequantize", OpPattern.OPAQUE)
diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc
index 2fe075c7e64b..724441e0c523 100644
--- a/src/relay/qnn/op/dequantize.cc
+++ b/src/relay/qnn/op/dequantize.cc
@@ -79,20 +79,27 @@ Expr MakeDequantize(Expr data, Expr input_scale, Expr input_zero_point, int axis
 }

 Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale,
-                     const Expr& input_zero_point, const Array<IndexExpr>& input_shape,
+                     const Expr& input_zero_point, const Array<tvm::relay::Type>& types,
                      const DequantizeAttrs* attrs) {
   const auto axis = attrs->axis;

+  ICHECK_EQ(types.size(), 4);
+  auto in_type = types[0];
+  auto in_tensor_type = in_type.as<TensorTypeNode>();
+  ICHECK(in_tensor_type != nullptr) << "Type information missing"
+                                    << " Please run infer_type pass.";
+  Array<IndexExpr> input_shape = in_tensor_type->shape;
+
   size_t n_dim = input_shape.size();
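+
+  // Note: the lowered computation is real_value = (input - zero_point) * scale;
+  // non-scalar (per-channel) scales and zero points are broadcast along `axis`.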
- << " Please run infer_type pass."; - Array input_shape = in_tensor_type->shape; - - return DequantizeLower(data, input_scale, input_zero_point, input_shape, dequantize_attrs); + return DequantizeLower(data, input_scale, input_zero_point, types, dequantize_attrs); } RELAY_REGISTER_OP("qnn.dequantize") diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py index e1416622c236..e7fb161a13cb 100644 --- a/tests/python/relay/test_op_qnn_dequantize.py +++ b/tests/python/relay/test_op_qnn_dequantize.py @@ -20,6 +20,7 @@ import numpy as np from tvm import relay from tvm.contrib import graph_runtime +from tvm.relay.testing import run_infer_type def dequantize_test_driver(in_dtype, quant_args, in_data, verify_output_data, axis): @@ -118,9 +119,36 @@ def test_channelwise_axis_0(): ) +def test_dynamic_dequantize(): + x = relay.var("x", shape=(1, 2, 3, 4), dtype="int8") + scale_var = relay.var("scale", shape=(), dtype="float32") + zp_var = relay.var("zp", shape=(), dtype="int32") + + deq_x = relay.qnn.op.dequantize(x, scale_var * scale_var, zp_var + zp_var) + tt = run_infer_type(deq_x) + + assert tt.checked_type == relay.TensorType((1, 2, 3, 4), "float32") + func = relay.Function([x, scale_var, zp_var], deq_x) + data = np.random.uniform(size=(1, 2, 3, 4)).astype("int8") + scale = np.array(1).astype("float32") + zp = np.array(0).astype("int32") + + mod = tvm.ir.IRModule.from_expr(func) + + for target, ctx in tvm.testing.enabled_targets(): + # TODO: (electriclilies) enable AlterOpLayout when it is fixed + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + lib = relay.build(mod, target=target) + + module = graph_runtime.GraphModule(lib["default"](ctx)) + module.set_input(**{"x": data, "scale": scale, "zp": zp}) + module.run() + + if __name__ == "__main__": test_uint8_to_float32() test_int8_to_float32() test_int32_to_float32() test_channelwise_axis_1() test_channelwise_axis_0() + test_dynamic_dequantize() diff --git a/tests/python/relay/test_op_qnn_quantize.py b/tests/python/relay/test_op_qnn_quantize.py index a22c25f5b97f..2ef298679904 100644 --- a/tests/python/relay/test_op_qnn_quantize.py +++ b/tests/python/relay/test_op_qnn_quantize.py @@ -20,6 +20,7 @@ import numpy as np from tvm import relay from tvm.contrib import graph_runtime +from tvm.relay.testing import run_infer_type def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data, verify_output_data): @@ -133,8 +134,35 @@ def test_channelwise_axis_1(): ) +def test_dynamic_quantize(): + x = relay.var("x", shape=(1, 2, 3, 4), dtype="float32") + scale_var = relay.var("scale", shape=(), dtype="float32") + zp_var = relay.var("zp", shape=(), dtype="int32") + + q_x = relay.qnn.op.quantize(x, scale_var * scale_var, zp_var + zp_var) + tt = run_infer_type(q_x) + + assert tt.checked_type == relay.TensorType((1, 2, 3, 4), "int8") + func = relay.Function([x, scale_var, zp_var], q_x) + data = np.random.uniform(size=(1, 2, 3, 4)).astype("float32") + scale = np.array(1).astype("float32") + zp = np.array(0).astype("int32") + + mod = tvm.ir.IRModule.from_expr(func) + + for target, ctx in tvm.testing.enabled_targets(): + # TODO: (electriclilies) enable AlterOpLayout when it is fixed + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + lib = relay.build(mod, target=target) + + module = graph_runtime.GraphModule(lib["default"](ctx)) + module.set_input(**{"x": data, "scale": scale, "zp": zp}) + module.run() + + if __name__ == "__main__": test_float32_to_uint8() 
test_float32_to_int8() test_channelwise_axis_0() test_channelwise_axis_1() + test_dynamic_quantize() From b1aa82cc3d751585c958beb3f9d16cdfa60d26d1 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 5 Nov 2020 16:24:32 -0800 Subject: [PATCH 131/258] [TVMSCRIPT] Using diagnostics for TVM Script (#6797) * [TVMSCRIPT] Using diagnostics for TVM Script * fix lint * More documentation, improve some error messages * Apply suggestions from code review Co-authored-by: Leandro Nunes * Add synr to ci setup and setup.py * remove typed_ast dependency Co-authored-by: Leandro Nunes --- .../install/ubuntu_install_python_package.sh | 2 +- python/setup.py | 2 +- python/tvm/script/context_maintainer.py | 4 +- python/tvm/script/diagnostics.py | 54 + python/tvm/script/meta_unparser.py | 31 +- python/tvm/script/parser.py | 979 +++++++++--------- python/tvm/script/scope_handler.py | 20 +- python/tvm/script/special_stmt.py | 18 +- .../unittest/test_tvmscript_error_report.py | 219 ++-- tests/scripts/task_ci_python_setup.sh | 2 +- 10 files changed, 726 insertions(+), 605 deletions(-) create mode 100644 python/tvm/script/diagnostics.py diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index d86cbecba213..7989a49a4826 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -21,4 +21,4 @@ set -u set -o pipefail # install libraries for python package on ubuntu -pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest pytest-xdist pytest-profiling mypy orderedset attrs requests Pillow packaging cloudpickle synr +pip3 install six numpy pytest cython decorator scipy tornado pytest pytest-xdist pytest-profiling mypy orderedset attrs requests Pillow packaging cloudpickle synr diff --git a/python/setup.py b/python/setup.py index 5333da0da239..ec98e94f80eb 100644 --- a/python/setup.py +++ b/python/setup.py @@ -183,7 +183,7 @@ def get_package_data_files(): "decorator", "attrs", "psutil", - "typed_ast", + "synr>=0.2.1", ], extras_require={ "test": ["pillow<7", "matplotlib"], diff --git a/python/tvm/script/context_maintainer.py b/python/tvm/script/context_maintainer.py index 8ad39354e5cf..955266c4a3e0 100644 --- a/python/tvm/script/context_maintainer.py +++ b/python/tvm/script/context_maintainer.py @@ -70,5 +70,5 @@ def lookup_symbol(self, name): return symbols[name] return None - def report_error(self, message): - self.parser.report_error(message) + def report_error(self, message, span): + self.parser.report_error(message, span) diff --git a/python/tvm/script/diagnostics.py b/python/tvm/script/diagnostics.py new file mode 100644 index 000000000000..fc196f6b16ae --- /dev/null +++ b/python/tvm/script/diagnostics.py @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""Bridge from synr's (the library used for parsing the python AST) + DiagnosticContext to TVM's diagnostics +""" +import tvm +from synr import DiagnosticContext, ast +from tvm.ir.diagnostics import DiagnosticContext as TVMCtx +from tvm.ir.diagnostics import get_renderer, DiagnosticLevel, Diagnostic + + +class TVMDiagnosticCtx(DiagnosticContext): + """TVM diagnostics for synr""" + + diag_ctx: TVMCtx + + def __init__(self) -> None: + self.diag_ctx = TVMCtx(tvm.IRModule(), get_renderer()) + self.source_name = None + + def to_tvm_span(self, src_name, ast_span: ast.Span) -> tvm.ir.Span: + return tvm.ir.Span( + src_name, + ast_span.start_line, + ast_span.end_line, + ast_span.start_column, + ast_span.end_column, + ) + + def add_source(self, name: str, source: str) -> None: + src_name = self.diag_ctx.module.source_map.add(name, source) + self.source_name = src_name + + def emit(self, _level, message, span): + span = self.to_tvm_span(self.source_name, span) + self.diag_ctx.emit(Diagnostic(DiagnosticLevel.ERROR, span, message)) + self.diag_ctx.render() # Raise exception on the first error we hit. TODO remove + + def render(self): + self.diag_ctx.render() diff --git a/python/tvm/script/meta_unparser.py b/python/tvm/script/meta_unparser.py index d56fbad3d1e3..b1472ccdc758 100644 --- a/python/tvm/script/meta_unparser.py +++ b/python/tvm/script/meta_unparser.py @@ -17,34 +17,29 @@ """Unparse meta AST node into a dict""" # pylint: disable=invalid-name -from typed_ast import ast3 as ast +from synr import Transformer -class MetaUnparser(ast.NodeVisitor): +class MetaUnparser(Transformer): """Python AST Visitor to unparse meta AST node into a dict""" - def visit_Dict(self, node): + def transform(self, node): + method = "transform_" + node.__class__.__name__ + visitor = getattr(self, method, None) + if visitor is None: + self.error(f"Unexpected node type {type(node)} when parsing __tvm_meta__", node.span) + return visitor(node) + + def transform_DictLiteral(self, node): keys = [self.visit(key) for key in node.keys] values = [self.visit(value) for value in node.values] return dict(zip(keys, values)) - def visit_Tuple(self, node): + def transform_Tuple(self, node): return tuple(self.visit(element) for element in node.elts) - def visit_List(self, node): + def transform_ArrayLiteral(self, node): return [self.visit(element) for element in node.elts] - def visit_keyword(self, node): - return node.arg, self.visit(node.value) - - def visit_NameConstant(self, node): - return node.value - - def visit_Constant(self, node): + def transform_Constant(self, node): return node.value - - def visit_Num(self, node): - return node.n - - def visit_Str(self, node): - return node.s diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index 70aa3fe34387..6ce682778e5c 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -14,21 +14,22 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""TVM Script Parser For TIR""" -# pylint: disable=invalid-name, missing-docstring, inconsistent-return-statements, no-else-return -# pylint: disable=unnecessary-comprehension, unused-argument -# pylint: disable=relative-beyond-top-level +"""TVM Script Parser For TIR + +We use [synr](https://synr.readthedocs.io) to get an AST that is stable over +different python versions. 
Synr also provides an error handling context that we +use for error reporting. +""" +# pylint: disable=invalid-name, inconsistent-return-statements, no-else-return import json import operator import inspect -from typed_ast import ast3 as ast +from synr import ast, Transformer, to_ast import tvm from tvm import IRModule from tvm._ffi.base import TVMError from tvm.ir import GlobalVar -from tvm.tir import all as _all -from tvm.tir import expr as _expr from . import context_maintainer, ty from .meta_unparser import MetaUnparser @@ -37,31 +38,47 @@ from .special_stmt import SpecialStmt from .scope_handler import ScopeHandler, WithScopeHandler, ForScopeHandler from . import _ffi_api +from .diagnostics import TVMDiagnosticCtx class CallArgumentReader(object): - """A helper class which read required argument from passed arguments""" + """Helper class to read required arguments from passed arguments. + + When parsing a function call, we need to match the arguments provided in + the AST to the required arguments of the function. This class makes sure + all the positional arguments are filled and also fill keyword arguments + with thier default value if a different value was not provided. + """ - def __init__(self, func_name, args, kwargs, parser): + def __init__(self, func_name, args, kwargs, parser, node): self.func_name = func_name self.args = args self.kwargs = kwargs self.parser = parser + self.node = node def get_pos_only_arg(self, pos, name): """Get corresponding position only function argument from argument list""" if len(self.args) >= pos: arg = self.args[pos - 1] elif name not in self.kwargs: - self.parser.report_error(self.func_name + " misses argument " + name) + # If no positional argument was found in the AST, we see if it was + # defined by name instead. + # TODO(tkonolige): this error message is not quite correct. The + # number of required arguments is >= pos + self.parser.report_error( + f"{self.func_name} requires {pos} arguments, but only {len(self.args)} were given.", + self.node.span, + ) else: arg = self.kwargs[name] return arg def get_kwarg(self, pos, name, default): - """Get corresponding keyword function argument from argument list - If user doesn't provide the argument, set it to default value + """Get corresponding keyword function argument from argument list. + + If the user hasn't provided the argument, set it to the default value. """ if len(self.args) >= pos: arg = self.args[pos - 1] @@ -79,81 +96,76 @@ def get_varargs(self, pos): return [] -class TVMScriptParserError(RuntimeError): - """TVM script Parser Runtime Error""" +class TVMScriptParser(Transformer): + """Synr AST visitor pass which finally lowers to TIR. - -class TVMScriptParser(ast.NodeVisitor): - """Python AST visitor pass which finally lowers it to TIR - Notes for extension: - 1. To support new types of AST nodes. Add a function visit_xxx(). - 2. To support new functions + Notes for Extension + ------------------- + 1. To support a new type of AST node, add a function transform_xxx(). + 2. To support new functions, add the function to the appropriate registry: We divide allowed function calls in TVM script into 3 categories, - which is intrin, scope_handler and special_stmt. - 1) intrin functions ought to have return value. - User can also register intrin category function into parser. - 2) scope_handler functions have no return value and accepts parser and AST node - as its arguments, which is used in for scope and with scope. 
- 3) special_stmt functions have return value and accepts parser and AST node as its arguments - When visiting Call node, we check special_stmt registry at first. If no registered function - is found, we then check intrin. - When visiting With node, we check with_scope registry. - When visiting For node, we check for_scope registry. + intrin, scope_handler and special_stmt. + 1. intrin functions are low level functions like mod, load, and + constants. They correspond to a tir `IRNode`. They must have a + return value. The user can register intrin functions for the parser to + use. + 2. scope_handler functions have no return value. They take two + arguments: the parser and the AST node. scope_handler functions are + used in with and for statements. + 3. special_stmt functions handle cases that do not have a corresponding + tir `IRNode`. These functions take the parser and the AST node as + arguments and may return a value. + When visiting a Call node, we check the special_stmt registry first. If + no registered function is found, we then check the intrin registry. + When visiting With node, we check the with_scope registry. + When visiting For node, we check the for_scope registry. """ _binop_maker = { - ast.Add: tvm.tir.Add, - ast.Sub: tvm.tir.Sub, - ast.Mult: tvm.tir.Mul, - ast.Div: tvm.tir.Div, - ast.FloorDiv: tvm.tir.FloorDiv, - ast.Mod: tvm.tir.FloorMod, - ast.BitOr: operator.or_, - ast.BitAnd: operator.and_, - ast.BitXor: operator.xor, - ast.Gt: tvm.tir.GT, - ast.GtE: tvm.tir.GE, - ast.Lt: tvm.tir.LT, - ast.LtE: tvm.tir.LE, - ast.Eq: tvm.tir.EQ, - ast.NotEq: tvm.tir.NE, - ast.And: tvm.tir.And, - ast.Or: tvm.tir.Or, + ast.BuiltinOp.Add: tvm.tir.Add, + ast.BuiltinOp.Sub: tvm.tir.Sub, + ast.BuiltinOp.Mul: tvm.tir.Mul, + ast.BuiltinOp.Div: tvm.tir.Div, + ast.BuiltinOp.FloorDiv: tvm.tir.FloorDiv, + ast.BuiltinOp.Mod: tvm.tir.FloorMod, + ast.BuiltinOp.BitOr: operator.or_, + ast.BuiltinOp.BitAnd: operator.and_, + ast.BuiltinOp.BitXor: operator.xor, + ast.BuiltinOp.GT: tvm.tir.GT, + ast.BuiltinOp.GE: tvm.tir.GE, + ast.BuiltinOp.LT: tvm.tir.LT, + ast.BuiltinOp.LE: tvm.tir.LE, + ast.BuiltinOp.Eq: tvm.tir.EQ, + ast.BuiltinOp.NotEq: tvm.tir.NE, + ast.BuiltinOp.And: tvm.tir.And, + ast.BuiltinOp.Or: tvm.tir.Or, } - _unaryop_maker = {ast.USub: operator.neg, ast.Invert: operator.invert, ast.Not: tvm.tir.Not} + _unaryop_maker = { + ast.BuiltinOp.USub: operator.neg, + ast.BuiltinOp.Invert: operator.invert, + ast.BuiltinOp.Not: tvm.tir.Not, + } - def __init__(self, src, base_lienno): + def __init__(self, base_lienno): self.context = None - self.src = src.split("\n") self.base_lineno = base_lienno self.current_lineno = 0 self.current_col_offset = 0 self.meta = None - self.functions = {} - def init_function_parsing_env(self): """Initialize function parsing environment""" self.context = context_maintainer.ContextMaintainer(self) # scope emitter - @staticmethod - def is_meta(node): - """Judge whether an AST node is META""" - return ( - isinstance(node, ast.Assign) - and len(node.targets) == 1 - and isinstance(node.targets[0], ast.Name) - and node.targets[0].id == "__tvm_meta__" - ) - def init_meta(self, meta_dict): if meta_dict is not None: self.meta = tvm.ir.load_json(json.dumps(meta_dict)) - def visit(self, node): - """Override method in ast.NodeVisitor""" + def transform(self, node): + """Generic transformation for visiting the AST. 
Dispatches to
+        `transform_ClassName` for the appropriate ClassName."""
         old_lineno, old_col_offset = self.current_lineno, self.current_col_offset

         if hasattr(node, "lineno"):
             self.current_lineno = self.base_lineno + node.lineno - 1
         if hasattr(node, "col_offset"):
             self.current_col_offset = node.col_offset

-        method = "visit_" + node.__class__.__name__
+        method = "transform_" + node.__class__.__name__
         visitor = getattr(self, method, self.generic_visit)
-        visit_res = visitor(node)
+        transform_res = visitor(node)

         self.current_lineno, self.current_col_offset = old_lineno, old_col_offset

-        return visit_res
-
-    def wrap_line_col(self, message, lineno, col_offset):
-        """Wrap the message with line number and column offset"""
-        src_line = self.src[lineno - self.base_lineno]
-        leading_space = len(src_line) - len(src_line.lstrip(" "))
-        col_offset = col_offset - leading_space
-        src_line = src_line[leading_space:]
-        return (
-            "\n  "
-            + src_line
-            + "\n  "
-            + " " * col_offset
-            + "^\n"
-            + "ParserError in line "
-            + str(lineno)
-            + " : "
-            + message
-        )
+        return transform_res
+
+    def report_error(self, message, span):
+        """Report an error occurring at a location.
+
+        This just dispatches to synr's DiagnosticContext.

-    def report_error(self, message, lineno=None, col_offset=None):
-        """Report an error occur in line lineno and column col_offset
         Parameters
         ----------
         message : str
             Error message
-        lineno : int
-            Line number of error line
-        col_offset : int
-            Column offset of error line
+        span : synr.ast.Span
+            Location of the error
         """
+        self.error(message, span)

-        if lineno is None:
-            lineno = self.current_lineno
-        if col_offset is None:
-            col_offset = self.current_col_offset
-        raise TVMScriptParserError(self.wrap_line_col(message, lineno, col_offset))
+    def parse_body(self, parent):
+        """Parse remaining statements in this scope.

-    def parse_body(self):
+        Parameters
+        ----------
+        parent : synr.ast.Node
+            Parent node of this scope. Errors will be reported here.
+        """
         body = []
+        stmt = parent
         while len(self.context.node_stack[-1]) > 0:
-            res = self.visit(self.context.node_stack[-1].pop())
+            stmt = self.context.node_stack[-1].pop()
+            res = self.transform(stmt)
             if res is not None:
                 body.append(res)
-        return tvm.tir.SeqStmt(body) if len(body) > 1 else body[0]
+        if len(body) == 0:
+            self.report_error(
+                "Expected another statement at the end of this block. Perhaps you "
+                "used a concise statement and forgot to include a body afterwards.",
+                stmt.span,
+            )
+        else:
+            return tvm.tir.SeqStmt(body) if len(body) > 1 else body[0]

     def parse_arg_list(self, func, node_call):
+        """Match the arguments of a function call in the AST to the required
+        arguments of the function. This handles positional arguments,
+        positional arguments specified by name, keyword arguments, and varargs.
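+
+        For example, given a hypothetical registered function
+        ``tir.foo(a, b, c=0)``, the call ``tir.foo(1, c=2)`` fills ``a=1``
+        positionally, looks for ``b`` among the keyword arguments (reporting
+        an error because it is missing), and takes ``c=2`` from the keywords.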
+ """ assert isinstance(node_call, ast.Call) # collect arguments - args = [self.visit(arg) for arg in node_call.args] - kw_args = [self.visit(keyword) for keyword in node_call.keywords] - kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args} + args = [self.transform(arg) for arg in node_call.params] + kw_args = { + self.transform(k): self.transform(v) for k, v in node_call.keyword_params.items() + } # get the name and parameter list of func if isinstance(func, (Intrin, ScopeHandler, SpecialStmt)): func_name, param_list = func.signature() else: - print(func) - raise Exception("Internal Error") + self.report_error( + "Internal Error: function must be of type Intrin, ScopeHandler or SpecialStmt, " + f"but it is {type(func).__name__}", + node_call.span, + ) # check arguments and parameter list and get a list of arguments - reader = CallArgumentReader(func_name, args, kw_args, self) + reader = CallArgumentReader(func_name, args, kw_args, self, node_call) pos_only, kwargs, varargs = param_list internal_args = list() for i, arg_name in enumerate(pos_only): @@ -238,25 +252,26 @@ def parse_arg_list(self, func, node_call): internal_args.extend(reader.get_varargs(len(pos_only) + len(kwargs) + 1)) return internal_args - def parse_type(self, type_node): - """ Parse type """ + def parse_type(self, type_node, parent): + """Parse a type annotation. + + We require the parent object to the type so that we have a place to + report the error message if the type does not exist. + """ if type_node is None: - self.report_error("missing type annotation") - res_type = self.visit(type_node) + self.report_error("A type annotation is required", parent.span) + res_type = self.transform(type_node) return tvm.ir.TupleType([]) if res_type is None else res_type.evaluate() def generic_visit(self, node): - """Override method in ast.NodeVisitor. - To directly filter out invalidate type of stmt. - """ + """Fallback visitor if node type is not handled. Reports an error.""" - self.report_error(type(node).__name__ + " AST node is not supported now") + self.report_error(type(node).__name__ + " AST node is not supported", node.span) - def visit_Module(self, node): + def transform_Module(self, node): """Module visitor - AST abstract grammar: - Module(stmt* body, type_ignore* type_ignore) - By now we support two format of TVM script shown below. + + Right now, we only support two formats for TVM Script. Example ------- @@ -277,7 +292,7 @@ def A(...): import tvm - @tvm.script + @tvm.script.tir class MyMod(): def A(...): ... 
@@ -290,79 +305,103 @@ def B(...):
             # returns an IRModule
             mod = MyMod()
         """
+        if len(node.funcs) == 1:
+            return self.transform(next(iter(node.funcs.values())))
+        elif len(node.funcs) == 0:
+            self.report_error(
+                "You must supply at least one class or function definition", node.span
+            )
+        else:
+            self.report_error(
+                "Only one-function, one-class or function-with-meta source code is allowed",
+                ast.Span.union([x.span for x in list(node.funcs.values())[1:]]),
+            )

-        if len(node.body) == 1 and isinstance(node.body[0], (ast.ClassDef, ast.FunctionDef)):
-            # class or single function
-            return self.visit(node.body[0])
-        elif len(node.body) == 2:
-            if isinstance(node.body[0], ast.Assign):
-                node.body[0], node.body[1] = node.body[1], node.body[0]
-            if isinstance(node.body[0], ast.FunctionDef) and TVMScriptParser.is_meta(node.body[1]):
-                # function with meta
-                self.init_meta(MetaUnparser().visit(node.body[1].value))
-                return self.visit(node.body[0])
-        self.report_error(
-            "Only one-function, one-class or function-with-meta source code is allowed"
-        )
+    def transform_Class(self, node):
+        """Class definition visitor.

-    def visit_ClassDef(self, node):
-        """ClassDef visitor
-        AST abstract grammar:
-            ClassDef(identifier name, expr* bases, keyword* keywords, stmt* body,
-                     expr* decorator_list)
+        A class can have multiple function definitions and a single
+        :code:`__tvm_meta__` statement. Each class corresponds to a single
+        :code:`IRModule`.
+
+        Example
+        -------
+        .. code-block:: python
+
+            @tvm.script.tir
+            class MyClass:
+                __tvm_meta__ = {}
+                def A():
+                    tir.evaluate(0)
         """
+        if len(node.assignments) == 1:
+            if not (
+                isinstance(node.assignments[0].lhs, ast.Var)
+                and node.assignments[0].lhs.id.name == "__tvm_meta__"
+            ):
+                self.report_error(
+                    "The only top level assignments allowed are `__tvm_meta__ = ...`",
+                    node.assignments[0].lhs.span,
+                )
+            self.init_meta(
+                MetaUnparser().do_transform(node.assignments[0].rhs, self._diagnostic_context)
+            )
+        elif len(node.assignments) > 1:
+            self.report_error(
+                "Only a single top level `__tvm_meta__` is allowed",
+                ast.Span.union([x.span for x in node.assignments[1:]]),
+            )
+
+        return create_module(
+            {GlobalVar(name): self.transform(func) for name, func in node.funcs.items()}
+        )

-        # parse meta
-        count = False
-        for body_element in node.body:
-            if isinstance(body_element, ast.FunctionDef):
-                pass
-            elif TVMScriptParser.is_meta(body_element) and not count:
-                count = True
-                self.init_meta(MetaUnparser().visit(body_element.value))
-            else:
-                self.report_error("invalid class member")
+    def transform_Function(self, node):
+        """Function definition visitor.

-        # parse member functions
-        for body_element in node.body:
-            if isinstance(body_element, ast.FunctionDef):
-                self.visit(body_element)
+        Each function definition is translated to a single :code:`PrimFunc`.

-        return create_module(self.functions)
+        There are a couple of restrictions on TVM Script functions:
+        1. Function arguments must have their types specified.
+        2. The body of the function can contain :code:`func_attr` to specify
+           attributes of the function (like its name).
+        3. The body of the function can also contain multiple :code:`buffer_bind`s,
+           which give shape and dtype information to arguments.
+        4. Return statements are implicit.

-    def visit_FunctionDef(self, node):
-        """FunctionDef visitor
-        AST abstract grammar:
-            FunctionDef(identifier name, arguments args, stmt* body, expr* decorator_list,
-                        expr? returns, string? type_comment)
-            arguments = (arg* posonlyargs, arg* args, arg? 
vararg, arg* kwonlyargs, - expr* kw_defaults, arg? kwarg, expr* defaults) - arg = (identifier arg, expr? annotation, string? type_comment) + Example + ------- + .. code-block:: python + + @tvm.script.tir + def my_function(x: ty.handle): # 1. Argument types + tir.func_attr({"global_symbol": "mmult"}) # 2. Function attributes + X_1 = tir.buffer_bind(x, [1024, 1024]) # 3. Buffer binding + tir.evaluate(0) # 4. This function returns 0 """ self.init_function_parsing_env() - self.context.new_scope(nodes=node.body) + self.context.new_scope(nodes=node.body.stmts) # add parameters of function - for arg in node.args.args: - arg_var = tvm.te.var(arg.arg, self.parse_type(arg.annotation)) - self.context.update_symbol(arg.arg, arg_var) + for arg in node.params: + arg_var = tvm.te.var(arg.name, self.parse_type(arg.ty, arg)) + self.context.update_symbol(arg.name, arg_var) self.context.func_params.append(arg_var) # fetch the body and return a tir.PrimFunc func = tvm.tir.PrimFunc( self.context.func_params, - self.parse_body(), - ret_type=self.parse_type(node.returns), + self.parse_body(node.body), + ret_type=self.parse_type(node.ret_type, node), buffer_map=self.context.func_buffer_map, attrs=tvm.ir.make_node("DictAttrs", **self.context.func_dict_attr), ) - self.functions[GlobalVar(node.name)] = func self.context.pop_scope() return func - def visit_Assign(self, node): + def transform_Assign(self, node): """Assign visitor AST abstract grammar: Assign(expr* targets, expr value, string? type_comment) @@ -378,79 +417,76 @@ def visit_Assign(self, node): 4.1 var = tir.allocate() """ - if not len(node.targets) == 1: - self.report_error("Only one-valued assignment is supported now") - - if isinstance(node.targets[0], ast.Name) and isinstance(node.value, ast.Call): + if isinstance(node.rhs, ast.Call): # Pattern 1 & Pattern 4 - func = self.visit(node.value.func) - arg_list = self.parse_arg_list(func, node.value) + func = self.transform(node.rhs.func_name) if isinstance(func, WithScopeHandler): if not func.concise_scope or not func.def_symbol: self.report_error( - "with scope handler " + func.signature()[0] + " is not suitable here" + "with scope handler " + func.signature()[0] + " is not suitable here", + node.rhs.span, ) # Pattern 4 func.enter_scope(node, self.context) - arg_list = self.parse_arg_list(func, node.value) - func.body = self.parse_body() + arg_list = self.parse_arg_list(func, node.rhs) + func.body = self.parse_body(node) return func.exit_scope(node, self.context, arg_list) elif isinstance(func, SpecialStmt): # Pattern 1 + arg_list = self.parse_arg_list(func, node.rhs) func.handle(node, self.context, arg_list) + return self.parse_body(node) else: - self.report_error("Unsupported Assign stmt") - elif isinstance(node.targets[0], ast.Subscript): - # Pattern 2 & Pattern 3 - symbol, indexes = self.visit(node.targets[0]) - rhs = self.visit(node.value) - if isinstance(symbol, tvm.tir.Buffer): - # Pattern 2 - return tvm.tir.BufferStore(symbol, tvm.runtime.convert(rhs), indexes) - else: - if len(indexes) != 1: - self.report_error("Invalid Store stmt") - # Pattern 3 - return tvm.tir.Store( - symbol, tvm.runtime.convert(rhs), indexes[0], tvm.runtime.convert(True) - ) - else: - self.report_error("Unsupported Assign stmt") - - def visit_AnnAssign(self, node): - """AnnAssign visitor - AST abstract grammar: - AnnAssign(expr target, expr annotation, expr? 
value, int simple)

-        Pattern corresponds to concise mode of with tir.let()
-        """
-
-        if isinstance(node.target, ast.Name):
-            value = self.visit(node.value)
-            var = tvm.te.var(node.target.id, self.parse_type(node.annotation))
-            self.context.update_symbol(var.name, var)
-            body = self.parse_body()
-            self.context.remove_symbol(var.name)
-            return tvm.tir.LetStmt(var, value, body)
+            value = self.transform(node.rhs)
+            if not isinstance(node.lhs, ast.Var):
+                # This is a little confusing because it is only true when
+                # we have taken this branch. We might need to clarify what
+                # exactly is allowed in assignments in TVM Script.
+                self.report_error(
+                    "Left hand side of assignment must be an unqualified variable",
+                    node.lhs.span,
+                )
+            var = tvm.te.var(node.lhs.id.name, self.parse_type(node.ty, node.lhs))
+            self.context.update_symbol(var.name, var)
+            body = self.parse_body(node)
+            self.context.remove_symbol(var.name)
+            return tvm.tir.LetStmt(var, value, body)
+
+        self.report_error("Unsupported Assign stmt", node.span)
+
+    def transform_SubscriptAssign(self, node):
+        """Visitor for statements of the form :code:`x[1] = 2`."""
+        symbol = self.transform(node.params[0])
+        indexes = self.transform(node.params[1])
+        rhs = self.transform(node.params[2])
+        if isinstance(symbol, tvm.tir.Buffer):
+            # BufferStore
+            return tvm.tir.BufferStore(symbol, tvm.runtime.convert(rhs), indexes)
         else:
-            self.report_error("Unsupported AnnAssign stmt")
+            if len(indexes) != 1:
+                self.report_error(
+                    f"Store is only allowed with one index, but {len(indexes)} were provided.",
+                    Span.union([x.span for x in indexes]),
+                )
+            # Store
+            return tvm.tir.Store(
+                symbol, tvm.runtime.convert(rhs), indexes[0], tvm.runtime.convert(True)
+            )

-    def visit_Assert(self, node):
+    def transform_Assert(self, node):
         """Assert visitor
-        AST abstract grammar:
-            Assert(expr test, expr? msg)

-        Pattern corresponds to concise mode of with tir.Assert()
+        Pattern corresponds to concise mode of :code:`with tir.Assert()`.
         """

-        condition = self.visit(node.test)
+        condition = self.transform(node.condition)
         if node.msg is None:
-            self.report_error("Message of AssertStmt can't be None")
-        message = self.visit(node.msg)
-        body = self.parse_body()
+            self.report_error("Assert statements must have an error message.", node.span)
+        message = self.transform(node.msg)
+        body = self.parse_body(node)
         return tvm.tir.AssertStmt(condition, tvm.runtime.convert(message), body)

-    def visit_For(self, node):
+    def transform_For(self, node):
         """For visitor
         AST abstract grammar:
             For(expr target, expr iter, stmt* body, stmt* orelse, string? 
type_comment) @@ -459,29 +495,29 @@ def visit_For(self, node): for name in tir.serial()/tir.parallel()/tir.vectorized()/tir.unroll() """ - if not isinstance(node.iter, ast.Call): - self.report_error("The loop iter should be a Call") - func = self.visit(node.iter.func) + if not isinstance(node.rhs, ast.Call): + self.report_error("The loop iterator should be a function call.", node.rhs.span) + func = self.transform(node.rhs.func_name) if not isinstance(func, ForScopeHandler): - self.report_error("Only for scope handlers can be used in for stmt") + self.report_error( + "Only For scope handlers can be used in a for statement.", node.rhs.func_name.span + ) # prepare for new for scope old_lineno, old_col_offset = self.current_lineno, self.current_col_offset - self.current_lineno, self.current_col_offset = ( - self.base_lineno + node.iter.lineno - 1, - node.iter.col_offset, - ) - self.context.new_scope(nodes=node.body) + self.current_lineno = node.span.start_line + self.current_col_offset = node.span.start_column + self.context.new_scope(nodes=node.body.stmts) # for scope handler process the scope func.enter_scope(node, self.context) - func.body = self.parse_body() - arg_list = self.parse_arg_list(func, node.iter) + func.body = self.parse_body(node) + arg_list = self.parse_arg_list(func, node.rhs) res = func.exit_scope(node, self.context, arg_list) # exit the scope self.context.pop_scope() self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res - def visit_With(self, node): + def transform_With(self, node): """With visitor AST abstract grammar: With(withitem* items, stmt* body, string? type_comment) @@ -493,299 +529,281 @@ def visit_With(self, node): with tir.let()/tir.Assert()/tir.attr()//tir.realize() """ - if not len(node.items) == 1: - self.report_error("Only one with element is supported now") - if not isinstance(node.items[0].context_expr, ast.Call): - self.report_error("The context expression of with should be a Call") + if not isinstance(node.rhs, ast.Call): + self.report_error( + "The context expression of a `with` statement should be a function call.", + node.rhs.span, + ) - func_call = node.items[0].context_expr - func_node = func_call.func - func = self.visit(func_node) + func = self.transform(node.rhs.func_name) if not isinstance(func, WithScopeHandler): - self.report_error("Function not allowed in with scope") + self.report_error( + f"Function {func} cannot be used in a `with` statement.", node.rhs.func_name.span + ) # prepare for new block scope old_lineno, old_col_offset = self.current_lineno, self.current_col_offset - self.current_lineno, self.current_col_offset = ( - self.base_lineno + func_call.lineno - 1, - func_call.col_offset, - ) - self.context.new_scope(nodes=node.body) + self.current_lineno = node.body.span.start_line + self.current_col_offset = node.body.span.start_column + self.context.new_scope(nodes=node.body.stmts) # with scope handler process the scope func.enter_scope(node, self.context) - func.body = self.parse_body() - arg_list = self.parse_arg_list(func, func_call) + func.body = self.parse_body(node) + arg_list = self.parse_arg_list(func, node.rhs) res = func.exit_scope(node, self.context, arg_list) # exit the scope self.context.pop_scope() self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res - def visit_If(self, node): + def transform_If(self, node): """If visitor AST abstract grammar: If(expr test, stmt* body, stmt* orelse) """ - condition = self.visit(node.test) + condition = 
self.transform(node.condition)

         # then body
-        self.context.new_scope(nodes=node.body)
-        then_body = self.parse_body()
+        self.context.new_scope(nodes=node.true.stmts)
+        then_body = self.parse_body(node)
         self.context.pop_scope()

         # else body
-        if len(node.orelse) > 0:
-            self.context.new_scope(nodes=node.orelse)
-            else_body = self.parse_body()
+        if len(node.false.stmts) > 0:
+            self.context.new_scope(nodes=node.false.stmts)
+            else_body = self.parse_body(node)
             self.context.pop_scope()
         else:
             else_body = None

         return tvm.tir.IfThenElse(condition, then_body, else_body)

-    def visit_Call(self, node):
+    def transform_Call(self, node):
         """Call visitor
-        AST abstract grammar:
-            Call(expr func, expr* args, keyword* keywords)
-            keyword = (identifier? arg, expr value)

-        By now 3 patterns of Call is allowed
-            1. Intrin representing PrimExpr/IterVar
+        3 different Call patterns are allowed:
+            1. Intrin representing a PrimExpr/IterVar
                 1.1 tir.int/uint/float8/16/32/64/floormod/floordiv/load/cast/ramp/broadcast/max
                 1.2 tir.range/reduce_axis/scan_axis/opaque_axis
             2. tir.Op(dtype, ...)
             3. other callable functions
         """

-        func = self.visit(node.func)
-        if isinstance(func, Intrin) and not func.stmt:
-            # pattern 1
-            arg_list = self.parse_arg_list(func, node)
-            return func.handle(arg_list)
+        if isinstance(node.func_name, ast.Op):
+            if node.func_name.name == ast.BuiltinOp.Subscript:
+                return self.transform_Subscript(node)
+            if node.func_name.name in self._binop_maker:
+                lhs = self.transform(node.params[0])
+                rhs = self.transform(node.params[1])
+                return self._binop_maker[node.func_name.name](lhs, rhs)
+            if node.func_name.name in self._unaryop_maker:
+                rhs = self.transform(node.params[0])
+                return self._unaryop_maker[node.func_name.name](rhs)
+            self.report_error(f"Unsupported operator {node.func_name.name}.", node.func_name.span)
         else:
-            args = [self.visit(arg) for arg in node.args]
-            kw_args = [self.visit(keyword) for keyword in node.keywords]
-            kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args}
-            if isinstance(func, tvm.tir.op.Op):
-                # pattern 2
-                return tvm.tir.Call(kw_args["dtype"], func, args)
-            elif callable(func):
-                # pattern 3
-                return func(*args, **kw_args)
-
-        self.report_error("Unsupported function call")
-
-    def visit_Expr(self, node):
-        """Expr visitor
-        AST abstract grammar:
-            Expr(expr value)
-
-        Now only 3 types of Expr stmt is allowed:
-            1. Intrin representing Stmt without body
-                tir.store()/tir.evaluate()
-            2. with scope handlers with concise scoping without var def
-                tir.attr()/tir.assert()/tir.allocate()/tir.realize()
-            3. special stmt without var def
-                tir.func_attr()
+            func = self.transform(node.func_name)
+            if isinstance(func, Intrin) and not func.stmt:
+                # pattern 1
+                arg_list = self.parse_arg_list(func, node)
+                return func.handle(arg_list)
+            else:
+                args = [self.transform(arg) for arg in node.params]
+                kw_args = {
+                    self.transform(k): self.transform(v) for k, v in node.keyword_params.items()
+                }
+                if isinstance(func, tvm.tir.op.Op):
+                    # pattern 2
+                    return tvm.tir.Call(kw_args["dtype"], func, args)
+                elif callable(func):
+                    # pattern 3
+                    return func(*args, **kw_args)
+
+        self.report_error("Unsupported function call.", node.func_name.span)
+
+    def transform_UnassignedCall(self, node):
+        """Visitor for statements that are function calls.
+
+        This handles function calls that appear on their own line like `tir.realize`.
+
+        Examples
+        --------
+        .. 
code-block:: python
+
+            @tvm.script.tir
+            def f():
+                A = tir.buffer_decl([10, 10])
+                tir.realize(A[1:2, 1:2], "") # This is an UnassignedCall
+                A[1, 1] = 2 # This is also an UnassignedCall
         """
+        # The only builtin operator allowed as a statement is subscript assignment, e.g. x[1] = 3.
+        if isinstance(node.call.func_name, ast.Op):
+            if node.call.func_name.name != ast.BuiltinOp.SubscriptAssign:
+                self.report_error(
+                    "Binary and unary operators are not allowed as a statement", node.span
+                )
+            else:
+                return self.transform_SubscriptAssign(node.call)

-        if not isinstance(node.value, ast.Call):
-            self.report_error("Unsupported Expr stmt")
+        # handle a regular function call
+        func = self.transform(node.call.func_name)
+        arg_list = self.parse_arg_list(func, node.call)

-        func = self.visit(node.value.func)
-        arg_list = self.parse_arg_list(func, node.value)
+        if isinstance(func, tvm.script.scope_handler.AssertHandler):
+            self.report_error(
+                "A standalone `tir.Assert` is not allowed. Use `assert condition, message` "
+                "instead.",
+                node.call.func_name.span,
+            )

         if isinstance(func, Intrin) and func.stmt:
-            # pattern 1
             return func.handle(arg_list)
         elif isinstance(func, WithScopeHandler) and func.concise_scope and not func.def_symbol:
-            # pattern 2
             func.enter_scope(node, self.context)
-            func.body = self.parse_body()
+            func.body = self.parse_body(node)
             return func.exit_scope(node, self.context, arg_list)
         elif isinstance(func, SpecialStmt) and not func.def_symbol:
-            # pattern 3
             func.handle(node, self.context, arg_list)
             return

-        self.report_error("Invalid Expr stmt")
+        self.report_error(f"Invalid Expr stmt {type(func).__name__}.", node.call.func_name.span)

-    def visit_BinOp(self, node):
-        """BinOp visitor
-        AST abstract grammar:
-            BinOp(expr left, operator op, expr right)
-        """
+    def transform_Slice(self, node):
+        start = self.transform(node.start)
+        end = self.transform(node.end)
+        if not (isinstance(node.step, ast.Constant) and node.step.value == 1):
+            self.report_error("Only step size 1 is supported for slices.", node.step.span)
+        extent = end - start
+        if isinstance(extent, tvm.tir.PrimExpr):
+            ana = tvm.arith.Analyzer()
+            extent = ana.simplify(extent)
+        return tvm.ir.Range.from_min_extent(start, extent)

-        lhs = self.visit(node.left)
-        rhs = self.visit(node.right)
-        if not isinstance(node.op, tuple(TVMScriptParser._binop_maker.keys())):
-            self.report_error("BinOp " + str(type(node.op)) + " is not supported now")
-        return TVMScriptParser._binop_maker[type(node.op)](lhs, rhs)
+    def transform_Subscript(self, node):
+        """Array access visitor. 
- def visit_Compare(self, node): - """Compare visitor - AST abstract grammar: - Compare(expr left, expr right, ops=) - """ - - ops = [self.visit(node.left)] - ops += [self.visit(comparator) for comparator in node.comparators] - res = [] - for i in range(len(node.ops)): - lhs = ops[i] - rhs = ops[i + 1] - res.append(TVMScriptParser._binop_maker[type(node.ops[i])](lhs, rhs)) - return _all(*res) - - def visit_BoolOp(self, node): - """BoolOp visitor - AST abstract grammar: - BoolOp(boolop op, expr* values) - """ - - values = [self.visit(value) for value in node.values] - return TVMScriptParser._binop_maker[type(node.op)](*values) - - def visit_UnaryOp(self, node): - """UnaryOp visitor - AST abstract grammar: - UnaryOp(unaryop op, expr operand) - """ - - operand = self.visit(node.operand) - if not isinstance(node.op, tuple(TVMScriptParser._unaryop_maker.keys())): - self.report_error("UnaryOp " + str(type(node.op)) + " is not supported now") - return TVMScriptParser._unaryop_maker[type(node.op)](operand) - - def visit_Subscript(self, node): - """Subscript visitor - AST abstract grammar: - Subscript(expr value, slice slice, expr_context ctx) - slice = Slice(expr? lower, expr? upper, expr? step) - | ExtSlice(slice* dims) - | Index(expr value) - By now 2 patterns of Subscript are supported: + By now only 2 types of Subscript are supported: 1. Buffer[index, index, ...], Buffer element access(BufferLoad & BufferStore) Var[index] Buffer element access() 2. meta[type_key][index], Meta info access """ - symbol = self.visit(node.value) + symbol = self.transform(node.params[0]) if symbol is None: - self.report_error(node.value.id + " is not defined") - if isinstance(symbol, (tvm.tir.expr.Var, tvm.tir.Buffer)): - if isinstance(node.slice, ast.Index): - # BufferLoad & BufferStore, Buffer/Var[index, index, ...] 
-                indexes = self.visit(node.slice.value)
-                indexes = list(indexes) if isinstance(indexes, tuple) else [indexes]
-                if isinstance(node.ctx, ast.Load):
-                    if isinstance(symbol, tvm.tir.expr.Var):
-                        return tvm.tir.Load("float32", symbol, indexes, True)
-                    else:
-                        return tvm.tir.BufferLoad(symbol, indexes)
-                else:
-                    return symbol, indexes
-            else:
-                # Buffer Region, now used in tir.realize(buffer[bounds])
-                doms = []
-                slice_nodes = []
-                if isinstance(node.slice, ast.Slice):
-                    # Buffer[begin:end]
-                    slice_nodes.append(node.slice)
-                elif isinstance(node.slice, ast.ExtSlice):
-                    # Buffer[begin:end, begin:end]
-                    slice_nodes.extend(node.slice.dims)
-
-                for dim in slice_nodes:
-                    if not hasattr(dim, "step"):
-                        self.report_error("slice of Buffer Region ought to be begin:end")
-                    if dim.step is not None:
-                        self.report_error("step is not allowed in Buffer Region")
-                    upper = self.visit(dim.upper)
-                    lower = self.visit(dim.lower)
-                    extent = upper - lower
-                    if isinstance(extent, _expr.PrimExpr):
-                        ana = tvm.arith.Analyzer()
-                        extent = ana.simplify(extent)
-                    doms.append(tvm.ir.Range.from_min_extent(lower, extent))
-                return symbol, doms
-        else:
-            res = symbol[self.visit(slice)]
-            if res is None:
-                self.report_error("Only buffer variable and meta can be subscriptable")
-            return res
+            self.report_error(f"Variable {node.params[0].id.name} is not defined.", node.params[0].span)

-    def visit_Attribute(self, node):
-        """Attribute visitor
-        AST abstract grammar:
-            Attribute(expr value, identifier attr, expr_context ctx)
+        indexes = [self.transform(x) for x in node.params[1].values]
+        if isinstance(indexes[0], tvm.ir.Range):
+            return symbol, indexes
+
+        if isinstance(symbol, tvm.tir.expr.Var):
+            return tvm.tir.Load("float32", symbol, indexes, True)
+        if isinstance(symbol, tvm.tir.Buffer):
+            return tvm.tir.BufferLoad(symbol, indexes)
+
+        self.report_error(
+            f"Cannot subscript from a {type(symbol).__name__}. Only variables and "
+            "buffers are supported.",
+            node.params[0].span,
+        )
+
+    def transform_Attr(self, node):
+        """Visitor for field access of the form `x.y`.
+
+        This visitor is used to look up function and symbol names. We have two
+        cases to handle here:
+        1. If we have a statement of the form `tir.something`, then we look up
+           `tir.something` in the `Registry`. If the function is not in the
+           registry, then we try to find a `tvm.ir.op.Op` with the same name.
+        2. All other names `tvm.something` are looked up in the current Python
+           namespace.
         """

-        if isinstance(node.value, ast.Name):
-            if node.value.id == "tir":
-                func_name = "tir." + node.attr
+        if isinstance(node.object, ast.Var):
+            if node.object.id.name == "tir":
+                func_name = "tir." + node.field.name
                 res = Registry.lookup(func_name)
                 if res is not None:
                     return res
                 try:
                     return tvm.ir.op.Op.get(func_name)
-                except AttributeError:
-                    self.report_error("Unregistered function tir." + node.attr)
-            elif node.value.id == "ty":
-                if not hasattr(ty, node.attr):
-                    self.report_error("invalid type annotation ty." + node.attr)
-                return getattr(ty, node.attr)
-
-        symbol = self.visit(node.value)
+                except TVMError as e:
+                    # Check if we got an attribute error
+                    if e.args[0].find("AttributeError"):
+                        self.report_error(
+                            f"Unregistered function `tir.{node.field.name}`.", node.field.span
+                        )
+                    else:
+                        raise e
+
+        symbol = self.transform(node.object)
         if symbol is None:
-            self.report_error("Unsupported Attribute expression")
-        if not hasattr(symbol, node.attr):
-            self.report_error("Type " + type(symbol) + " has not attr " + node.attr)
-        res = getattr(symbol, node.attr)
+            self.report_error("Unsupported Attribute expression.", node.object.span)
+        if not hasattr(symbol, node.field.name):
+            self.report_error(
+                f"Type {type(symbol)} does not have a field called `{node.field}`.", node.span
+            )
+        res = getattr(symbol, node.field.name)
         return res

-    def visit_Dict(self, node):
-        """Dict visitor
-        AST abstract grammar:
-            Dict(expr* keys, expr* values)
+    def transform_TypeAttr(self, node):
+        """Visitor for field access of the form `x.y` for types.
+
+        We have two cases here:
+        1. If the type is of the form `ty.something`, we look up the type in
+           the `ty` namespace in this module.
+        2. If the type is of the form `tvm.x.something` then we look up
+           `tvm.x.something` in this module's namespace.
         """
+        if isinstance(node.object, ast.TypeVar):
+            if node.object.id.name == "ty":
+                if not hasattr(ty, node.field.name):
+                    self.report_error(f"Invalid type annotation `ty.{node.field.name}`.", node.span)
+                return getattr(ty, node.field.name)

-        keys = [self.visit(key) for key in node.keys]
-        values = [self.visit(value) for value in node.values]
+        symbol = self.transform(node.object)
+        if symbol is None:
+            self.report_error("Unsupported Attribute expression", node.object.span)
+        if not hasattr(symbol, node.field.name):
+            self.report_error(
+                f"Type {type(symbol)} does not have a field called `{node.field}`.", node.span
+            )
+        res = getattr(symbol, node.field.name)
+        return res

-        return {key: value for key, value in zip(keys, values)}
+    def transform_DictLiteral(self, node):
+        """Dictionary literal visitor.

-    def visit_Tuple(self, node):
-        """Tuple visitor
-        AST abstract grammar:
-            Tuple(expr* elts, expr_context ctx)
+        Handles dictionary literals of the form `{x:y, z:2}`.
         """

-        return tuple(self.visit(element) for element in node.elts)
+        keys = [self.transform(key) for key in node.keys]
+        values = [self.transform(value) for value in node.values]

-    def visit_List(self, node):
-        """List visitor
-        AST abstract grammar:
-            List(expr* elts, expr_context ctx)
+        return dict(zip(keys, values))
+
+    def transform_Tuple(self, node):
+        """Tuple visitor.
+
+        Handles tuples of the form `(x, y, 2)`.
         """

-        return [self.visit(element) for element in node.elts]
+        return tuple(self.transform(element) for element in node.values)

-    def visit_keyword(self, node):
-        """Keyword visitor
-        AST abstract grammar:
-            keyword = (identifier? arg, expr value)
+    def transform_ArrayLiteral(self, node):
+        """List literal visitor.
+
+        Handles lists of the form `[x, 2, 3]`.
         """

-        return node.arg, self.visit(node.value)
+        return [self.transform(element) for element in node.values]

-    def visit_Name(self, node):
-        """Name visitor
-        AST abstract grammar:
-            Name(identifier id, expr_context ctx)
+    def transform_Var(self, node):
+        """Variable visitor
+
+        Handles variables like `x` in `x = 2`. 
""" - name = node.id + name = node.id.name if name == "meta": return self.meta symbol = Registry.lookup(name) @@ -794,28 +812,51 @@ def visit_Name(self, node): symbol = self.context.lookup_symbol(name) if symbol is not None: return symbol - self.report_error("Unknown identifier %s" % name) + self.report_error(f"Unknown identifier {name}.", node.span) + + def transform_TypeVar(self, node): + """Type variable visitor. - # note that after Python3.8, ast.NameConstant, ast.Num, ast.Str are no longer used - def visit_Constant(self, node): + Equivalent to `transform_Var` but for types. + """ + name = node.id.name + symbol = Registry.lookup(name) or self.context.lookup_symbol(name) + if symbol is not None: + return symbol + self.report_error(f"Unknown identifier {name}.", node.span) + + def transform_Constant(self, node): + """Constant value visitor. + + Constant values include `None`, `"strings"`, `2` (integers), `4.2` + (floats), and `true` (booleans). + """ return node.value - def visit_NameConstant(self, node): + def transform_TypeConstant(self, node): + """Constant value visitor for types. + + See `transform_Constant`. + """ return node.value - def visit_Num(self, node): - return node.n + def transform_Return(self, node): + self.report_error( + "TVM script does not support return statements. Instead the last statement in any " + "block is implicitly returned.", + node.span, + ) - def visit_Str(self, node): - return node.s +def from_source(src): + """Parse function or string into TIR. -def from_source(src, func_lineno=0): - """Parse the src into TIR + If possible, pass the TVM script in as a function so that line numbers and + filename will be accurate. Parameters ---------- - src : str + src : [str, function, class] Pruned source of original script func_lineno : Optional[int] The line number of the first line of the script to be parsed @@ -824,32 +865,12 @@ def from_source(src, func_lineno=0): functions : PrimFunc or IRModule The PrimFunc or IRModule in IR. 
""" - - root = ast.parse(src) - parser = TVMScriptParser(src, func_lineno) - - try: - return parser.visit(root) - except TVMScriptParserError as e: - raise e - except TVMError as e: - # TVM internal c++ error, we have to process the error message and inject line info - inject_e = str(e).split("\n") - msg = inject_e[-1].split(":", maxsplit=1)[1].strip() - inject_e = inject_e[:-1] - inject_e.extend( - parser.wrap_line_col(msg, parser.current_lineno, parser.current_col_offset).split("\n") - ) - inject_e[-1] = "TVM" + inject_e[-1][6:] - raise TVMError("\n".join(inject_e)) from e - except Exception as e: - inject_e = parser.wrap_line_col(str(e), parser.current_lineno, parser.current_col_offset) - raise TVMScriptParserError(inject_e) from e - - -def _parse(script_in): - """Helper function to parse TVM script into TIR""" - return from_source(inspect.getsource(script_in), inspect.getsourcelines(script_in)[1]) + if isinstance(src, str): + start_line = 0 + else: + _, start_line = inspect.getsourcelines(src) + parser = TVMScriptParser(start_line) + return to_ast(src, TVMDiagnosticCtx(), parser) def create_module(functions=None): @@ -901,11 +922,11 @@ def tir(script_in): """ if inspect.isfunction(script_in): - result = _parse(script_in) + result = from_source(script_in) elif inspect.isclass(script_in): result = TVMScriptClass(script_in) else: - raise TypeError("Only function and class are supported") + raise TypeError("Only function and class definitions are supported.") result.__name__ = script_in.__name__ result.__qualname__ = script_in.__qualname__ return result @@ -932,4 +953,4 @@ def __init__(self, script_in): def __call__(self, *args, **kwargs): # call the parser to transform tvm script into TIR - return _parse(self.script) + return from_source(self.script) diff --git a/python/tvm/script/scope_handler.py b/python/tvm/script/scope_handler.py index 251df8c6d6cb..15197eaf50af 100644 --- a/python/tvm/script/scope_handler.py +++ b/python/tvm/script/scope_handler.py @@ -17,7 +17,7 @@ """TVM Script Parser Scope Handler Classes""" # pylint: disable=redefined-builtin, unused-argument, invalid-name, relative-beyond-top-level -from typed_ast import ast3 as ast +from synr import ast import tvm.tir from .utils import get_param_list from .registry import register @@ -92,7 +92,7 @@ def enter_scope(self, node, context): context.report_error("Unexpected number of vars") name = names[0] elif isinstance(node, ast.Assign): - name = node.targets[0].id + name = node.lhs.id.name else: raise Exception("Internal Bug") @@ -186,15 +186,15 @@ def enter_scope(self, node, context): assert isinstance(node, ast.For) loop_var_names = list() - if isinstance(node.target, ast.Name): - loop_var_names.append(node.target.id) - elif isinstance(node.target, ast.Tuple): - for elt in node.target.elts: - if not isinstance(elt, ast.Name): - context.report_error("Invalid loop var") - loop_var_names.append(elt.id) + if isinstance(node.lhs, ast.Var): + loop_var_names.append(node.lhs.id.name) + elif isinstance(node.lhs, ast.Tuple): + for elt in node.lhs.values: + if not isinstance(elt, ast.Var): + context.report_error("Invalid loop var", elt.span) + loop_var_names.append(elt.id.name) else: - context.report_error("Invalid loop var") + context.report_error("Invalid loop var", node.lhs) self.loop_vars = [tvm.te.var(name, dtype="int32") for name in loop_var_names] for loop_var in self.loop_vars: diff --git a/python/tvm/script/special_stmt.py b/python/tvm/script/special_stmt.py index 31fe0ed7cebf..f69475e37cfa 100644 --- 
a/python/tvm/script/special_stmt.py +++ b/python/tvm/script/special_stmt.py @@ -17,7 +17,7 @@ """TVM Script Parser Special Stmt Classes""" # pylint: disable=unused-argument, no-self-argument, inconsistent-return-statements # pylint: disable=relative-beyond-top-level -from typed_ast import ast3 as ast +from synr import ast import tvm.tir from tvm import te @@ -69,7 +69,9 @@ def match_buffer( assert isinstance(self.node, ast.Assign) if param not in self.context.func_params: - self.context.report_error("Can not bind non-input param to buffer") + self.context.report_error( + "Can not bind non-input param to buffer", self.node.rhs.params[0].span + ) if strides is None: strides = [] align = align.value if not isinstance(align, int) else align @@ -79,7 +81,7 @@ def match_buffer( buffer = tvm.tir.decl_buffer( shape, dtype, - self.node.targets[0].id, + self.node.lhs.id.name, data, strides, elem_offset, @@ -89,7 +91,7 @@ def match_buffer( buffer_type, ) self.context.func_buffer_map[param] = buffer - self.context.update_symbol(self.node.targets[0].id, buffer) + self.context.update_symbol(self.node.lhs.id.name, buffer) super().__init__(match_buffer, def_symbol=True) @@ -127,7 +129,7 @@ def buffer_decl( buffer = tvm.tir.decl_buffer( shape, dtype, - self.node.targets[0].id, + self.node.lhs.id.name, data, strides, elem_offset, @@ -136,7 +138,7 @@ def buffer_decl( offset_factor, buffer_type, ) - self.context.update_symbol(self.node.targets[0].id, buffer) + self.context.update_symbol(self.node.lhs.id.name, buffer) return buffer super().__init__(buffer_decl, def_symbol=True) @@ -149,7 +151,7 @@ class VarDef(SpecialStmt): def __init__(self): def var(dtype): assert isinstance(self.node, ast.Assign) - v = te.var(self.node.targets[0].id, dtype) + v = te.var(self.node.lhs.id.name, dtype) self.context.update_symbol(v.name, v) super().__init__(var, def_symbol=True) @@ -162,7 +164,7 @@ class EnvThread(SpecialStmt): def __init__(self): def env_thread(env_name): assert isinstance(self.node, ast.Assign) - v = te.var(self.node.targets[0].id) + v = te.var(self.node.lhs.id.name) self.context.func_var_env_dict[v] = env_name self.context.update_symbol(v.name, v) diff --git a/tests/python/unittest/test_tvmscript_error_report.py b/tests/python/unittest/test_tvmscript_error_report.py index dd8621d0fbfe..048a9544d6df 100644 --- a/tests/python/unittest/test_tvmscript_error_report.py +++ b/tests/python/unittest/test_tvmscript_error_report.py @@ -14,120 +14,169 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- -import pytest - import tvm from tvm import tir -from tvm.script import ty -from tvm.script.parser import TVMScriptParserError +from tvm.script import ty, from_source +from tvm.ir.diagnostics import override_renderer +import inspect -@tvm.script.tir -class Module1: - def buffer_bind_missing_args(a: ty.handle) -> None: - A = tir.match_buffer((16, 16), "float32") +def buffer_bind_missing_args(a: ty.handle) -> None: + A = tir.match_buffer((16, 16), "float32") # error -@tvm.script.tir -class Module2: - def range_missing_args(a: ty.handle) -> None: - A = tir.match_buffer(a, (16, 16), "float32") +def test_buffer_bind(): + check_error(buffer_bind_missing_args, 2) - tir.attr(A, "realize_scope", "") - tir.realize(A[0:16, 0:16]) - for i in tir.serial(16): - for j in tir.serial(0, 16): - A[i, j] = 0.0 +def range_missing_args(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") -@tvm.script.tir -class Module3: - def undefined_buffer(a: ty.handle) -> None: - A = tir.match_buffer(a, (16, 16), "float32") + tir.attr(A, "realize_scope", "") + tir.realize(A[0:16, 0:16], "") + for i in tir.serial(16): # error + for j in tir.serial(0, 16): + A[i, j] = 0.0 - tir.attr(A, "realize_scope", "") - tir.realize(C[0:16, 0:16]) - for i in tir.serial(16): - for j in tir.serial(0, 16): - A[i, j] = 0.0 +def test_range_missing_args(): + check_error(range_missing_args, 6) -@tvm.script.tir -class Module4: - def unsupported_stmt(a: ty.int32) -> None: - if a > 0: - print("I love tvm") +def undefined_buffer(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") -@tvm.script.tir -class Module5: - def unsupported_function_call(a: ty.handle) -> None: - A = tir.match_buffer(a, (16, 16), "float32") + tir.attr(A, "realize_scope", "") + tir.realize(C[0:16, 0:16], "") # error + for i in tir.serial(16): + for j in tir.serial(0, 16): + A[i, j] = 0.0 - tir.attr(A, "realize_scope", "") - tir.realize(A[0:16, 0:16]) - for i in tir.const_range(16): - for j in tir.serial(0, 16): - A[i, j] = 0.0 +def test_undefined_buffer(): + check_error(undefined_buffer, 5) -@tvm.script.tir -class Module6: - def missing_type_annotation(a) -> None: - pass + +def unsupported_stmt(a: ty.int32) -> None: + if a > 0: + print("I love tvm") # error + + +def test_unsupported_stmt(): + check_error(unsupported_stmt, 3) + + +def unsupported_function_call(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + + tir.attr(A, "realize_scope", "") + tir.realize(A[0:16, 0:16], "") + for i in tir.const_range(16): # error + for j in tir.serial(0, 16): + A[i, j] = 0.0 + + +def test_unsupported_function_call(): + check_error(unsupported_function_call, 6) + + +def missing_type_annotation(a) -> None: # error + tir.evaluate(0.0) + + +def test_missing_type_annotation(): + check_error(missing_type_annotation, 1) + + +def invalid_expr_stmt() -> None: + tir.max(1, 2) # error -@tvm.script.tir -class Module7: - def invalid_concise_scoping() -> None: - tir.Assert(1.0 > 0.0, "aaaa") - tir.evaluate(0.0) +def test_invalid_expr_stmt(): + check_error(invalid_expr_stmt, 2) -@tvm.script.tir -class Module8: - def invalid_expr_stmt() -> None: - tir.max(1, 2) +def invalid_for_function(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + for i in tir.evaluate(0.0): # error + for j in tir.serial(0, 16): + A[i, j] = 0.0 -@tvm.script.tir -class Module9: - def invalid_for_function(a: ty.handle) -> None: - A = tir.match_buffer(a, (16, 16), "float32") - for i in tir.evaluate(0.0): - for j in tir.serial(0, 16): - A[i, j] = 0.0 +def 
test_invalid_for_function():
+    check_error(invalid_for_function, 4)


-@tvm.script.tir
-class Module10:
-    def invalid_block_function(a: ty.handle) -> None:
-        A = tir.match_buffer(a, (16, 16), "float32")
+def invalid_block_function(a: ty.handle) -> None:
+    A = tir.match_buffer(a, (16, 16), "float32")

-        with tir.evaluate(0.0):
-            pass
+    with tir.evaluate(0.0):  # error
+        tir.evaluate(1.0)


-def wrap_error(module, lineno):
-    with pytest.raises(TVMScriptParserError) as error:
-        mod = module()
-    assert error is not None
-    e = error.value
-    print(e)
-    msg = str(e).split("\n")[-1].split(":", maxsplit=1)[0].strip().split(" ")[-1].strip()
-    assert int(msg) == lineno
+def test_invalid_block_function():
+    check_error(invalid_block_function, 4)
+
+
+def return_not_allowed(a: ty.handle) -> None:
+    return tir.evaluate(0)  # error
+
+
+def test_return_not_allowed():
+    check_error(return_not_allowed, 2)
+
+
+def tir_assert(a: ty.handle) -> None:
+    tir.Assert(0, "")  # error
+
+
+def test_tir_assert():
+    check_error(tir_assert, 2)
+
+
+def no_body(a: ty.handle) -> None:
+    A = tir.match_buffer(a, (16, 16), "float32")
+    tir.realize(A, "")  # error
+
+
+def test_no_body():
+    check_error(no_body, 3)
+
+
+def check_error(module, rel_lineno):
+    # Override the default renderer to accumulate errors
+    _, start_line = inspect.getsourcelines(module)
+    lineno = start_line + rel_lineno - 1
+    errors = []
+
+    def render(e):
+        for d in e.diagnostics:
+            errors.append(d)
+
+    override_renderer(render)
+    # The diagnostic context throws an exception when it gets an error
+    try:
+        mod = from_source(module)
+    except tvm.error.DiagnosticError as e:
+        pass
+    assert len(errors) == 1, errors
+    for d in errors:
+        assert (
+            d.span.line == lineno
+        ), f"Expected error to be on line {lineno}, but it was on {d.span.line}"


 if __name__ == "__main__":
-    wrap_error(Module1, 29)
-    wrap_error(Module2, 39)
-    wrap_error(Module3, 50)
-    wrap_error(Module4, 60)
-    wrap_error(Module5, 70)
-    wrap_error(Module6, 77)
-    wrap_error(Module7, 84)
-    wrap_error(Module8, 91)
-    wrap_error(Module9, 99)
-    wrap_error(Module10, 109)
+    test_buffer_bind()
+    test_range_missing_args()
+    test_undefined_buffer()
+    test_unsupported_stmt()
+    test_unsupported_function_call()
+    test_missing_type_annotation()
+    test_invalid_expr_stmt()
+    test_invalid_for_function()
+    test_invalid_block_function()
+    test_return_not_allowed()
+    test_tir_assert()
+    test_no_body()
diff --git a/tests/scripts/task_ci_python_setup.sh b/tests/scripts/task_ci_python_setup.sh
index 6463142a28c0..fe88ac650cc8 100755
--- a/tests/scripts/task_ci_python_setup.sh
+++ b/tests/scripts/task_ci_python_setup.sh
@@ -30,4 +30,4 @@ set -o pipefail
 #

 echo "Addtiional setup in" ${CI_IMAGE_NAME}

-python3 -m pip install --user tlcpack-sphinx-addon==0.1.2
+python3 -m pip install --user tlcpack-sphinx-addon==0.1.2 synr==0.2.1

From a01a03a89181c5733a3e51b38f99f81823b61bad Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov
Date: Fri, 6 Nov 2020 02:01:38 +0000
Subject: [PATCH 132/258] [BYOC] [ACL] ACL Runtime padding workaround (#6724)

This workaround prevents execution of operations via the ACL runtime when
arguments or the output tensor require memory padding. The workaround is
applicable to all ACL versions prior to the forthcoming ACL 20.11 release
(which will no longer use data padding).
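For reference, the padding requirement boils down to whether a tensor's
innermost dimension fills whole 128-bit (16-byte) NEON vectors. The sketch
below is illustrative only: `needs_padding` is not part of this patch, it
just restates the `_check` helper that this change adds inside
`require_padding`:

    import numpy as np

    def needs_padding(shape, dtype):
        # NEON has 128 bits / 16 bytes per vector, so ACL pads any tensor
        # whose innermost dimension does not fill whole vectors.
        if len(shape) == 0:
            return False
        return (shape[-1] * np.dtype(dtype).itemsize) % 16 != 0

    # (1, 128) float32 rows are 512 bytes, a multiple of 16, so no padding
    # is needed and the op can be offloaded to ACL.
    assert not needs_padding((1, 128), "float32")
    # (11, 2) float32 rows are only 8 bytes, so ACL would pad them; such
    # ops now stay on TVM (see the new acl_partitions=0 test cases).
    assert needs_padding((11, 2), "float32")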
---
 .../tvm/relay/op/contrib/arm_compute_lib.py        | 42 +++++++++++--
 .../contrib/arm_compute_lib/acl_utils.cc           |  1 +
 .../test_arm_compute_lib/infrastructure.py         |  3 +-
 .../test_arm_compute_lib/test_dense.py             | 62 ++++++++++++++-----
 .../test_arm_compute_lib/test_maximum.py           |  1 +
 .../test_arm_compute_lib/test_network.py           |  7 ++-
 .../test_arm_compute_lib/test_pooling.py           | 11 ++--
 .../test_arm_compute_lib/test_reshape.py           |  5 +-
 8 files changed, 102 insertions(+), 30 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index 80d64db693ce..bdbeb8616a51 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -17,6 +17,8 @@
 # pylint: disable=invalid-name, unused-argument
 """Arm Compute Library supported operators."""
 import tvm
+import numpy as np
+
 from tvm.relay.expr import const
 from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
@@ -279,7 +281,7 @@ def dense(expr):
         return False
     if attrs.out_dtype != "float32" and attrs.out_dtype != "":
         return False
-    return True
+    return not require_padding([*args, expr.checked_type])


 def qnn_dense(expr):
@@ -293,7 +295,7 @@ def qnn_dense(expr):
         return False
     if attrs.out_dtype != "int32":
         return False
-    return True
+    return not require_padding([*args, expr.checked_type])


 @tvm.ir.register_op_attr("nn.max_pool2d", "target.arm_compute_lib")
@@ -305,7 +307,33 @@ def max_pool2d(expr):
     typ = args[0].checked_type
     if typ.dtype not in ["float32", "uint8"]:
         return False
-    return True
+    return not require_padding([*args, expr.checked_type])
+
+
+def require_padding(inputs):
+    """Check whether the supplied data will require padding.
+    Most ACL operators up to version 20.11 use padded data. 
+    """
+
+    def _check(shape, dtype):
+        """NEON has 128bits/16bytes per vector"""
+        if len(shape) == 0:
+            return False
+        return (shape[-1] * np.dtype(dtype).itemsize) % 16 != 0
+
+    for i in inputs:
+        if isinstance(i, (tvm.relay.expr.Var, tvm.relay.expr.Call)):
+            if _check(i.checked_type.shape, i.checked_type.dtype):
+                return True
+        elif isinstance(i, tvm.relay.expr.Constant):
+            if _check(i.data.shape, i.data.dtype):
+                return True
+        elif isinstance(i, tvm.ir.tensor_type.TensorType):
+            if _check(i.shape, i.dtype):
+                return True
+        else:
+            raise RuntimeError("Not supported input type: %s" % type(i))
+    return False


 @tvm.ir.register_op_attr("nn.avg_pool2d", "target.arm_compute_lib")
@@ -313,6 +341,7 @@ def avg_pool2d(expr, from_quantized_composite=False):
     """Check if the external ACL codegen for avgpool2d should be used."""
     attrs, args = expr.attrs, expr.args
     typ = args[0].checked_type
+
     if from_quantized_composite:
         if typ.dtype != "int32":
             return False
@@ -321,7 +350,8 @@
         return False
     if attrs.layout != "NHWC":
         return False
-    return True
+
+    return not require_padding([*args, expr.checked_type])


 @tvm.ir.register_op_attr("nn.global_max_pool2d", "target.arm_compute_lib")
@@ -333,7 +363,7 @@ def global_max_pool2d(expr):
         return False
     if attrs.layout != "NHWC":
         return False
-    return True
+    return not require_padding([*args, expr.checked_type])


 @tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.arm_compute_lib")
@@ -345,7 +375,7 @@ def global_avg_pool2d(expr):
         return False
     if attrs.layout != "NHWC":
         return False
-    return True
+    return not require_padding([*args, expr.checked_type])


 @tvm.ir.register_op_attr("maximum", "target.arm_compute_lib")
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
index 0b6d27623a1a..604c619bf49c 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
@@ -45,6 +45,7 @@ arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data,
   std::vector<int64_t> shape = tensor_rep.GetOpShape()[0];
   DLDataType dtype = tensor_rep.GetOpDataType()[0];
   arm_compute::TensorInfo info = MakeACLTensorInfo(shape, dtype, scale, offset);
+  info.set_is_resizable(false);
   tensor.allocator()->init(info);
   if (data != nullptr) {
     CheckACLError(tensor.allocator()->import_memory(data));
diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
index 0e444809b014..c5d711d7afa3 100644
--- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py
+++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
@@ -276,10 +276,11 @@ def verify_codegen(
     module,
     known_good_codegen,
     num_acl_modules,
+    tvm_ops=0,
     target="llvm -mtriple=aarch64-linux-gnu -mattr=+neon",
 ):
     """Check acl codegen against a known good output."""
-    module = build_module(module, target)
+    module = build_module(module, target, tvm_ops=tvm_ops, acl_partitions=num_acl_modules)

     acl_modules = extract_acl_modules(module)
     assert len(acl_modules) == num_acl_modules, (
diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py
index 8a3632a79919..0279aa72eaf7 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_dense.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py
@@ -20,8 +20,8 @@

 import tvm
 from tvm import relay
-
-from .infrastructure import (
+from tvm import testing
+from 
test_arm_compute_lib.infrastructure import ( Device, skip_runtime_test, skip_codegen_test, @@ -185,18 +185,34 @@ def test_dense(): np.random.seed(0) dtype = ["float32"] - shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)] + shape = [ + (1, (1, 128), (16, 128), 16), + (1, (32, 32), (32, 32), 32), + (0, (1, 64), (1, 64), 1), + (0, (11, 2), (2, 2), 2), + ] composite = [False, True] trials = generate_trials([dtype, shape, composite], 3) - for dtype, (shape, weight_shape, units), composite in trials: + for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))} func, params = _get_model( shape, weight_shape, units, dtype, var_names=iter(inputs), has_bias=composite ) for acl in [False, True]: - outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0]) + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + enable_acl=acl, + tvm_ops=(1 - acl_partitions) * (2 - int(not composite)), + acl_partitions=acl_partitions, + )[0] + ) config = { "shape": shape, @@ -215,18 +231,18 @@ def test_codegen_dense(): np.random.seed(0) dtype = ["float32"] - shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)] + shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)] composite = [False, True] trials = generate_trials([dtype, shape, composite], 3) - for dtype, (shape, weight_shape, units), composite in trials: + for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: inputs = {"a"} args = (shape, weight_shape, units, dtype) func, params = _get_model(*args, var_names=iter(inputs), has_bias=composite) exp_codegen = _get_expected_codegen(*args, has_bias=composite) - verify_codegen(func, exp_codegen, 1) + verify_codegen(func, exp_codegen, acl_partitions, 1 - acl_partitions) def test_qnn_dense(): @@ -239,11 +255,18 @@ def test_qnn_dense(): np.random.seed(0) dtype = ["uint8"] - shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)] + shape = [ + (0, (4, 4), (4, 4), 4), + (1, (16, 16), (4, 16), 4), + (1, (1, 128), (16, 128), 16), + (1, (32, 32), (32, 32), 32), + (0, (1, 64), (1, 64), 1), + ] + composite = [False, True] trials = generate_trials([dtype, shape, composite], 3) - for dtype, (shape, weight_shape, units), composite in trials: + for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))} input_zp = 100 @@ -270,7 +293,18 @@ def test_qnn_dense(): ) for acl in [False, True]: - outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0]) + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + tvm_ops=(1 - acl_partitions) * (3 - int(not composite)), + acl_partitions=acl_partitions, + enable_acl=acl, + )[0] + ) config = { "shape": shape, @@ -295,11 +329,11 @@ def test_codegen_qnn_dense(): np.random.seed(0) dtype = ["uint8"] - shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)] + shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)] composite = [False, True] trials = generate_trials([dtype, shape, composite], 3) - for dtype, (shape, weight_shape, units), composite in trials: + for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: inputs = {"a"} args = (shape, 
weight_shape, units, dtype) @@ -323,7 +357,7 @@ def test_codegen_qnn_dense(): has_bias=composite, ) exp_codegen = _get_expected_codegen(*args, has_bias=composite) - verify_codegen(func, exp_codegen, 1) + verify_codegen(func, exp_codegen, acl_partitions, 2 - 2 * acl_partitions) if __name__ == "__main__": diff --git a/tests/python/contrib/test_arm_compute_lib/test_maximum.py b/tests/python/contrib/test_arm_compute_lib/test_maximum.py index 8ddb901946fc..1942d1e213a5 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_maximum.py +++ b/tests/python/contrib/test_arm_compute_lib/test_maximum.py @@ -20,6 +20,7 @@ import tvm from tvm import relay +from tvm import testing from .infrastructure import ( skip_runtime_test, diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py index 2526a584c56c..4efae487f220 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_network.py +++ b/tests/python/contrib/test_arm_compute_lib/test_network.py @@ -17,11 +17,12 @@ """Arm Compute Library network tests.""" import numpy as np - +import pytest +from tvm import testing from tvm import relay -from .infrastructure import skip_runtime_test, build_and_run, verify -from .infrastructure import Device +from test_arm_compute_lib.infrastructure import skip_runtime_test, build_and_run, verify +from test_arm_compute_lib.infrastructure import Device def _build_and_run_network(mod, params, inputs, device, tvm_ops, acl_partitions, atol, rtol): diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py index 35017170d0ec..7ab4b42f95c1 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py +++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py @@ -20,15 +20,16 @@ import tvm from tvm import relay +from tvm import testing -from .infrastructure import ( +from test_arm_compute_lib.infrastructure import ( skip_runtime_test, skip_codegen_test, build_and_run, verify, verify_codegen, ) -from .infrastructure import Device +from test_arm_compute_lib.infrastructure import Device def _calculate_output_shape(shape, sizes, padding, strides): @@ -167,6 +168,7 @@ def test_pooling(): uint8_dtype = ("uint8", 0, 255, 1, 0) trials = [ + ["nn.max_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 0), False, False, (27, 27, 512)], ["nn.max_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], ["nn.max_pool2d", fp32_dtype, (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)], ["nn.max_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)], @@ -175,7 +177,8 @@ def test_pooling(): ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], ["nn.avg_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)], - ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), False, True, (16, 16, 16)], + # 20.05: "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types" + # ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), False, True, (16, 16, 16)], ["nn.avg_pool2d", uint8_dtype, (3, 3), (2, 2), (0, 1), False, False, (16, 16, 16)], ["nn.l2_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 1), True, False, (16, 16, 16)], ["nn.l2_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 0), False, False, (16, 16, 16)], @@ -211,6 +214,7 @@ def test_pooling(): "padding": pad, "ceil_mode": ceil_mode, "count_include_pad": 
count_include_pad, + "inputs": inputs, } verify_saturation = True if dtype == "uint8" else False @@ -255,7 +259,6 @@ def test_global_pooling(): } func = _get_global_pooling_model(shape, dtype, typef, iter(inputs)) - config = { "shape": shape, "pooling type": typef, diff --git a/tests/python/contrib/test_arm_compute_lib/test_reshape.py b/tests/python/contrib/test_arm_compute_lib/test_reshape.py index 9547aefd8803..9364c6b1a478 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_reshape.py +++ b/tests/python/contrib/test_arm_compute_lib/test_reshape.py @@ -20,6 +20,7 @@ import tvm from tvm import relay +from tvm import testing from .infrastructure import ( skip_runtime_test, @@ -77,7 +78,7 @@ def test_reshape(): ]: inputs = {"a": tvm.nd.array(np.random.uniform(low, high, (1, 1, 1, 1000)).astype(dtype))} - for new_shape in [(1, 1000), (10, 10, 10)]: + for new_shape in [(1, 1000), (10, 10, 10), (10, 100, 1), (1, 1000, 1)]: outputs = [] func = _get_model(inputs["a"].shape, new_shape, dtype, iter(inputs)) for acl in [False, True]: @@ -98,7 +99,7 @@ def test_codegen_reshape(): shape = (1, 1, 1, 1000) inputs = {"a"} for dtype in ["float32", "uint8"]: - for new_shape in [(1, 1000), (10, 10, 10)]: + for new_shape in [(1, 1000), (10, 10, 10), (10, 100, 1)]: args = (shape, new_shape, dtype) func = _get_model(*args, iter(inputs)) exp_codegen = _get_expected_codegen(*args) From 72ae6832468b17c9732d004b215a84b3613dcf9f Mon Sep 17 00:00:00 2001 From: Leon Wang Date: Fri, 6 Nov 2020 21:22:29 +0800 Subject: [PATCH 133/258] Fix the build error for wasm-standalone app (#6862) --- apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py index cfea02a230d2..42695d28fadb 100644 --- a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py +++ b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py @@ -44,7 +44,7 @@ def build_graph_lib(model_file, opt_level): # Compile the relay mod mod, params = _get_mod_and_params(model_file) - target = "llvm -target=wasm32-unknown-unknown -mattr=+simd128 --system-lib" + target = "llvm -mtriple=wasm32-unknown-unknown -mattr=+simd128 --system-lib" with tvm.transform.PassContext(opt_level=opt_level): graph_json, lib, params = relay.build(mod, target=target, params=params) @@ -71,7 +71,7 @@ def build_graph_lib(model_file, opt_level): "--opt-level", type=int, default=0, - help="level of optimization. 0 is unoptimized and 3 is the highest level", + help="level of optimization. 0 is non-optimized and 3 is the highest level", ) args = parser.parse_args() From 887dccdcf8ec603a64bef0f345ff37457bff7df2 Mon Sep 17 00:00:00 2001 From: Siju Samuel Date: Fri, 6 Nov 2020 18:52:57 +0530 Subject: [PATCH 134/258] Update arm_compute_lib.rst (#6861) Updated correct path in readme --- docs/deploy/arm_compute_lib.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 5dd00764bcbc..a2eaa5fb5662 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -36,7 +36,7 @@ determine the architecture by looking online. We recommend two different ways to build and install ACL: -* Use the script located at `docker/install/ubuntu_install_arm_compute_library.sh`. You can use this +* Use the script located at `docker/install/ubuntu_install_arm_compute_lib.sh`. 
You can use this script for building ACL from source natively or for cross-compiling the library on an x86 machine. You may need to change the architecture of the device you wish to compile for by altering the `target_arch` variable. Binaries will be built from source and installed to the location denoted by From 875a56125b147e2b053df2764d5aafbe14326361 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 6 Nov 2020 05:25:22 -0800 Subject: [PATCH 135/258] [Bugfix][Module] Fix recursive GetFunction in runtime::Module (#6859) --- src/runtime/module.cc | 3 ++ .../test_runtime_module_based_interface.py | 30 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/runtime/module.cc b/src/runtime/module.cc index ac2b60f8a383..4cec5e3643c1 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -68,6 +68,9 @@ PackedFunc ModuleNode::GetFunction(const std::string& name, bool query_imports) if (query_imports) { for (Module& m : self->imports_) { pf = m.operator->()->GetFunction(name, query_imports); + if (pf != nullptr) { + return pf; + } } } return pf; diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index 56ebb29c7c65..64f87fb3c561 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -538,6 +538,35 @@ def test_debug_graph_runtime(): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) +def test_multiple_imported_modules(): + def make_func(symbol): + n = tvm.te.size_var("n") + Ab = tvm.tir.decl_buffer((n,), dtype="float32") + i = tvm.te.var("i") + stmt = tvm.tir.For( + i, + 0, + n - 1, + 0, + 0, + tvm.tir.Store(Ab.data, tvm.tir.Load("float32", Ab.data, i) + 1, i + 1), + ) + return tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", symbol) + + def make_module(mod): + mod = tvm.IRModule(mod) + mod = tvm.driver.build(mod, target="llvm") + return mod + + module_main = make_module({"main": make_func("main")}) + module_a = make_module({"func_a": make_func("func_a")}) + module_b = make_module({"func_b": make_func("func_b")}) + module_main.import_module(module_a) + module_main.import_module(module_b) + module_main.get_function("func_a", query_imports=True) + module_main.get_function("func_b", query_imports=True) + + if __name__ == "__main__": test_legacy_compatibility() test_cpu() @@ -545,3 +574,4 @@ def test_debug_graph_runtime(): test_mod_export() test_remove_package_params() test_debug_graph_runtime() + test_multiple_imported_modules() From f5805e5dad068da4fc443972b6f813521d48f98e Mon Sep 17 00:00:00 2001 From: anilmartha Date: Fri, 6 Nov 2020 22:43:15 +0530 Subject: [PATCH 136/258] [BYOC][CONTRIB] Vitis-AI codegen integration (#6343) * [BYOC][CONTRIB] VITIS-AI integration * Remove environment related files * Update vitis_ai.rst * Add review changes * Remove new lines and note frame in vitis_ai.rst * use sys.exit * Add condition for vitis_ai runtime exec function * remove unused graph_json * correct indentation * use code python instead of bash * Rename VITISAI.cmake to VitisAI.cmake * use relay.ext.vitis_ai.options.build_dir in comparison * Re-add deleted docker related files * Make use of PyXIR XGraph and RuntimeModule serialization & refactor flow * Fix linter errors * Fix linter errors * Address sphinx warnings * Add infertype to fix Vitis-AI annotation test * Renaming util to utils * Add Vitis-AI flag to config.cmake file * Move vitis-ai config options to compiler sources instead of 
runtime sources * Fix clang-format errors Co-authored-by: Anil Martha Co-authored-by: anilm (generated by with_the_same_user script) Co-authored-by: Jorn Tuyls --- CMakeLists.txt | 2 + cmake/config.cmake | 3 + cmake/modules/contrib/VitisAI.cmake | 47 ++ docs/deploy/index.rst | 1 + docs/deploy/vitis_ai.rst | 652 ++++++++++++++++++ python/tvm/contrib/target/vitis_ai.py | 156 +++++ python/tvm/relay/op/contrib/vitis_ai.py | 100 +++ .../contrib/vitis_ai/config_vitis_ai.cc | 46 ++ .../contrib/vitis_ai/vitis_ai_runtime.cc | 194 ++++++ .../contrib/vitis_ai/vitis_ai_runtime.h | 115 +++ .../python/contrib/test_vitis_ai/__init__.py | 18 + .../contrib/test_vitis_ai/infrastructure.py | 171 +++++ .../test_vitis_ai/test_vitis_ai_codegen.py | 336 +++++++++ .../test_vitis_ai_runtime_cpu_part.py | 82 +++ tests/scripts/task_config_build_cpu.sh | 1 + 15 files changed, 1924 insertions(+) create mode 100644 cmake/modules/contrib/VitisAI.cmake create mode 100755 docs/deploy/vitis_ai.rst create mode 100644 python/tvm/contrib/target/vitis_ai.py create mode 100644 python/tvm/relay/op/contrib/vitis_ai.py create mode 100644 src/relay/backend/contrib/vitis_ai/config_vitis_ai.cc create mode 100755 src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc create mode 100755 src/runtime/contrib/vitis_ai/vitis_ai_runtime.h create mode 100644 tests/python/contrib/test_vitis_ai/__init__.py create mode 100644 tests/python/contrib/test_vitis_ai/infrastructure.py create mode 100644 tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py create mode 100644 tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py diff --git a/CMakeLists.txt b/CMakeLists.txt index f8ecf4635fbe..3c1ff7035d62 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,6 +82,7 @@ tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library gra tvm_option(USE_TENSORRT_CODEGEN "Build with TensorRT Codegen support" OFF) tvm_option(USE_TENSORRT_RUNTIME "Build with TensorRT runtime" OFF) tvm_option(USE_RUST_EXT "Build with Rust based compiler extensions, STATIC, DYNAMIC, or OFF" OFF) +tvm_option(USE_VITIS_AI "Build with VITIS-AI Codegen support" OFF) # include directories include_directories(${CMAKE_INCLUDE_PATH}) @@ -367,6 +368,7 @@ include(cmake/modules/contrib/CoreML.cmake) include(cmake/modules/contrib/ONNX.cmake) include(cmake/modules/contrib/ArmComputeLib.cmake) include(cmake/modules/contrib/TensorRT.cmake) +include(cmake/modules/contrib/VitisAI.cmake) include(cmake/modules/Git.cmake) include(cmake/modules/LibInfo.cmake) include(cmake/modules/RustExt.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 36eeac729969..6a3ace2c9283 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -232,6 +232,9 @@ set(USE_ETHOSN_HW OFF) set(USE_TENSORRT_CODEGEN OFF) set(USE_TENSORRT_RUNTIME OFF) +# Whether use VITIS-AI codegen +set(USE_VITIS_AI OFF) + # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) diff --git a/cmake/modules/contrib/VitisAI.cmake b/cmake/modules/contrib/VitisAI.cmake new file mode 100644 index 000000000000..083bd6d7adc8 --- /dev/null +++ b/cmake/modules/contrib/VitisAI.cmake @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(USE_VITIS_AI) + set(PYXIR_SHARED_LIB libpyxir.so) + find_package(PythonInterp 3.6 REQUIRED) + if(NOT PYTHON) + find_program(PYTHON NAMES python3 python3.6) + endif() + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import pyxir as px; print(px.get_include_dir()); print(px.get_lib_dir());" + RESULT_VARIABLE __result + OUTPUT_VARIABLE __output + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(__result MATCHES 0) + string(REGEX REPLACE ";" "\\\\;" __values ${__output}) + string(REGEX REPLACE "\r?\n" ";" __values ${__values}) + list(GET __values 0 PYXIR_INCLUDE_DIR) + list(GET __values 1 PYXIR_LIB_DIR) + else() + message(FATAL_ERROR "Can't build TVM with Vitis-AI because PyXIR can't be found") + endif() + message(STATUS "Build with contrib.vitisai") + include_directories(${PYXIR_INCLUDE_DIR}) + file(GLOB VAI_CONTRIB_SRC src/runtime/contrib/vitis_ai/*.cc) + file(GLOB COMPILER_VITIS_AI_SRCS + CONFIGURE_DEPENDS src/relay/backend/contrib/vitis_ai/*) + list(APPEND COMPILER_SRCS ${COMPILER_VITIS_AI_SRCS}) + link_directories(${PYXIR_LIB_DIR}) + list(APPEND TVM_RUNTIME_LINKER_LIBS "pyxir") + list(APPEND RUNTIME_SRCS ${VAI_CONTRIB_SRC}) +endif(USE_VITIS_AI) diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst index 68843ba18248..e47b0a3c72fe 100644 --- a/docs/deploy/index.rst +++ b/docs/deploy/index.rst @@ -70,3 +70,4 @@ target device without relying on RPC. see the following resources on how to do s hls arm_compute_lib tensorrt + vitis_ai diff --git a/docs/deploy/vitis_ai.rst b/docs/deploy/vitis_ai.rst new file mode 100755 index 000000000000..f0bd3edcd6e2 --- /dev/null +++ b/docs/deploy/vitis_ai.rst @@ -0,0 +1,652 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +Vitis-AI Integration +==================== + +`Vitis-AI `__ is Xilinx's +development stack for hardware-accelerated AI inference on Xilinx +platforms, including both edge devices and Alveo cards. It consists of +optimized IP, tools, libraries, models, and example designs. It is +designed with high efficiency and ease of use in mind, unleashing the +full potential of AI acceleration on Xilinx FPGA and ACAP. + +The current Vitis-AI Byoc flow inside TVM enables acceleration of Neural +Network model inference on edge and cloud. 
The identifiers for the
+supported edge and cloud Deep Learning Processor Units (DPUs) are
+DPUCZDX8G and DPUCADX8G, respectively. DPUCZDX8G and DPUCADX8G are
+hardware accelerators for convolutional neural networks (CNNs) on top
+of the Xilinx `Zynq UltraScale+
+MPSoC `__
+and
+`Alveo `__
+(U200/U250) platforms, respectively. For more information about the DPU
+identifiers see the section on `DPU naming information <#dpu-naming-information>`__.
+
+On this page you will find information on how to
+`build <#build-instructions>`__ TVM with Vitis-AI and on how to `get
+started <#getting-started>`__ with an example.
+
+DPU naming information
+----------------------
+
++---------------------------------+-----------------+-------------------------------------------------------------------------+------------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------+
+| DPU                             | Application     | HW Platform                                                             | Quantization Method                                        | Quantization Bitwidth                             | Design Target                                                            |
++=================================+=================+=========================================================================+============================================================+===================================================+==========================================================================+
+| Deep Learning Processing Unit   | C: CNN R: RNN   | AD: Alveo DDR AH: Alveo HBM VD: Versal DDR with AIE & PL ZD: Zynq DDR   | X: DECENT I: Integer threshold F: Float threshold R: RNN   | 4: 4-bit 8: 8-bit 16: 16-bit M: Mixed Precision   | G: General purpose H: High throughput L: Low latency C: Cost optimized   |
++---------------------------------+-----------------+-------------------------------------------------------------------------+------------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------+
+
+Build instructions
+------------------
+
+This section lists the instructions for building TVM with Vitis-AI for
+both `cloud <#cloud-dpucadx8g>`__ and `edge <#edge-dpuczdx8g>`__.
+
+Cloud (DPUCADX8G)
+~~~~~~~~~~~~~~~~~
+
+For Vitis-AI acceleration in the cloud, TVM has to be built on top of
+the Xilinx Alveo platform.
+
+System requirements
+^^^^^^^^^^^^^^^^^^^
+
+The following table lists system requirements for running docker
+containers as well as Alveo cards.
+
++-----------------------------------------------------+----------------------------------------------------------+
+| **Component**                                       | **Requirement**                                          |
++=====================================================+==========================================================+
+| Motherboard                                         | PCI Express 3.0-compliant with one dual-width x16 slot  |
++-----------------------------------------------------+----------------------------------------------------------+
+| System Power Supply                                 | 225W                                                     |
++-----------------------------------------------------+----------------------------------------------------------+
+| Operating System                                    | Ubuntu 16.04, 18.04                                      |
++-----------------------------------------------------+----------------------------------------------------------+
+|                                                     | CentOS 7.4, 7.5                                          |
++-----------------------------------------------------+----------------------------------------------------------+
+|                                                     | RHEL 7.4, 7.5                                            |
++-----------------------------------------------------+----------------------------------------------------------+
+| CPU                                                 | Intel i3/i5/i7/i9/Xeon 64-bit CPU                        |
++-----------------------------------------------------+----------------------------------------------------------+
+| GPU (Optional to accelerate quantization)           | NVIDIA GPU with a compute capability > 3.0               |
++-----------------------------------------------------+----------------------------------------------------------+
+| CUDA Driver (Optional to accelerate quantization)   | nvidia-410                                               |
++-----------------------------------------------------+----------------------------------------------------------+
+| FPGA                                                | Xilinx Alveo U200 or U250                                |
++-----------------------------------------------------+----------------------------------------------------------+
+| Docker Version                                      | 19.03.1                                                  |
++-----------------------------------------------------+----------------------------------------------------------+
+
+Hardware setup and docker build
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. Clone the Vitis AI repository:
+
+   .. code:: bash
+
+      git clone --recurse-submodules https://github.com/Xilinx/Vitis-AI
+
+2. Install Docker and add the user to the docker group. See the
+   following Docker installation instructions:
+
+   - https://docs.docker.com/install/linux/docker-ce/ubuntu/
+   - https://docs.docker.com/install/linux/docker-ce/centos/
+   - https://docs.docker.com/install/linux/linux-postinstall/
+
+3. Download the latest Vitis AI Docker with the following command. This container runs on CPU.
+
+   .. code:: bash
+
+      docker pull xilinx/vitis-ai:latest
+
+   To accelerate the quantization, you can optionally use the Vitis-AI GPU docker image. Use the below commands to build the Vitis-AI GPU docker container:
+
+   .. code:: bash
+
+      cd Vitis-AI/docker
+      ./docker_build_gpu.sh
+
+4. Set up Vitis AI to target Alveo cards. To target Alveo cards with
+   Vitis AI for machine learning workloads, you must install the
+   following software components:
+
+   - Xilinx Runtime (XRT)
+   - Alveo Deployment Shells (DSAs)
+   - Xilinx Resource Manager (XRM) (xbutler)
+   - Xilinx Overlaybins (Accelerators to Dynamically Load - binary
+     programming files)
+
+   While it is possible to install all of these software components
+   individually, a script has been provided to automatically install
+   them at once. To do so:
+
+   - Run the following commands:
+
+     .. code:: bash
+
+        cd Vitis-AI/alveo/packages
+        sudo su
+        ./install.sh
+
+   - Power cycle the system.
+
+5. Clone the TVM and PyXIR repos:
+
+   .. code:: bash
+
+      git clone --recursive https://github.com/apache/incubator-tvm.git
+      git clone --recursive https://github.com/Xilinx/pyxir.git
+
+6. Build and start the TVM runtime Vitis-AI Docker Container.
+
+   .. code:: bash
+
+      ./incubator-tvm/docker/build.sh demo_vitis_ai bash
+      ./incubator-tvm/docker/bash.sh tvm.demo_vitis_ai
+
+      #Setup inside container
+      source /opt/xilinx/xrt/setup.sh
+      . $VAI_ROOT/conda/etc/profile.d/conda.sh
+      conda activate vitis-ai-tensorflow
+
+7. Install PyXIR
+
+   .. code:: bash
+
+      cd pyxir
+      python3 setup.py install --use_vai_rt_dpucadx8g --user
+
+8. Build TVM inside the container with Vitis-AI
+
+   .. code:: bash
+
+      cd incubator-tvm
+      mkdir build
+      cp cmake/config.cmake build
+      cd build
+      echo set\(USE_LLVM ON\) >> config.cmake
+      echo set\(USE_VITIS_AI ON\) >> config.cmake
+      cmake ..
+      make -j$(nproc)
+
+9. Install TVM
+
+   .. code:: bash
+
+      cd incubator-tvm/python
+      pip3 install -e . --user
+
+Edge (DPUCZDX8G)
+~~~~~~~~~~~~~~~~
+
+For edge deployment we make use of two systems referred to as host and
+edge. The `host <#host-requirements>`__ system is responsible for
+quantization and compilation of the neural network model in a first
+offline step. Afterwards, the model will be deployed on the
+`edge <#edge-requirements>`__ system.
+
+Host requirements
+^^^^^^^^^^^^^^^^^
+
+The following table lists system requirements for running the TVM -
+Vitis-AI docker container.
+
++-----------------------------------------------------+----------------------------------------------+
+| **Component**                                       | **Requirement**                              |
++=====================================================+==============================================+
+| Operating System                                    | Ubuntu 16.04, 18.04                          |
++-----------------------------------------------------+----------------------------------------------+
+|                                                     | CentOS 7.4, 7.5                              |
++-----------------------------------------------------+----------------------------------------------+
+|                                                     | RHEL 7.4, 7.5                                |
++-----------------------------------------------------+----------------------------------------------+
+| CPU                                                 | Intel i3/i5/i7/i9/Xeon 64-bit CPU            |
++-----------------------------------------------------+----------------------------------------------+
+| GPU (Optional to accelerate quantization)           | NVIDIA GPU with a compute capability > 3.0   |
++-----------------------------------------------------+----------------------------------------------+
+| CUDA Driver (Optional to accelerate quantization)   | nvidia-410                                   |
++-----------------------------------------------------+----------------------------------------------+
+| FPGA                                                | Not necessary on host                        |
++-----------------------------------------------------+----------------------------------------------+
+| Docker Version                                      | 19.03.1                                      |
++-----------------------------------------------------+----------------------------------------------+
+
+Host setup and docker build
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. Clone the TVM repo:
+
+   .. code:: bash
+
+      git clone --recursive https://github.com/apache/incubator-tvm.git
+
+2. Build and start the TVM runtime Vitis-AI Docker Container.
+
+   .. code:: bash
+
+      cd incubator-tvm
+      ./incubator-tvm/docker/build.sh demo_vitis_ai bash
+      ./incubator-tvm/docker/bash.sh tvm.demo_vitis_ai
+
+      #Setup inside container
+      . $VAI_ROOT/conda/etc/profile.d/conda.sh
+      conda activate vitis-ai-tensorflow
+
+3. Install PyXIR
+
+   .. code:: bash
+
+      git clone --recursive https://github.com/Xilinx/pyxir.git
+      cd pyxir
+      python3 setup.py install --user
+
+4.
Build TVM inside the container with Vitis-AI. + + .. code:: bash + + cd incubator-tvm + mkdir build + cp cmake/config.cmake build + cd build + echo set\(USE_LLVM ON\) >> config.cmake + echo set\(USE_VITIS_AI ON\) >> config.cmake + cmake .. + make -j$(nproc) + +5. Install TVM + + .. code:: bash + + cd incubator-tvm/python + pip3 install -e . --user + +Edge requirements +^^^^^^^^^^^^^^^^^ + +The DPUCZDX8G can be deployed on the `Zynq Ultrascale+ +MPSoc `__ +platform. The following development boards can be used out-of-the-box: + ++--------------------+----------------------+-----------------------------------------------------------------------+ +| **Target board** | **TVM identifier** | **Info** | ++====================+======================+=======================================================================+ +| Ultra96 | DPUCZDX8G-ultra96 | https://www.xilinx.com/products/boards-and-kits/1-vad4rl.html | ++--------------------+----------------------+-----------------------------------------------------------------------+ +| ZCU104 | DPUCZDX8G-zcu104 | https://www.xilinx.com/products/boards-and-kits/zcu104.html | ++--------------------+----------------------+-----------------------------------------------------------------------+ +| ZCU102 | DPUCZDX8G-zcu102 | https://www.xilinx.com/products/boards-and-kits/ek-u1-zcu102-g.html | ++--------------------+----------------------+-----------------------------------------------------------------------+ + +Edge hardware setup +^^^^^^^^^^^^^^^^^^^ +.. note:: + + This section provides instructions for setting up with the `Pynq `__ platform but + Petalinux based flows are also supported. + +1. Download the Pynq v2.5 image for your target (use Z1 or Z2 for + Ultra96 target depending on board version) Link to image: + https://github.com/Xilinx/PYNQ/releases/tag/v2.5 +2. Follow Pynq instructions for setting up the board: `pynq + setup `__ +3. After connecting to the board, make sure to run as root. Execute + ``su`` +4. Set up DPU on Pynq by following the steps here: `DPU Pynq + setup `__ +5. Run the following command to download the DPU bitstream: + + .. code:: bash + + python3 -c 'from pynq_dpu import DpuOverlay ; overlay = DpuOverlay("dpu.bit")' + +6. Check whether the DPU kernel is alive: + + .. code:: bash + + dexplorer -w + +Edge TVM setup +^^^^^^^^^^^^^^ + +.. note:: + + When working on Petalinux instead of Pynq, the following steps might take more manual work (e.g building + hdf5 from source). Also, TVM has a scipy dependency which you then might have to build from source or + circumvent. We don't depend on scipy in our flow. + +Building TVM depends on the Xilinx +`PyXIR `__ package. PyXIR acts as an +interface between TVM and Vitis-AI tools. + +1. First install the PyXIR h5py and pydot dependencies: + + .. code:: bash + + apt-get install libhdf5-dev + pip3 install pydot h5py + +2. Install PyXIR + + .. code:: bash + + git clone --recursive https://github.com/Xilinx/pyxir.git + cd pyxir + sudo python3 setup.py install --use_vai_rt_dpuczdx8g + +3. Build TVM with Vitis-AI + + .. code:: bash + + git clone --recursive https://github.com/apache/incubator-tvm + cd incubator-tvm + mkdir build + cp cmake/config.cmake build + cd build + echo set\(USE_VITIS_AI ON\) >> config.cmake + cmake .. + make + +4. Install TVM + + .. code:: bash + + cd incubator-tvm/python + pip3 install -e . --user + +5. Check whether the setup was successful in the Python shell: + + .. 
code:: bash
+
+      python3 -c 'import pyxir; import tvm'
+
+Getting started
+---------------
+
+This section shows how to use TVM with Vitis-AI. For this, it's important
+to understand that neural network models are quantized for Vitis-AI
+execution in fixed point arithmetic. The approach we take here is to
+quantize on-the-fly using the first N inputs, as explained in the next
+section.
+
+On-the-fly quantization
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Usually, to be able to accelerate inference of Neural Network models
+with Vitis-AI DPU accelerators, those models need to be quantized upfront.
+In the TVM - Vitis-AI flow, we make use of on-the-fly quantization to remove
+this additional preprocessing step. In this flow, one doesn't need to
+quantize their model upfront but can make use of the typical inference
+execution calls (module.run) to quantize the model on-the-fly using the
+first N inputs that are provided (see more information below). This will
+set up and calibrate the Vitis-AI DPU and from that point onwards
+inference will be accelerated for all subsequent inputs. Note that the edge
+flow deviates slightly from the explained flow in that inference won't
+be accelerated after the first N inputs but the model will have been
+quantized and compiled and can be moved to the edge device for
+deployment. Please check out the `edge <#Edge%20usage>`__ usage
+instructions below for more information.
+
+Config/Settings
+~~~~~~~~~~~~~~~
+
+A couple of environment variables can be used to customize the Vitis-AI
+BYOC flow.
+
++----------------------------+----------------------------------------+----------------------------------------------------------------+
+| **Environment Variable**   | **Default if unset**                   | **Explanation**                                                |
++============================+========================================+================================================================+
+| PX\_QUANT\_SIZE            | 128                                    | The number of inputs that will be used for quantization       |
+|                            |                                        | (necessary for Vitis-AI acceleration)                         |
++----------------------------+----------------------------------------+----------------------------------------------------------------+
+| PX\_BUILD\_DIR             | Use the on-the-fly quantization flow   | Loads the quantization and compilation information from the   |
+|                            |                                        | provided build directory and immediately starts Vitis-AI      |
+|                            |                                        | hardware acceleration. This configuration can be used if the  |
+|                            |                                        | model has been executed before using on-the-fly quantization, |
+|                            |                                        | during which the quantization and compilation information was |
+|                            |                                        | cached in a build directory.                                  |
++----------------------------+----------------------------------------+----------------------------------------------------------------+
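+
+For example, to calibrate with 64 inputs instead of the default 128, and
+to reuse the quantization and compilation artifacts cached by an earlier
+run, the variables can be set before the runtime module is executed. A
+minimal sketch; the build directory path below is a placeholder:
+
+.. code:: python
+
+    import os
+
+    # Calibrate with 64 instead of the default 128 inputs
+    os.environ["PX_QUANT_SIZE"] = "64"
+    # Optionally skip on-the-fly quantization by pointing to the build
+    # directory cached by a previous run (illustrative path)
+    os.environ["PX_BUILD_DIR"] = "/path/to/cached/build_dir"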
| ++----------------------------+----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +Cloud usage +~~~~~~~~~~~ + +This section shows how to accelerate a convolutional neural network +model in TVM with Vitis-AI on the cloud. + +To be able to target the Vitis-AI cloud DPUCADX8G target we first have +to import the target in PyXIR. This PyXIR package is the interface being +used by TVM to integrate with the Vitis-AI stack. Additionaly, import +the typical TVM and Relay modules and the Vitis-AI contrib module inside +TVM. + +.. code:: python + + import pyxir + import pyxir.contrib.target.DPUCADX8G + + import tvm + import tvm.relay as relay + from tvm.contrib.target import vitis_ai + from tvm.contrib import util, graph_runtime + from tvm.relay.build_module import bind_params_by_name + from tvm.relay.op.contrib.vitis_ai import annotation + +After importing a convolutional neural network model using the usual +Relay API's, annotate the Relay expression for the given Vitis-AI DPU +target and partition the graph. + +.. code:: python + + mod["main"] = bind_params_by_name(mod["main"], params) + mod = annotation(mod, params, target) + mod = relay.transform.MergeCompilerRegions()(mod) + mod = relay.transform.PartitionGraph()(mod) + +Now, we can build the TVM runtime library for executing the model. The +TVM target is 'llvm' as the operations that can't be handled by the DPU +are executed on the CPU. The Vitis-AI target is DPUCADX8G as we are +targeting the cloud DPU and this target is passed as a config to the TVM +build call. + +.. code:: python + + tvm_target = 'llvm' + target='DPUCADX8G' + + with tvm.transform.PassContext(opt_level=3, config= {'relay.ext.vitis_ai.options.target': target}): + lib = relay.build(mod, tvm_target, params=params) + +As one more step before we can accelerate a model with Vitis-AI in TVM +we have to quantize and compile the model for execution on the DPU. We +make use of on-the-fly quantization for this. Using this method one +doesn’t need to quantize their model upfront and can make use of the +typical inference execution calls (module.run) to calibrate the model +on-the-fly using the first N inputs that are provided. After the first N +iterations, computations will be accelerated on the DPU. So now we will +feed N inputs to the TVM runtime module. Note that these first N inputs +will take a substantial amount of time. + +.. code:: python + + module = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + + # First N (default = 128) inputs are used for quantization calibration and will + # be executed on the CPU + # This config can be changed by setting the 'PX_QUANT_SIZE' (e.g. export PX_QUANT_SIZE=64) + for i in range(128): + module.set_input(input_name, inputs[i]) + module.run() + +Afterwards, inference will be accelerated on the DPU. + +.. code:: python + + module.set_input(name, data) + module.run() + +To save and load the built module, one can use the typical TVM API's: + +.. code:: python + + lib_path = "deploy_lib.so" + lib.export_library(lib_path) + +Load the module from compiled files and run inference + +.. 
+
+Edge usage
+~~~~~~~~~~
+
+This section shows how to accelerate a convolutional neural network
+model in TVM with Vitis-AI at the edge. The first couple of steps will
+have to be run on the host machine and take care of quantization and
+compilation for deployment at the edge.
+
+Host steps
+^^^^^^^^^^
+
+To be able to target the Vitis-AI edge DPUCZDX8G target we first have
+to import the target in PyXIR. This PyXIR package is the interface being
+used by TVM to integrate with the Vitis-AI stack. Additionally, import
+the typical TVM and Relay modules and the Vitis-AI contrib module inside
+TVM.
+
+.. code:: python
+
+    import pyxir
+    import pyxir.contrib.target.DPUCZDX8G
+
+    import tvm
+    import tvm.relay as relay
+    from tvm.contrib.target import vitis_ai
+    from tvm.contrib import util, graph_runtime
+    from tvm.relay.build_module import bind_params_by_name
+    from tvm.relay.op.contrib.vitis_ai import annotation
+
+After importing a convolutional neural network model using the usual
+Relay APIs, annotate the Relay expression for the given Vitis-AI DPU
+target and partition the graph.
+
+.. code:: python
+
+    mod["main"] = bind_params_by_name(mod["main"], params)
+    mod = annotation(mod, params, target)
+    mod = relay.transform.MergeCompilerRegions()(mod)
+    mod = relay.transform.PartitionGraph()(mod)
+
+Now, we can build the TVM runtime library for executing the model. The
+TVM target is 'llvm' as the operations that can't be handled by the DPU
+are executed on the CPU. At this point that means the CPU on the host machine.
+The Vitis-AI target is DPUCZDX8G-zcu104 as we are targeting the edge DPU
+on the ZCU104 board and this target is passed as a config to the TVM
+build call. Note that different identifiers can be passed for different
+targets, see `edge targets info <#edge-requirements>`__. Additionally, we
+provide the 'export_runtime_module' config that points to a file to which we
+can export the Vitis-AI runtime module. We have to do this because we will
+first be compiling and quantizing the model on the host machine before building
+the model for edge deployment. As you will see later on, the exported runtime
+module will be passed to the edge build so that the Vitis-AI runtime module
+can be included.
+
+.. code:: python
+
+    from tvm.contrib import util
+
+    temp = util.tempdir()
+
+    tvm_target = 'llvm'
+    target = 'DPUCZDX8G-zcu104'
+    export_rt_mod_file = temp.relpath("vitis_ai.rtmod")
+
+    with tvm.transform.PassContext(opt_level=3,
+                                   config={'relay.ext.vitis_ai.options.target': target,
+                                           'relay.ext.vitis_ai.options.export_runtime_module': export_rt_mod_file}):
+        lib = relay.build(mod, tvm_target, params=params)
+
+We will quantize and compile the model for execution on the DPU using on-the-fly
+quantization on the host machine. This makes use of TVM inference calls
+(module.run) to quantize the model on the host with the first N inputs.
+
+.. code:: python
+
+    module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
+
+    # First N (default = 128) inputs are used for quantization calibration and will
+    # be executed on the CPU
+    # This config can be changed by setting the 'PX_QUANT_SIZE' (e.g. export PX_QUANT_SIZE=64)
+    for i in range(128):
+        module.set_input(input_name, inputs[i])
+        module.run()
+
+Save the TVM lib module so that the Vitis-AI runtime module will also be exported
+(to the 'export_runtime_module' path we previously passed as a config).
+
+.. code:: python
+
+    from tvm.contrib import util
+
+    temp = util.tempdir()
+    lib.export_library(temp.relpath("tvm_lib.so"))
+
+After quantizing and compiling the model for Vitis-AI acceleration using the
+first N inputs we can build the model for execution on the ARM edge device.
+Here we pass the previously exported Vitis-AI runtime module so it can be included
+in the TVM build.
+
+.. code:: python
+
+    # Export lib for aarch64 target
+    tvm_target = tvm.target.arm_cpu('ultra96')
+    lib_kwargs = {
+        'fcompile': contrib.cc.create_shared,
+        'cc': "/usr/aarch64-linux-gnu/bin/ld"
+    }
+
+    with tvm.transform.PassContext(opt_level=3,
+                                   config={'relay.ext.vitis_ai.options.load_runtime_module': export_rt_mod_file}):
+        lib_arm = relay.build(mod, tvm_target, params=params)
+
+    lib_arm.export_library('tvm_dpu_arm.so', **lib_kwargs)
+
+Now, move the TVM build files (tvm\_dpu\_arm.json, tvm\_dpu\_arm.so,
+tvm\_dpu\_arm.params) to the edge device. For information on setting
+up the edge device check out the `edge setup <#edge-dpuczdx8g>`__
+section.
+
+Edge steps
+^^^^^^^^^^
+
+After setting up TVM with Vitis-AI on the edge device, you can now load
+the TVM runtime module into memory and feed inputs for inference.
+
+.. code:: python
+
+    ctx = tvm.cpu()
+
+    # load the module into memory
+    lib = tvm.runtime.load_module("tvm_dpu_arm.so")
+
+    module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
+    module.set_input(name, data)
+    module.run()
diff --git a/python/tvm/contrib/target/vitis_ai.py b/python/tvm/contrib/target/vitis_ai.py
new file mode 100644
index 000000000000..d4931d9e3f48
--- /dev/null
+++ b/python/tvm/contrib/target/vitis_ai.py
@@ -0,0 +1,156 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +"""Utility to offload (sub-)models to Vitis-AI""" + +import warnings + +import pyxir +import pyxir.frontend.tvm + +from tvm.relay.expr import Tuple, Call, TupleGetItem +import tvm._ffi + + +class CodegenVitisAI: + + """Traverse Relay expression and convert into PyXIR XGraph format""" + + def __init__(self, model_name, function): + self.model_name = model_name + self.function = function + self.params = {} + + def convert_pyxir(self, target): + """Convert Relay expression to PyXIR XGraph""" + xgraph = pyxir.frontend.tvm.from_relay( + self.function, params=self.params, postprocessing=None + ) + xgraph = pyxir.partition(xgraph, targets=[target]) + return xgraph + + def get_output_names(self): + """Get output names from Relay expression""" + func = self.function + output_relay_ids = [] + expr = func.body + if isinstance(expr, Tuple): + for field in expr.fields: + output_relay_ids.append(hash(field)) + elif isinstance(expr, Call): + output_relay_ids.append(hash(expr)) + elif isinstance(expr, TupleGetItem): + output_relay_ids.append(hash(expr.tuple_value)) + else: + raise ValueError("Vitis-AI codegen does not support {} as output".format(type(expr))) + return output_relay_ids + + +@tvm._ffi.register_func("relay.ext.vitis_ai") +def vitis_ai_compiler(ref): + """Create a Vitis-AI runtime from the provided Relay expression""" + assert isinstance(ref, tvm.relay.function.Function) + + out_tensor_names = [] + name = str(ref.attrs.global_symbol) + + pass_context = tvm.get_global_func("transform.GetCurrentPassContext")() + + # The target Vitis-AI accelerator device + target = ( + str(pass_context.config["relay.ext.vitis_ai.options.target"]) + if "relay.ext.vitis_ai.options.target" in pass_context.config + else None + ) + + # (Optional configs) The build and work directories to be used by Vitis-AI + vai_build_dir = ( + str(pass_context.config["relay.ext.vitis_ai.options.build_dir"]) + if "relay.ext.vitis_ai.options.build_dir" in pass_context.config + else tvm.contrib.utils.tempdir().relpath("") + ) + vai_work_dir = ( + str(pass_context.config["relay.ext.vitis_ai.options.work_dir"]) + if "relay.ext.vitis_ai.options.work_dir" in pass_context.config + else tvm.contrib.utils.tempdir().relpath("") + ) + + # (Optional configs) Export and load PyXIR runtime module to file if provided. This is used to + # compile and quantize a model on the host and deploy it at the edge + export_runtime_module = ( + str(pass_context.config["relay.ext.vitis_ai.options.export_runtime_module"]) + if "relay.ext.vitis_ai.options.export_runtime_module" in pass_context.config + else "" + ) + load_runtime_module = ( + str(pass_context.config["relay.ext.vitis_ai.options.load_runtime_module"]) + if "relay.ext.vitis_ai.options.load_runtime_module" in pass_context.config + else "" + ) + + # Config checks + if load_runtime_module and target is not None: + warnings.warn( + "Both `load_runtime_module` and `target` configs were specified." + " The `load_runtime_module` points to a prebuilt runtime module with" + " an internal target so the `target` config will be ignored" + ) + if load_runtime_module and "relay.ext.vitis_ai.options.build_dir" in pass_context.config: + warnings.warn( + "Both `load_runtime_module` and `build_dir` configs were specified." 
+ " The `load_runtime_module` points to a prebuilt runtime module with" + " an internal build directory so the `build_dir` config will be ignored" + ) + if load_runtime_module and "relay.ext.vitis_ai.options.work_dir" in pass_context.config: + warnings.warn( + "Both `load_runtime_module` and `work_dir` configs were specified." + " The `load_runtime_module` points to a prebuilt runtime module with" + " an internal work directory so the `work_dir` config will be ignored" + ) + + # If load_runtime_module is not set, we will build the PyXIR runtime module from scratch + if load_runtime_module == "": + # Convert Relay expression into XGraph and do partitioning inside PyXIR + builder = CodegenVitisAI(name, ref) + xgraph = builder.convert_pyxir(target) + output_relay_ids = builder.get_output_names() + layers = xgraph.get_layers() + + # Get the output tensor names using XGraph and output Relay ids + out_tensor_names = [] + for layer in layers: + if not layer.internal: + for relay_id in layer.attrs["relay_id"]: + if relay_id in output_relay_ids: + out_tensor_names.append(layer.name) + break + if not out_tensor_names: + raise ValueError( + "During codegeneration the loading of subexpression \ + failed due to output tensor name mismatch in Relay PyXIR interface." + ) + xgraph.meta_attrs["tvm_out_tensors"] = out_tensor_names + xgraph_str = pyxir.get_xgraph_str(xgraph) + + runtime_func = "tvm.vitis_ai_runtime.from_xgraph" + fcreate = tvm._ffi.get_global_func(runtime_func) + return fcreate(name, xgraph_str, target, vai_build_dir, vai_work_dir, export_runtime_module) + + runtime_func = "tvm.vitis_ai_runtime.from_rt_mod" + fcreate = tvm._ffi.get_global_func(runtime_func) + return fcreate(name, load_runtime_module, export_runtime_module) diff --git a/python/tvm/relay/op/contrib/vitis_ai.py b/python/tvm/relay/op/contrib/vitis_ai.py new file mode 100644 index 000000000000..fa17c63fc00a --- /dev/null +++ b/python/tvm/relay/op/contrib/vitis_ai.py @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument, no-else-return, E1102 +"""Vitis-AI codegen annotation of supported operators""" + +import numpy as np + +import pyxir +import pyxir.frontend.tvm + +from tvm import relay +import tvm._ffi +from tvm.relay.expr import Tuple, TupleGetItem +from tvm.relay import transform +from tvm.relay.op.annotation import compiler_begin, compiler_end + + +@transform.function_pass(opt_level=0) +class VitisAIAnnotationPass: + """Responsible for annotating Relay expressions for Vitis-AI DPU accelerators""" + + def __init__(self, compiler, relay_ids): + self.compiler = compiler + self.relay_ids = relay_ids + + def transform_function(self, func, mod, ctx): + """Transform function for annotating Relay module""" + annotator = self + + class Annotator(tvm.relay.ExprMutator): + """Annotator for Vitis-AI DPU accelerators""" + + def visit_tuple(self, tup): + """Add compiler_begin and compiler_end annotations to Tuple""" + field_list = [] + cond = int(hash(tup)) + for field in tup.fields: + if cond in annotator.relay_ids: + field_list.append(compiler_begin(super().visit(field), annotator.compiler)) + else: + field_list.append(super().visit(field)) + if cond in annotator.relay_ids: + return compiler_end(Tuple(field_list), annotator.compiler) + else: + return Tuple(field_list) + + def visit_tuple_getitem(self, op): + """Add compiler_begin and compiler_end annotations to TupleGetItem""" + if int(hash(op.tuple_value)) in annotator.relay_ids: + tuple_value = compiler_begin(super().visit(op.tuple_value), annotator.compiler) + return compiler_end(TupleGetItem(tuple_value, op.index), annotator.compiler) + else: + tuple_value = super().visit(op.tuple_value) + return TupleGetItem(tuple_value, op.index) + + def visit_call(self, call): + """Add compiler_begin and compiler_end annotations to the Call expr""" + if int(hash(call)) in annotator.relay_ids: + new_args = [] + for arg in call.args: + ann = compiler_begin(super().visit(arg), annotator.compiler) + new_args.append(ann) + new_call = relay.Call(call.op, new_args, call.attrs, call.type_args) + return compiler_end(new_call, annotator.compiler) + + else: + return super().visit_call(call) + + return Annotator().visit(func) + + +def annotation(mod, params, target): + """Annotate Relay expression for Vitis-AI DPU accelerators""" + xgraph = pyxir.frontend.tvm.from_relay(mod, params, postprocessing=None) + xgraph = pyxir.partition(xgraph, targets=[target]) + + layers = xgraph.get_layers() + relay_ids = [ + list(np.array(layer.attrs["relay_id"]).flatten()) + for layer in layers + if layer.target == target + ] + relay_ids_flatten = [item for sublist in relay_ids for item in sublist] + mod = VitisAIAnnotationPass("vitis_ai", relay_ids_flatten)(mod) + + return mod diff --git a/src/relay/backend/contrib/vitis_ai/config_vitis_ai.cc b/src/relay/backend/contrib/vitis_ai/config_vitis_ai.cc new file mode 100644 index 000000000000..f74b5306c5f4 --- /dev/null +++ b/src/relay/backend/contrib/vitis_ai/config_vitis_ai.cc @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/vitis_ai/config_vitis_ai.cc + * \brief Register Vitis-AI codegen options. Main codegen is implemented in python. + */ + +#include + +namespace tvm { +namespace relay { +namespace contrib { +namespace vitis_ai { + +/*! \brief The target Vitis-AI accelerator device */ +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.vitis_ai.options.target", String); +/*! \brief (Optional config) The build directory to be used by Vitis-AI */ +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.vitis_ai.options.build_dir", String); +/*! \brief (Optional config) The work directory to be used by Vitis-AI */ +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.vitis_ai.options.work_dir", String); +/*! \brief (Optional config) Export PyXIR runtime module to disk during serialization if provided */ +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.vitis_ai.options.export_runtime_module", String); +/*! \brief (Optional config) Load PyXIR runtime module from disk */ +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.vitis_ai.options.load_runtime_module", String); + +} // namespace vitis_ai +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc new file mode 100755 index 000000000000..37dc767d31af --- /dev/null +++ b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file vitis_ai_runtime.cc + */ + +#include "vitis_ai_runtime.h" + +#include + +#include +#include +#include +#include + +using namespace pyxir::runtime; + +namespace tvm { +namespace runtime { + +VitisAIRuntime::VitisAIRuntime(const std::string& symbol_name, const Array const_names, + const std::string& serialized_rt_mod, + const std::string& export_rt_mod_path) + : symbol_name_(symbol_name), + const_names_(const_names), + export_rt_mod_path_(export_rt_mod_path) { + std::istringstream sstream(serialized_rt_mod); + rt_mod_.reset(new RuntimeModule()); + rt_mod_->deserialize(sstream); + in_tensor_names_ = rt_mod_->get_in_tensor_names(); + out_tensor_names_ = rt_mod_->get_out_tensor_names(); +} + +VitisAIRuntime::VitisAIRuntime(const std::string& symbol_name, const std::string& xgraph_str, + const Array const_names, const std::string& target, + const std::string& build_dir, const std::string& work_dir, + const std::string& export_rt_mod_path) + : symbol_name_(symbol_name), + const_names_(const_names), + export_rt_mod_path_(export_rt_mod_path) { + std::istringstream xgraph_sstream(xgraph_str); + pyxir::XGraphHolder xgraph = std::make_shared(""); + pyxir::read(xgraph, xgraph_sstream); + in_tensor_names_ = xgraph->get_input_names(); + out_tensor_names_ = xgraph->get_meta_attr("tvm_out_tensors").get_strings(); + + pyxir::partition(xgraph, std::vector{target}, ""); + + pyxir::RunOptionsHolder run_options(new pyxir::runtime::RunOptions()); + run_options->on_the_fly_quantization = true; + run_options->build_dir = build_dir; + if (!work_dir.empty()) run_options->work_dir = work_dir; + rt_mod_ = + pyxir::build_rt(xgraph, target, in_tensor_names_, out_tensor_names_, "vai", run_options); +} + +Module VitisAIRuntimeCreate(const std::string& name, const std::string& xgraph_str, + const std::string& target, const std::string& build_dir, + const std::string& work_dir, const std::string& export_rt_mod_path) { + Array const_vars; + auto exec = make_object(name, xgraph_str, const_vars, target, build_dir, work_dir, + export_rt_mod_path); + return Module(exec); +} + +TVM_REGISTER_GLOBAL("tvm.vitis_ai_runtime.from_xgraph").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = VitisAIRuntimeCreate(args[0], args[1], args[2], args[3], args[4], args[5]); +}); + +Module VitisAIRuntimeCreate(const std::string& name, const std::string& serialized_rt_mod, + const std::string& export_rt_mod_path) { + Array const_vars; + auto exec = make_object(name, const_vars, serialized_rt_mod, export_rt_mod_path); + return Module(exec); +} + +TVM_REGISTER_GLOBAL("tvm.vitis_ai_runtime.from_rt_mod").set_body([](TVMArgs args, TVMRetValue* rv) { + std::string load_rt_mod_path = args[1]; + assert(!load_rt_mod_path.empty()); + std::ifstream in_file(load_rt_mod_path); + std::stringstream buffer; + buffer << in_file.rdbuf(); + std::string serialized_rt_mod = buffer.str(); + in_file.close(); + *rv = VitisAIRuntimeCreate(args[0], serialized_rt_mod, args[2]); +}); + +Module VitisAIRuntimeLoadFromBinary(void* strm) { + dmlc::Stream* stream = static_cast(strm); + std::string symbol_name; + std::vector const_vars; + std::string serialized_rt_mod; + std::string export_rt_mod_path; + stream->Read(&serialized_rt_mod); + stream->Read(&export_rt_mod_path); + stream->Read(&symbol_name); + stream->Read(&const_vars); + Array const_names; + for (const auto& it : const_vars) { + const_names.push_back(it); + } + auto exec = + make_object(symbol_name, const_names, serialized_rt_mod, export_rt_mod_path); + return Module(exec); +} + 
+TVM_REGISTER_GLOBAL("runtime.module.loadbinary_VitisAIRuntime") + .set_body_typed(VitisAIRuntimeLoadFromBinary); + +void VitisAIRuntime::SaveToBinary(dmlc::Stream* stream) { + std::ostringstream sstream; + rt_mod_->serialize(sstream); + stream->Write(sstream.str()); + stream->Write(export_rt_mod_path_); + stream->Write(symbol_name_); + std::vector consts; + for (const auto& it : const_names_) { + consts.push_back(it); + } + stream->Write(consts); + + // If export_rt_mod_path_ member variable is set, we will additionally export the PyXIR + // runtime_module to the specified file + if (!export_rt_mod_path_.empty()) { + std::ofstream out_file(export_rt_mod_path_); + out_file << sstream.str(); + out_file.close(); + } +} + +PackedFunc VitisAIRuntime::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { + if (name == "get_symbol") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->symbol_name_; }); + } else if (name == "get_const_vars") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->const_names_; }); + } else if ("__init_" + this->symbol_name_ == name) { + // The function to initialize constant tensors. + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + CHECK_EQ(args.size(), 1U); + this->initialized_ = true; + *rv = 0; + }); + } else if (this->symbol_name_ == name) { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + // Initialize input tensors + DLTensor* inputs = args[0]; + std::vector in_tensors; + std::vector in_shape; + for (int i = 0; i < inputs->ndim; ++i) in_shape.push_back(inputs->shape[i]); + in_tensors.push_back(std::shared_ptr( + new pyxir::XBuffer(reinterpret_cast(static_cast(inputs->data)), 4, "f", + in_shape.size(), in_shape, false, false))); + + // Initialize output tensors + std::vector out_tensors; + for (unsigned i = 0; i < out_tensor_names_.size(); ++i) { + DLTensor* output_tensor = args[args.size() - out_tensor_names_.size() + i]; + std::vector out_shape; + for (int i = 0; i < output_tensor->ndim; ++i) out_shape.push_back(output_tensor->shape[i]); + void* output_data = reinterpret_cast(static_cast(output_tensor->data)); + out_tensors.push_back(std::shared_ptr( + new pyxir::XBuffer(output_data, 4, "f", out_shape.size(), out_shape, false, false))); + } + + // Execute the subgraph. + rt_mod_->execute(in_tensors, out_tensors); + }); + } else { + return PackedFunc(); + } +} + +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.h b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.h new file mode 100755 index 000000000000..1092bc0ba27b --- /dev/null +++ b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.h @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \brief Vitis-AI runtime that can run models
+ *        containing only tvm PackedFunc.
+ * \file vitis_ai_runtime.h
+ */
+#ifndef TVM_RUNTIME_CONTRIB_VITIS_AI_VITIS_AI_RUNTIME_H_
+#define TVM_RUNTIME_CONTRIB_VITIS_AI_VITIS_AI_RUNTIME_H_
+#include <dlpack/dlpack.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/ndarray.h>
+// clang-format off
+#include <memory>
+#include <string>
+#include <vector>
+// clang-format on
+#include <pyxir/pyxir.hpp>
+#include <pyxir/runtime/run_options.hpp>
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief VAI runtime.
+ *
+ *  This runtime can be accessed in various languages via the
+ *  TVM runtime PackedFunc API.
+ */
+class VitisAIRuntime : public ModuleNode {
+ public:
+  /*!
+   * \brief Create VitisAI runtime from a serialized PyXIR runtime module
+   * \param symbol_name The name of the function.
+   * \param const_names The names of each constant in the sub-graph.
+   * \param serialized_rt_mod The serialized runtime module.
+   * \param export_rt_mod_path The path to the file to be used for exporting the
+   *        PyXIR runtime module.
+   */
+  VitisAIRuntime(const std::string& symbol_name, const Array<String> const_names,
+                 const std::string& serialized_rt_mod, const std::string& export_rt_mod_path);
+
+  /*!
+   * \brief Create VitisAI runtime from a serialized XGraph
+   * \param symbol_name The name of the function.
+   * \param xgraph_str The serialized XGraph representation.
+   * \param const_names The names of each constant in the sub-graph.
+   * \param target The Vitis-AI device target (e.g. DPUCADX8G, DPUCZDX8G).
+   * \param build_dir The directory to be used for Vitis-AI build files.
+   * \param work_dir The directory to be used for Vitis-AI work files.
+   * \param export_rt_mod_path The path to the file to be used for exporting the
+   *        PyXIR runtime module.
+   */
+  VitisAIRuntime(const std::string& symbol_name, const std::string& xgraph_str,
+                 const Array<String> const_names, const std::string& target,
+                 const std::string& build_dir, const std::string& work_dir,
+                 const std::string& export_rt_mod_path);
+
+  /*!
+   * \brief Get member function to front-end.
+   * \param name The name of the function.
+   * \param sptr_to_self The pointer to the module node.
+   * \return The corresponding member function.
+   */
+  virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self);
+
+  /*!
+   * \return The type key of the executor.
+   */
+  const char* type_key() const { return "VitisAIRuntime"; }
+
+  /*!
+   * \brief Serialize the content of the pyxir directory and save it to
+   *        binary stream.
+   * \param stream The binary stream to save to.
+   */
+  void SaveToBinary(dmlc::Stream* stream) final;
+
+ private:
+  /*! \brief The only subgraph name for this module */
+  std::string symbol_name_;
+  /*! \brief The required constant names */
+  Array<String> const_names_;
+  /*! \brief The runtime module */
+  pyxir::RtModHolder rt_mod_;
+  /*! \brief The XGraph input tensor names in the order as provided by TVM */
+  std::vector<std::string> in_tensor_names_;
+  /*! \brief The XGraph output tensor names in the order as provided by TVM */
+  std::vector<std::string> out_tensor_names_;
+  /*! \brief The file path for exporting the runtime module if set */
+  std::string export_rt_mod_path_;
+  /*!
\brief Whether constant tensors have been initialized */ + bool initialized_{false}; +}; + +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_VITIS_AI_VITIS_AI_RUNTIME_H_ diff --git a/tests/python/contrib/test_vitis_ai/__init__.py b/tests/python/contrib/test_vitis_ai/__init__.py new file mode 100644 index 000000000000..c5fe1539b059 --- /dev/null +++ b/tests/python/contrib/test_vitis_ai/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Infrastructure and tests for Vitis-AI codegen """ diff --git a/tests/python/contrib/test_vitis_ai/infrastructure.py b/tests/python/contrib/test_vitis_ai/infrastructure.py new file mode 100644 index 000000000000..df7836a37647 --- /dev/null +++ b/tests/python/contrib/test_vitis_ai/infrastructure.py @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, W0611, C0413
+
+"""Expose Vitis-AI test functions to the Python frontend"""
+
+import sys
+import numpy as np
+
+import pytest
+
+pytest.importorskip("pyxir")
+import pyxir.contrib.target.DPUCADX8G
+import pyxir.contrib.target.DPUCZDX8G
+
+import tvm
+from tvm import relay
+from tvm import runtime
+from tvm.relay import transform
+from tvm.relay.op.contrib.vitis_ai import annotation
+from tvm.relay.build_module import bind_params_by_name
+from tvm.contrib.target import vitis_ai
+from tvm.contrib import graph_runtime
+from tvm.contrib import utils
+
+
+def get_cpu_op_count(mod):
+    """Traverse graph counting ops offloaded to TVM."""
+
+    class Counter(tvm.relay.ExprVisitor):
+        def __init__(self):
+            super().__init__()
+            self.count = 0
+
+        def visit_call(self, call):
+            if isinstance(call.op, tvm.ir.Op):
+                self.count += 1
+
+            super().visit_call(call)
+
+    c = Counter()
+    c.visit(mod["main"])
+    return c.count
+
+
+def skip_test():
+    """Skip test if it requires the Vitis-AI codegen and it's not present."""
+    if not tvm.get_global_func("relay.ext.vitis_ai", True):
+        print("Skip test because Vitis-AI codegen is not available.")
+        return True
+    return False
+
+
+def build_module(
+    mod,
+    target,
+    dpu_target="DPUCADX8G",
+    params=None,
+    enable_vitis_ai=True,
+    tvm_ops=0,
+    vitis_ai_partitions=1,
+):
+    """Build module for Vitis-AI codegen."""
+    if isinstance(mod, tvm.relay.expr.Call):
+        mod = tvm.IRModule.from_expr(mod)
+    if params is None:
+        params = {}
+
+    with tvm.transform.PassContext(
+        opt_level=3, config={"relay.ext.vitis_ai.options.target": dpu_target}
+    ):
+        if enable_vitis_ai:
+            mod["main"] = bind_params_by_name(mod["main"], params)
+            mod = annotation(mod, params, dpu_target)
+            mod = transform.MergeCompilerRegions()(mod)
+            mod = transform.PartitionGraph()(mod)
+            tvm_op_count = get_cpu_op_count(mod)
+            assert tvm_op_count == tvm_ops, "Got {} TVM operators, expected {}".format(
+                tvm_op_count, tvm_ops
+            )
+            partition_count = 0
+            for global_var in mod.get_global_vars():
+                if "vitis_ai" in global_var.name_hint:
+                    partition_count += 1
+
+            assert (
+                vitis_ai_partitions == partition_count
+            ), "Got {} Vitis-AI partitions, expected {}".format(
+                partition_count, vitis_ai_partitions
+            )
+        relay.backend.compile_engine.get().clear()
+        return relay.build(mod, target, params=params)
+
+
+def update_lib(lib, cross_compile=None):
+    tmp_path = utils.tempdir()
+    lib_name = "lib.so"
+    lib_path = tmp_path.relpath(lib_name)
+    if cross_compile:
+        lib.export_library(lib_path, cc=cross_compile)
+    else:
+        lib.export_library(lib_path)
+    lib = runtime.load_module(lib_path)
+    return lib
+
+
+def extract_vitis_ai_modules(module):
+    """Get the Vitis-AI runtime modules from the llvm module."""
+    return list(
+        filter(lambda mod: mod.type_key == "VitisAIRuntime", module.get_lib().imported_modules)
+    )
+
+
+def verify_codegen(
+    module, num_vitis_ai_modules=1, params=None, target="llvm", dpu_target="DPUCADX8G"
+):
+    """Check Vitis-AI codegen against a known good output."""
+    module = build_module(module, target, params=params, dpu_target=dpu_target)
+    vitis_ai_modules = extract_vitis_ai_modules(module)
+
+    assert len(vitis_ai_modules) == num_vitis_ai_modules, (
+        f"The number of Vitis-AI modules produced ({len(vitis_ai_modules)}) does not "
+        f"match the expected value ({num_vitis_ai_modules})."
+    )
+
+
+def verify_result(
+    mod,
+    map_inputs,
+    out_shape,
+    result,
+    tol=1e-5,
+    target="llvm",
+    ctx=tvm.cpu(),
+    params=None,
+    dpu_target="DPUCADX8G",
+    tvm_ops=0,
+):
+    """Check the result of the BYOC Vitis-AI flow against a reference result."""
+
+    lib = build_module(mod, target, params=params, dpu_target=dpu_target, tvm_ops=tvm_ops)
+    lib = update_lib(lib)
+    ctx = tvm.cpu()
+    rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
+
+    for name, data in map_inputs.items():
+        rt_mod.set_input(name, data)
+    rt_mod.set_input(**params)
+    rt_mod.run()
+
+    out_shapes = out_shape if isinstance(out_shape, list) else [out_shape]
+    results = result if isinstance(result, list) else [result]
+
+    for idx, shape in enumerate(out_shapes):
+        out = tvm.nd.empty(shape, ctx=ctx)
+        out = rt_mod.get_output(idx, out)
+        tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=tol, atol=tol)
diff --git a/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py b/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py
new file mode 100644
index 000000000000..4d5d5dc92c41
--- /dev/null
+++ b/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py
@@ -0,0 +1,336 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, W0611, C0413 + +"""Vitis-AI codegen tests""" + +import sys +import numpy as np + +import pytest + +pytest.importorskip("pyxir") +import pyxir.contrib.target.DPUCADX8G +import pyxir.contrib.target.DPUCZDX8G + +import tvm +from tvm import relay +from tvm.relay import transform +from tvm.relay.op.contrib.vitis_ai import annotation +from tvm.relay.build_module import bind_params_by_name +from tvm.contrib.target import vitis_ai + +from .infrastructure import skip_test, verify_codegen + + +def set_func_attr(func, compile_name, symbol_name): + func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1)) + func = func.with_attr("Compiler", compile_name) + func = func.with_attr("global_symbol", symbol_name) + return func + + +def test_conv2d(): + """Test conv2d operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + x = relay.var("x", shape=(1, 3, 224, 224)) + w = relay.const(np.zeros((16, 3, 3, 3), dtype="float32")) + y = relay.nn.conv2d(x, w, strides=[2, 2], padding=[1, 1, 1, 1], kernel_size=[3, 3]) + func = relay.Function([x], y) + params = {} + params["x"] = np.zeros((1, 3, 224, 224), dtype="float32") + params["w"] = np.random.rand(16, 3, 3, 3).astype("float32") + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, params=params, dpu_target="DPUCADX8G") + verify_codegen(mod, params=params, dpu_target="DPUCZDX8G-zcu104") + + +def test_depthwise_conv(): + """Test depthwise_conv operator for Vitis-AI DPUCZDX8G-zcu104 target""" + + dtype = "float32" + ishape = (1, 32, 14, 14) + wshape = (32, 1, 3, 3) + data = relay.var("data", shape=(ishape), dtype=dtype) + weights = relay.var("weights", shape=(wshape), dtype=dtype) + depthwise_conv2d = relay.nn.conv2d(data, weights, kernel_size=(3, 3), padding=(1, 1), groups=32) + func = relay.Function([data, weights], depthwise_conv2d) + params = {} + params["weights"] = np.random.randn(32, 1, 3, 3).astype(dtype) + params["data"] = np.random.randn(1, 32, 14, 14).astype(dtype) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, params=params, dpu_target="DPUCZDX8G-zcu104") + + +def test_bias_add(): + """Test bias_add operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + dtype = "float32" + ishape = (1, 32, 14, 14) + data = relay.var("data", shape=(ishape), dtype=dtype) + bias = relay.var("bias", relay.TensorType((32,), dtype)) + out = relay.nn.bias_add(data, bias) + func = relay.Function([data, bias], out) + params = {} + params["bias"] = np.random.randn(32).astype(dtype) + params["data"] = np.random.randn(1, 32, 14, 14).astype(dtype) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, params=params, dpu_target="DPUCADX8G") + verify_codegen(mod, params=params, dpu_target="DPUCZDX8G-zcu104") + + +def test_relu(): + """Test relu operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + shape = (10, 10) + x = relay.var("x", shape=shape) + y = relay.nn.relu(x) + func = relay.Function([x], y) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, dpu_target="DPUCADX8G") + verify_codegen(mod, dpu_target="DPUCZDX8G-zcu104") + + +def test_batchnorm(): + """Test batchnorm operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + data = relay.var("data", shape=(1, 16, 112, 112)) + bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32")) + bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32")) + bn_mmean = 
relay.var("bn_mean", relay.TensorType((16,), "float32")) + bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32")) + bn_output = relay.nn.batch_norm(data, bn_gamma, bn_beta, bn_mmean, bn_mvar) + func = relay.Function([data, bn_gamma, bn_beta, bn_mmean, bn_mvar], bn_output[0]) + params = {} + params["data"] = np.zeros((1, 16, 112, 112), dtype="float32") + params["bn_gamma"] = np.random.rand(16).astype("float32") + params["bn_beta"] = np.random.rand(16).astype("float32") + params["bn_mean"] = np.random.rand(16).astype("float32") + params["bn_var"] = np.random.rand(16).astype("float32") + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, params=params, dpu_target="DPUCADX8G") + verify_codegen(mod, params=params, dpu_target="DPUCZDX8G-zcu104") + + +def test_add(): + """Test add operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + shape = (10, 10) + x = relay.var("x", shape=shape) + y = x + x + func = relay.Function([x], y) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, dpu_target="DPUCADX8G") + verify_codegen(mod, dpu_target="DPUCZDX8G-zcu104") + + +def test_global_avg_pool2d(): + """Test global_avg_pool2d operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + shape = (10, 10, 7, 7) + x = relay.var("x", shape=shape) + y = relay.nn.global_avg_pool2d(x) + func = relay.Function([x], y) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, dpu_target="DPUCADX8G") + verify_codegen(mod, dpu_target="DPUCZDX8G-zcu104") + + +def test_avg_pool2d(): + """Test avg_pool2d for operator Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + shape = (10, 10, 10, 10) + x = relay.var("x", shape=shape) + y = relay.nn.avg_pool2d(x, pool_size=(3, 3)) + func = relay.Function([x], y) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, dpu_target="DPUCADX8G") + verify_codegen(mod, dpu_target="DPUCZDX8G-zcu104") + + +def test_max_pool2d(): + """Test max_pool2d for operator Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + shape = (64, 512, 10, 10) + x = relay.var("x", shape=shape) + y = relay.nn.max_pool2d(x, pool_size=(3, 3)) + func = relay.Function([x], y) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, dpu_target="DPUCADX8G") + verify_codegen(mod, dpu_target="DPUCZDX8G-zcu104") + + +def test_global_max_pool2d(): + """Test global_maxpool2d operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + shape = (1, 512, 7, 7) + x = relay.var("x", shape=shape) + y = relay.nn.global_max_pool2d(x) + func = relay.Function([x], y) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, dpu_target="DPUCADX8G") + verify_codegen(mod, dpu_target="DPUCZDX8G-zcu104") + + +def test_upsampling(): + """Test upsampling operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + shape = (64, 512, 10, 10) + x = relay.var("x", shape=shape) + y = relay.nn.upsampling(x, scale_h=2, scale_w=2) + func = relay.Function([x], y) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, dpu_target="DPUCADX8G") + verify_codegen(mod, dpu_target="DPUCZDX8G-zcu104") + + +def test_conv2d_transpose(): + """Test conv2d_transpose operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + dshape = (1, 3, 18, 18) + kshape = (3, 10, 3, 3) + x = relay.var("x", shape=dshape) + w = relay.const(np.zeros(kshape, dtype="float32")) + y = relay.nn.conv2d_transpose( + x, w, channels=10, kernel_size=(3, 3), strides=(1, 1), padding=(1, 1) + ) + func = relay.Function([x], y) + params = {} + dtype = "float32" + 
params["x"] = np.random.uniform(size=dshape).astype(dtype) + params["w"] = np.random.uniform(size=kshape).astype(dtype) + mod = tvm.IRModule() + mod["main"] = func + verify_codegen(mod, params=params, dpu_target="DPUCADX8G") + verify_codegen(mod, params=params, dpu_target="DPUCZDX8G-zcu104") + + +def test_annotate(): + """Test annotation operator for Vitis-AI DPUCADX8G and DPUCZDX8G-zcu104 targets""" + + def partition(dpu_target): + data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32")) + weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32")) + bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32")) + bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32")) + bn_mmean = relay.var("bn_mean", relay.TensorType((16,), "float32")) + bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32")) + + conv = relay.nn.conv2d( + data=data, weight=weight, kernel_size=(3, 3), channels=16, padding=(1, 1) + ) + bn_output = relay.nn.batch_norm(conv, bn_gamma, bn_beta, bn_mmean, bn_mvar) + + func = relay.Function( + [data, weight, bn_gamma, bn_beta, bn_mmean, bn_mvar], bn_output.astuple() + ) + mod = tvm.IRModule() + mod["main"] = func + params = {} + params["weight"] = np.random.rand(16, 3, 3, 3).astype("float32") + params["bn_gamma"] = np.random.rand(16).astype("float32") + params["bn_beta"] = np.random.rand(16).astype("float32") + params["bn_mean"] = np.random.rand(16).astype("float32") + params["bn_var"] = np.random.rand(16).astype("float32") + mod = annotation(mod, params, dpu_target) + + opt_pass = tvm.transform.Sequential( + [ + transform.MergeCompilerRegions(), + transform.PartitionGraph(), + ] + ) + + with tvm.transform.PassContext(opt_level=3): + mod = opt_pass(mod) + + return mod + + def expected(): + # function variables for conv2d + data0 = relay.var("data0", relay.TensorType((1, 3, 224, 224), "float32")) + weight0 = relay.var("weight0", relay.TensorType((16, 3, 3, 3), "float32")) + conv = relay.nn.conv2d( + data=data0, weight=weight0, kernel_size=(3, 3), channels=16, padding=(1, 1) + ) + + # function variables for batch_norm + bn_gamma0 = relay.var("bn_gamma0", relay.TensorType((16,), "float32")) + bn_beta0 = relay.var("bn_beta0", relay.TensorType((16,), "float32")) + bn_mmean0 = relay.var("bn_mean0", relay.TensorType((16,), "float32")) + bn_mvar0 = relay.var("bn_var0", relay.TensorType((16,), "float32")) + bn = relay.nn.batch_norm(conv, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0) + func0 = relay.Function( + [data0, weight0, bn_gamma0, bn_beta0, bn_mmean0, bn_mvar0], bn.astuple() + ) + func0 = set_func_attr(func0, "vitis_ai", "vitis_ai_0") + gv0 = relay.GlobalVar("vitis_ai_0") + mod = tvm.IRModule() + mod[gv0] = func0 + mod = relay.transform.InferType()(mod) + + # main function + data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32")) + weight = relay.var("weight", relay.TensorType((16, 3, 3, 3), "float32")) + bn_gamma = relay.var("bn_gamma", relay.TensorType((16,), "float32")) + bn_beta = relay.var("bn_beta", relay.TensorType((16,), "float32")) + bn_mmean = relay.var("bn_mean", relay.TensorType((16,), "float32")) + bn_mvar = relay.var("bn_var", relay.TensorType((16,), "float32")) + call0 = gv0(data, weight, bn_gamma, bn_beta, bn_mmean, bn_mvar) + mod["main"] = relay.Function([data, weight, bn_gamma, bn_beta, bn_mmean, bn_mvar], call0) + mod = relay.transform.InferType()(mod) + return mod + + partitioned_dpuczdx8g_zcu104 = partition("DPUCZDX8G-zcu104") + partitioned_dpucadx8g = partition("DPUCADX8G") + + ref_mod = 
expected()
+
+    assert tvm.ir.structural_equal(partitioned_dpuczdx8g_zcu104, ref_mod, map_free_vars=True)
+    assert tvm.ir.structural_equal(partitioned_dpucadx8g, ref_mod, map_free_vars=True)
+
+
+if __name__ == "__main__":
+    if sys.platform == "win32":
+        print("Skip test on Windows for now")
+        sys.exit(0)
+
+    test_conv2d()
+    test_depthwise_conv()
+    test_bias_add()
+    test_relu()
+    test_add()
+    test_max_pool2d()
+    test_global_max_pool2d()
+    test_batchnorm()
+    test_global_avg_pool2d()
+    test_avg_pool2d()
+    test_upsampling()
+    test_conv2d_transpose()
+    test_annotate()
diff --git a/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py b/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py
new file mode 100644
index 000000000000..030dda372cfe
--- /dev/null
+++ b/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py
@@ -0,0 +1,82 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, W0611, C0413
+
+"""Vitis-AI runtime test for the CPU-only part
+
+This test verifies as much as possible whether a model can be correctly offloaded
+and executed for Vitis-AI acceleration. This entails:
+    - Annotating and partitioning the model for Vitis-AI acceleration
+    - Building a Vitis-AI PyXIR runtime module with on-the-fly quantization enabled
+    - Running the first iteration of the on-the-fly quantization flow. This will always
+      be run on CPU, as the first N inputs (N is a parameter) are used for collecting
+      calibration data for quantization.
+
+NOTE: This is not a full end-to-end test, as we would need the full Vitis-AI docker
+environment and access to an FPGA instance for that. This test verifies the Vitis-AI
+flow as much as possible without requiring access to a dedicated docker environment
+and/or hardware setup.
+NOTE: Quantization itself is not being tested (we would need to be inside the Vitis-AI
+docker environment for that), but the internal representation used for quantization is
+being generated and functionally tested (on CPU).
+""" + +import sys +import numpy as np + +import pytest + +pytest.importorskip("pyxir") +import pyxir.contrib.target.DPUCADX8G + +import tvm +import tvm.relay.testing +from tvm import relay + +from .infrastructure import skip_test, verify_result + + +def test_extern_vitis_ai_resnet18(): + """Test first part of Vitis-AI on-the-fly quantization runtime with ResNet 18 model""" + if skip_test(): + return + + dtype = "float32" + ishape = (1, 3, 224, 224) + mod, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=1) + ref_mod, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=1) + + ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + + ref_res = ref_ex.evaluate()(i_data, **params) + verify_result( + mod, + {"data": i_data}, + (1, 1000), + ref_res.asnumpy(), + tol=1e-5, + params=params, + dpu_target="DPUCADX8G", + tvm_ops=4, + ) + + +if __name__ == "__main__": + if sys.platform == "win32": + print("Skip test on Windows for now") + sys.exit(0) + test_extern_vitis_ai_resnet18() diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index 6fc64966c0ab..9a009b6a4a78 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -44,3 +44,4 @@ echo set\(USE_TENSORFLOW_PATH \"/tensorflow\"\) >> config.cmake echo set\(USE_FLATBUFFERS_PATH \"/flatbuffers\"\) >> config.cmake echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake echo set\(USE_ETHOSN_HW OFF\) >> config.cmake +echo set\(USE_VITIS_AI ON\) >> config.cmake From 4a0b368726c783f6376422f0df337da0bf44107f Mon Sep 17 00:00:00 2001 From: mbaret <55580676+mbaret@users.noreply.github.com> Date: Fri, 6 Nov 2020 19:26:53 +0000 Subject: [PATCH 137/258] [TIR] Make loop unrolling in LoopPartition optional (#6823) * [TIR] Make loop unrolling in LoopPartition optional For certain analysis/tensorization, it can be useful to keep the loop structure when partitioning loops. The current behaviour removes For loops of length 1. This change introduces the option to preserve these loops with the 'unroll' flag. 
---
 src/tir/transforms/loop_partition.cc            | 21 ++++++++++----
 .../test_tir_transform_loop_partition.py        | 28 +++++++++++++++++++
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc
index ab567dc0e417..a104dbb029eb 100644
--- a/src/tir/transforms/loop_partition.cc
+++ b/src/tir/transforms/loop_partition.cc
@@ -40,9 +40,13 @@ namespace tir {
 
 struct LoopPartitionConfigNode : public tvm::AttrsNode<LoopPartitionConfigNode> {
   bool partition_const_loop;
+  bool no_unroll_loop_with_extent_one;
 
   TVM_DECLARE_ATTRS(LoopPartitionConfigNode, "tir.transform.LoopPartitionConfig") {
     TVM_ATTR_FIELD(partition_const_loop).describe("Split constant loop").set_default(false);
+    TVM_ATTR_FIELD(no_unroll_loop_with_extent_one)
+        .describe("Don't unroll loops with extent 1")
+        .set_default(false);
   }
 };
 
@@ -334,8 +338,9 @@ class ThreadPartitionInserter : public StmtMutator {
 // likely conditions
 class LoopPartitioner : public StmtMutator {
  public:
-  explicit LoopPartitioner(bool partition_const_loop)
-      : selector(CandidateSelector(partition_const_loop)) {}
+  explicit LoopPartitioner(bool partition_const_loop, bool no_unroll_loop_with_extent_one)
+      : selector(CandidateSelector(partition_const_loop)),
+        no_unroll_loop_with_extent_one_(no_unroll_loop_with_extent_one) {}
 
   Stmt VisitAndMutate(Stmt stmt) {
     selector(stmt);
@@ -402,6 +407,7 @@ class LoopPartitioner : public StmtMutator {
   std::unordered_map<const VarNode*, IntSet> relax_map_;
   arith::Analyzer analyzer_;
   CandidateSelector selector;
+  bool no_unroll_loop_with_extent_one_;
 };
 
 // Returns an interval (in the first component) in which all the conditions
@@ -596,7 +602,8 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim
 inline Stmt LoopPartitioner::MakeFor(const Object* node, PrimExpr extent, Stmt body) {
   const ForNode* for_node = static_cast<const ForNode*>(node);
   ICHECK(for_node);
-  if (analyzer_.CanProve(extent == make_const(DataType::Int(32), 1))) {
+  if (analyzer_.CanProve(extent == make_const(DataType::Int(32), 1)) &&
+      !no_unroll_loop_with_extent_one_) {
     // If the loop extent is 1, do not create the loop anymore
     return Substitute(body, {{Var{for_node->loop_var}, make_const(DataType::Int(32), 0)}});
   } else {
@@ -617,8 +624,9 @@ class RemoveLikelyTags : public StmtExprMutator {
   }
 };
 
-Stmt LoopPartition(Stmt stmt, bool partition_const_loop) {
-  stmt = LoopPartitioner(partition_const_loop).VisitAndMutate(std::move(stmt));
+Stmt LoopPartition(Stmt stmt, bool partition_const_loop, bool no_unroll_loop_with_extent_one) {
+  stmt = LoopPartitioner(partition_const_loop, no_unroll_loop_with_extent_one)
+             .VisitAndMutate(std::move(stmt));
   stmt = RemoveLikelyTags()(std::move(stmt));
   return stmt;
 }
@@ -632,7 +640,8 @@ Pass LoopPartition() {
     if (!cfg.defined()) {
       cfg = AttrsWithDefaultValues<LoopPartitionConfig>();
     }
-    n->body = LoopPartition(std::move(n->body), cfg.value()->partition_const_loop);
+    n->body = LoopPartition(std::move(n->body), cfg.value()->partition_const_loop,
+                            cfg.value()->no_unroll_loop_with_extent_one);
     return f;
   };
   return CreatePrimFuncPass(pass_func, 0, "tir.LoopPartition", {});
diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py
index be2f307a06af..ecaff319441d 100644
--- a/tests/python/unittest/test_tir_transform_loop_partition.py
+++ b/tests/python/unittest/test_tir_transform_loop_partition.py
@@ -66,6 +66,34 @@ def test_const_loop():
     assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
 
 
+def test_no_unroll_loop():
+    n = 21
+    A = te.placeholder((n,), name="A")
+    B = te.placeholder((n,), name="B")
+
+    T = te.compute((n,), lambda i: A[i] + B[i])
+    s = te.create_schedule(T.op)
+    xo, xi = s[T].split(T.op.axis[0], factor=4)
+
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
+
+    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt))
+    with tvm.transform.PassContext(
+        config={
+            "tir.LoopPartition": {
+                "partition_const_loop": True,
+                "no_unroll_loop_with_extent_one": True,
+            }
+        }
+    ):
+        mod = tvm.tir.transform.LoopPartition()(mod)
+        mod = tvm.tir.transform.Simplify()(mod)
+    stmt = tvm.tir.transform.RemoveNoOp()(mod)["main"].body
+
+    assert sum(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.For))) == 4
+
+
 def test_multi_loop():
     ib = tvm.tir.ir_builder.create()
     m = te.size_var("m")

From 26e3e2aba572dc1e5e165475dbe0b1601cd40fdd Mon Sep 17 00:00:00 2001
From: Altan Haan
Date: Fri, 6 Nov 2020 11:27:46 -0800
Subject: [PATCH 138/258] [RELAY][OP] Support MXNet-style attributes for
 reshape_like (#6851)

* add MXNet-style reshape_like attrs support

* lint

* document, switch to int, add more tests, style

* add example usage in documentation

* fix doc formatting
---
 include/tvm/relay/attrs/transform.h  | 20 +++++++++
 python/tvm/relay/op/op_attrs.py      |  5 +++
 python/tvm/relay/op/transform.py     | 43 ++++++++++++++----
 src/relay/op/make_op.h               |  3 ++
 src/relay/op/tensor/transform.cc     | 66 +++++++++++++++++++++++++---
 src/relay/transforms/pattern_utils.h |  6 +--
 tests/python/relay/test_op_level3.py | 41 ++++++++++++++---
 7 files changed, 163 insertions(+), 21 deletions(-)

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 274294ccb388..262f41edad67 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -93,6 +93,26 @@ struct ReshapeAttrs : public tvm::AttrsNode<ReshapeAttrs> {
   }
 };  // struct ReshapeAttrs
 
+/*! \brief Attributes used in MXNet-style reshape_like operators */
+struct ReshapeLikeAttrs : public tvm::AttrsNode<ReshapeLikeAttrs> {
+  int lhs_begin;
+  Integer lhs_end;  // can be None
+  int rhs_begin;
+  Integer rhs_end;  // can be None
+  TVM_DECLARE_ATTRS(ReshapeLikeAttrs, "relay.attrs.ReshapeLikeAttrs") {
+    TVM_ATTR_FIELD(lhs_begin).set_default(0).describe(
+        "The axis of the input where reshaping should begin.");
+    TVM_ATTR_FIELD(lhs_end)
+        .set_default(NullValue<Integer>())
+        .describe("The axis of the input where reshaping should end, exclusive.");
+    TVM_ATTR_FIELD(rhs_begin).set_default(0).describe(
+        "The axis of the shape_like tensor to begin taking dimensions from.");
+    TVM_ATTR_FIELD(rhs_end)
+        .set_default(NullValue<Integer>())
+        .describe("The axis of the shape_like tensor to end taking dimensions from, exclusive.");
+  }
+};  // struct ReshapeLikeAttrs
+
 struct ScatterAttrs : public tvm::AttrsNode<ScatterAttrs> {
   Integer axis;
 
diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py
index 5dc2c2402c08..2c5f046bb7e8 100644
--- a/python/tvm/relay/op/op_attrs.py
+++ b/python/tvm/relay/op/op_attrs.py
@@ -194,6 +194,11 @@ class ReshapeAttrs(Attrs):
     """Attributes for transform.reshape"""
 
 
+@tvm._ffi.register_object("relay.attrs.ReshapeLikeAttrs")
+class ReshapeLikeAttrs(Attrs):
+    """Attributes for transform.reshape_like"""
+
+
 @tvm._ffi.register_object("relay.attrs.GatherAttrs")
 class GatherAttrs(Attrs):
     """Attributes for transform.gather"""
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 17f4c02380b3..b7df6001e59e 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -308,28 +308,55 @@ def scatter_add(data, indices, updates, axis):
     return _make.scatter_add(data, indices, updates, axis)
 
 
-def reshape_like(data, shape_like):
-    """Reshapes the input array by the size of another array.
-    For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes
-    the input array into an output array with the same shape as the second input array.
+def reshape_like(data, shape_like, lhs_begin=0, lhs_end=None, rhs_begin=0, rhs_end=None):
+    """Reshapes the input tensor by the size of another tensor.
+    For an input tensor with shape ``(d0, d1, ..., d(k-1))``, `reshape_like` operation reshapes
+    the input tensor into an output tensor with the same shape as the second input tensor,
+    in particular reshaping the dimensions of `data` in `[lhs_begin, lhs_end)` using the dimensions
+    from `shape_like` in `[rhs_begin, rhs_end)`.
 
     .. note::
-        Sizes for both array should be compatible.
+        Sizes for `data` and the output tensor should be compatible.
 
     Parameters
     ----------
     data : relay.Expr
         The input data to the operator.
 
-    shape_like : tuple of int
-        The new shape. Should be compatible with the original shape.
+    shape_like : relay.Expr
+        The tensor to reshape data like. Should be compatible with the original shape on the
+        reshaped dimensions.
+
+    lhs_begin : int, optional
+        The axis of data to begin reshaping. Default is 0.
+
+    lhs_end : int or None, optional
+        The axis of data where reshaping should stop, exclusive. Default is None which reshapes to
+        the end.
+
+    rhs_begin : int, optional
+        The axis of shape_like where the target shape begins. Default is 0.
+
+    rhs_end : int or None, optional
+        The axis of shape_like where the target shape ends, exclusive. Default is None which
+        extends to the end.
 
     Returns
     -------
     ret : relay.Expr
         The computed result.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        data.shape == (1, 2, 3, 4)
+        shape_like.shape == (6, 2, 2, 3)
+
+        ret = relay.reshape_like(data, shape_like, lhs_begin=1, rhs_end=3)
+        ret.shape == (1, 6, 2, 2)
     """
-    return _make.reshape_like(data, shape_like)
+    return _make.reshape_like(data, shape_like, lhs_begin, lhs_end, rhs_begin, rhs_end)
 
 
 def take(data, indices, axis=None, mode="clip"):
diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h
index 631ec4c0d2f5..0e1f5c560081 100644
--- a/src/relay/op/make_op.h
+++ b/src/relay/op/make_op.h
@@ -62,6 +62,9 @@ Expr MakeRepeat(Expr data, int repeats, int axis);
 
 Expr MakeReshape(Expr data, Array<Integer> newshape);
 
+Expr MakeReshapeLike(Expr lhs, Expr rhs, int lhs_begin, Integer lhs_end, int rhs_begin,
+                     Integer rhs_end);
+
 Expr MakeSplit(Expr data, ObjectRef indices_or_sections, int axis);
 
 Expr MakeSqueeze(Expr data, Array<Integer> axis);
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 02fd8930d332..3ca816a6caae 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -453,6 +453,7 @@ RELAY_REGISTER_OP("transpose")
 /* relay.reshape */
 TVM_REGISTER_NODE_TYPE(ReshapeAttrs);
+TVM_REGISTER_NODE_TYPE(ReshapeLikeAttrs);
 
 Array<IndexExpr> infer_newshape(const Array<IndexExpr>& data_shape, const Attrs& attrs) {
   const auto* param = attrs.as<ReshapeAttrs>();
@@ -641,11 +642,49 @@ bool ReshapeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   return true;
 }
 
+Array<IndexExpr> infer_reshape_like(const Array<IndexExpr>& lhs_shape,
+                                    const Array<IndexExpr>& rhs_shape, const Attrs& attrs) {
+  const auto* like_attrs = attrs.as<ReshapeLikeAttrs>();
+  CHECK(!like_attrs->lhs_end.defined() || like_attrs->lhs_end.as<IntImmNode>())
+      << "lhs_end must be a concrete integer or None";
+  CHECK(!like_attrs->rhs_end.defined() || like_attrs->rhs_end.as<IntImmNode>())
+      << "rhs_end must be a concrete integer or None";
+
+  int64_t lhs_shape_size = static_cast<int64_t>(lhs_shape.size());
+  int64_t rhs_shape_size = static_cast<int64_t>(rhs_shape.size());
+  int64_t lhs_begin = static_cast<int64_t>(like_attrs->lhs_begin);
+  int64_t lhs_end =
+      like_attrs->lhs_end.defined() ? like_attrs->lhs_end.as<IntImmNode>()->value : lhs_shape_size;
+  int64_t rhs_begin = static_cast<int64_t>(like_attrs->rhs_begin);
+  int64_t rhs_end =
+      like_attrs->rhs_end.defined() ? like_attrs->rhs_end.as<IntImmNode>()->value : rhs_shape_size;
+
+  // handle negative axes
+  lhs_begin = lhs_begin < 0 ? lhs_begin + lhs_shape_size : lhs_begin;
+  lhs_end = lhs_end < 0 ? lhs_end + lhs_shape_size : lhs_end;
+  rhs_begin = rhs_begin < 0 ? rhs_begin + rhs_shape_size : rhs_begin;
+  rhs_end = rhs_end < 0 ? rhs_end + rhs_shape_size : rhs_end;
+
+  Array<IndexExpr> shape_like;
+  for (auto i = 0; i < lhs_begin; i++) {
+    shape_like.push_back(lhs_shape[i]);
+  }
+  for (auto i = rhs_begin; i < rhs_end; i++) {
+    shape_like.push_back(rhs_shape[i]);
+  }
+  for (auto i = lhs_end; i < lhs_shape_size; i++) {
+    shape_like.push_back(lhs_shape[i]);
+  }
+  return shape_like;
+}
+
 Array<te::Tensor> ReshapeCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                  const Type& out_type) {
   // Quick path for reshape_like
   if (!attrs.as<ReshapeAttrs>()) {
-    return {topi::reshape(inputs[0], inputs[1]->shape)};
+    ICHECK(attrs.as<ReshapeLikeAttrs>() != nullptr);
+    auto shape_like = infer_reshape_like(inputs[0]->shape, inputs[1]->shape, attrs);
+    return {topi::reshape(inputs[0], shape_like)};
   }
 
   const auto* out_ttype = out_type.as<TensorTypeNode>();
@@ -746,6 +785,7 @@ Example::
 */
 bool ReshapeLikeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                     const TypeReporter& reporter) {
+  ICHECK(attrs.as<ReshapeLikeAttrs>() != nullptr);
   ICHECK_EQ(types.size(), 3);
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) {
@@ -755,6 +795,7 @@ bool ReshapeLikeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs
   if (reshape_like == nullptr) {
     return false;
   }
+  auto shape_like = infer_reshape_like(data->shape, reshape_like->shape, attrs);
   // Only check when input data has static shape.
   bool is_static_shape = true;
   for (size_t i = 0; i < data->shape.size(); ++i) {
@@ -763,17 +804,24 @@ bool ReshapeLikeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs
       break;
     }
   }
+  auto output_type = TensorType(shape_like, data->dtype);
   if (is_static_shape) {
-    ICHECK(reporter->AssertEQ(data->Size(), reshape_like->Size()))
+    ICHECK(reporter->AssertEQ(data->Size(), output_type->Size()))
        << "Reshape inputs size should be compatible.";
   }
-  reporter->Assign(types[2], TensorType(reshape_like->shape, data->dtype));
+  reporter->Assign(types[2], output_type);
   return true;
 }
 
-Expr MakeReshapeLike(Expr data, Expr shape_like) {
+Expr MakeReshapeLike(Expr lhs, Expr rhs, int lhs_begin, Integer lhs_end, int rhs_begin,
+                     Integer rhs_end) {
+  auto attrs = make_object<ReshapeLikeAttrs>();
+  attrs->lhs_begin = std::move(lhs_begin);
+  attrs->lhs_end = std::move(lhs_end);
+  attrs->rhs_begin = std::move(rhs_begin);
+  attrs->rhs_end = std::move(rhs_end);
   static const Op& op = Op::Get("reshape_like");
-  return Call(op, {data, shape_like}, Attrs(), {});
+  return Call(op, {lhs, rhs}, Attrs(attrs), {});
 }
 
 TVM_REGISTER_GLOBAL("relay.op._make.reshape_like").set_body_typed(MakeReshapeLike);
@@ -784,7 +832,15 @@ For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation re
 the input array into an output array with the same shape as the second input array.
 .. note::
     Sizes for both arrays should be compatible.
+Example::
+
+  data.shape == (1, 2, 3, 4)
+  shape_like.shape == (6, 2, 2, 3)
+
+  ret = reshape_like(data, shape_like, lhs_begin=1, rhs_end=3)
+  ret.shape == (1, 6, 2, 2)
 )code" TVM_ADD_FILELINE)
+    .set_attrs_type<ReshapeLikeAttrs>()
     .set_num_inputs(2)
     .add_argument("data", "Tensor", "The input tensor.")
     .add_argument("shape_like", "Tensor", "Shape tensor.")
diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h
index 555391a27e4b..8ef86e088193 100644
--- a/src/relay/transforms/pattern_utils.h
+++ b/src/relay/transforms/pattern_utils.h
@@ -594,9 +594,9 @@ inline Expr LeftShift(Expr x, Expr nbit) {
   return Call(op, {x, nbit}, Attrs(), {});
 }
 
-inline Expr ReshapeLike(Expr lhs, Expr rhs) {
-  static const Op& op = Op::Get("reshape_like");
-  return Call(op, {lhs, rhs}, Attrs(), {});
+inline Expr ReshapeLike(Expr lhs, Expr rhs, int lhs_begin, Integer lhs_end, int rhs_begin,
+                        Integer rhs_end) {
+  return MakeReshapeLike(lhs, rhs, lhs_begin, lhs_end, rhs_begin, rhs_end);
 }
 
 inline Expr Copy(Expr data) {
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index f091856f6b7e..90e6e870f370 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -316,17 +316,45 @@ def test_reshape_like_infer_type():
     zz = run_infer_type(z)
     assert zz.checked_type == relay.TensorType((1, 8, 8), "float32")
 
+    # partial reshaping
+    x = relay.var("x", relay.TensorType((1, 2, 3, 4), "float32"))
+    y = relay.var("y", relay.TensorType((1, 6, 5), "float32"))
+    z = relay.reshape_like(x, y, lhs_begin=1, lhs_end=3, rhs_begin=1, rhs_end=2)
+    zz = run_infer_type(z)
+    assert zz.checked_type == relay.TensorType((1, 6, 4), "float32")
+
+    x = relay.var("x", relay.TensorType((1, 2, 3, 4), "float32"))
+    y = relay.var("y", relay.TensorType((2, 3, 4, 1, 6), "float32"))
+    z = relay.reshape_like(x, y, rhs_end=3)
+    zz = run_infer_type(z)
+    assert zz.checked_type == relay.TensorType((2, 3, 4), "float32")
+    z = relay.reshape_like(x, y, rhs_begin=2)
+    zz = run_infer_type(z)
+    assert zz.checked_type == relay.TensorType((4, 1, 6), "float32")
+
+    # symbolic partial reshaping
+    n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.var("y", relay.TensorType((5, 6), "float32"))
+    z = relay.var("z", relay.TensorType((4,), "float32"))
+    w = relay.reshape_like(x, y, lhs_end=3)
+    w = relay.reshape_like(w, z, lhs_begin=2)
+    w = run_infer_type(w)
+    assert w.checked_type == relay.TensorType((5, 6, 4), "float32")
+
 
 @tvm.testing.uses_gpu
 def test_reshape_like():
-    def verify_reshape_like(shape, oshape):
+    def verify_reshape_like(shape, oshape, shape_like=None, reshape_like_kwargs={}):
+        if shape_like is None:
+            shape_like = oshape
         x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        y_data = np.random.uniform(low=-1, high=1, size=oshape).astype("float32")
-        ref_res = np.reshape(x_data, y_data.shape)
+        y_data = np.random.uniform(low=-1, high=1, size=shape_like).astype("float32")
+        ref_res = np.reshape(x_data, oshape)
 
         x = relay.var("x", relay.TensorType(shape, "float32"))
-        y = relay.var("x", relay.TensorType(oshape, "float32"))
-        z = relay.reshape_like(x, y)
+        y = relay.var("x", relay.TensorType(shape_like, "float32"))
+        z = relay.reshape_like(x, y, **reshape_like_kwargs)
         zz = run_infer_type(z)
         assert zz.checked_type == relay.ty.TensorType(ref_res.shape, "float32")
 
@@ -340,6 +368,9 @@ def verify_reshape_like(shape, oshape):
     verify_reshape_like((2, 3, 4), (1, 8, 3))
     verify_reshape_like((4, 7), (2, 7, 2))
+    verify_reshape_like(
+        (1, 2, 3, 4), (1, 6, 4), (1, 6, 5), dict(lhs_begin=1, lhs_end=3, rhs_begin=1, rhs_end=2)
+    )
 
 
 def test_take_infer_type():

From 770b60ffe99b39f5745d9780935a14f80124a17f Mon Sep 17 00:00:00 2001
From: Altan Haan
Date: Fri, 6 Nov 2020 11:40:31 -0800
Subject: [PATCH 139/258] fix first-order AD on tuple arguments (#6827)

---
 src/relay/transforms/gradient.cc         | 20 +++++++++++++-
 tests/python/relay/test_pass_gradient.py | 33 ++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/gradient.cc
index 9441f8af5d27..cd3a99655341 100644
--- a/src/relay/transforms/gradient.cc
+++ b/src/relay/transforms/gradient.cc
@@ -181,6 +181,22 @@ struct FirstOrderReverseAD : ExprFunctor<ADValue(const Expr&)> {
     return ret;
   }
 
+  Expr UpdateGrad(const Type& t, const Expr& arg, const Expr& grad, LetList* ll) {
+    if (t.as<TensorTypeNode>()) {
+      return ll->Push(Add(arg, grad));
+    } else if (auto* tt = t.as<TupleTypeNode>()) {
+      Array<Expr> updates;
+      for (size_t i = 0; i < tt->fields.size(); ++i) {
+        updates.push_back(this->UpdateGrad(tt->fields[i], ll->Push(GetField(arg, i)),
+                                           ll->Push(GetField(grad, i)), ll));
+      }
+      return ll->Push(Tuple(updates));
+    } else {
+      LOG(FATAL) << "unsupported arg type of operator: " << t;
+      throw;
+    }
+  }
+
   ADValue VisitExpr_(const OpNode* op) final {
     Op op_ref = GetRef<Op>(op);
     ICHECK(rev_map.count(op_ref)) << op->name << " does not have reverse mode defined";
@@ -198,8 +214,10 @@ struct FirstOrderReverseAD : ExprFunctor<ADValue(const Expr&)> {
       tvm::Array<Expr> rev = rev_map[op_ref](orig, ret->reverse);
       ICHECK(args.size() == rev.size());
       for (size_t i = 0; i < args.size(); ++i) {
+        auto ad_arg = args[i]->get<ADTensor>();
+        auto ad_arg_type = ad_arg.forward->checked_type();
         args[i]->get<ADTensor>().reverse =
-            ll->Push(Add(args[i]->get<ADTensor>().reverse, rev[i]));
+            this->UpdateGrad(ad_arg_type, ad_arg.reverse, rev[i], ll);
       }
     });
     return ret;
diff --git a/tests/python/relay/test_pass_gradient.py b/tests/python/relay/test_pass_gradient.py
index 93bad3a19c53..0604ed51272c 100644
--- a/tests/python/relay/test_pass_gradient.py
+++ b/tests/python/relay/test_pass_gradient.py
@@ -255,6 +255,29 @@ def _test_tuple(mode):
     tvm.testing.assert_allclose(grad_z.asnumpy(), -1 * np.ones_like(grad_z.asnumpy()))
 
 
+def _test_tuple_argument(mode):
+    shape = (2, 3)
+    dtype = "float32"
+    tensor_type = relay.TensorType(shape, dtype)
+    fields = 3
+    tuple_type = relay.TupleType([tensor_type] * fields)
+    tup = relay.var("tup", type_annotation=tuple_type)
+    body = relay.TupleGetItem(tup, 0)
+    for i in range(1, fields):
+        body = relay.add(body, relay.TupleGetItem(tup, i))
+    func = relay.Function([tup], body)
+    func = run_infer_type(func)
+    back_func = run_infer_type(gradient(func, mode=mode))
+    xs = [rand(dtype, *shape) for _ in range(fields)]
+    xs_np = np.array([x.asnumpy() for x in xs])
+    expected_forward = np.sum(xs_np, axis=0)
+    ex = create_executor()
+    forward, grad = ex.evaluate(back_func)(tuple(xs))
+    tvm.testing.assert_allclose(forward.asnumpy(), expected_forward)
+    for field in grad[0]:
+        tvm.testing.assert_allclose(field.asnumpy(), np.ones_like(field.asnumpy()))
+
+
 def test_tuple():
     _test_tuple("higher_order")
 
@@ -263,6 +286,16 @@ def test_tuple_first_order():
     _test_tuple("first_order")
 
 
+@pytest.mark.xfail(raises=tvm.error.TVMError)
+def test_tuple_argument():
+    # fails until we add support for top-level tuple arguments in higher-order AD
+    _test_tuple_argument("higher_order")
+
+
+def test_tuple_argument_first_order():
+    _test_tuple_argument("first_order")
+
+
 def test_pow():
     mod = tvm.IRModule()
     p = Prelude(mod)

From c9563c87ca374485c34ee7f1b9be488859a66be2 Mon Sep 17 00:00:00 2001
From: lixiaoquan
Date: Sat, 7 Nov 2020 03:42:13 +0800
Subject: [PATCH 140/258] [Relay] Mix mode type inference (#6704)

---
 include/tvm/relay/expr_functor.h   | 68 ++++++++++++++++++++++++++++++
 src/relay/ir/expr_functor.cc       | 68 ------------------------------
 src/relay/op/algorithm/topk.cc     |  2 +-
 src/relay/transforms/type_infer.cc | 51 ++++++++++++++++++----
 4 files changed, 112 insertions(+), 77 deletions(-)

diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index df0940fa7482..8589f8cc4f16 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -32,6 +32,7 @@
 #include <tvm/relay/function.h>
 #include <tvm/relay/op.h>
 
+#include <stack>
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -408,6 +409,73 @@ Expr PostOrderRewrite(const Expr& expr, ExprRewriter* rewriter);
  */
 void PostOrderVisit(const Expr& node, std::function<void(const Expr&)> fvisit);
 
+/*!
+ * \brief A function to iteratively traverse dataflow regions of a graph
+ *
+ * ExpandDataflow manually manages a stack and performs DFS to determine the processing
+ * order of nodes in an input graph.
+ *
+ * If it finds a dataflow node (Call, Tuple, TupleGetItem), it checks if the arguments to that node
+ * need to be processed via fcheck_visited. If so, the function pushes those arguments to the stack
+ * and continues iteratively to process the top of the stack. When it finds a node that doesn't
+ * match the dataflow types, or a node whose inputs have all been processed, it visits the current
+ * leaf via fvisit_leaf.
+ *
+ * This function should be used internally to other classes to implement mixed-mode traversals. The
+ * expectation is that fvisit_leaf will perform recursive analysis within mixed-mode traversal if it
+ * hits a non-dataflow node.
+ *
+ * fcheck_visited and fvisit_leaf are templated to encourage compiler inlining.
+ */
+template <typename FCheckVisited, typename FVisitLeaf>
+void ExpandDataflow(Expr expr, FCheckVisited fcheck_visited, FVisitLeaf fvisit_leaf) {
+  std::stack<std::pair<Expr, bool>> stack;
+  auto fpush_to_stack = [&fcheck_visited, &stack](const Expr& expr) {
+    // The second state of the stack indicates whether the child has been
+    // expanded in the pre-order.
+    // NOTE: function will be inlined.
+    if (!fcheck_visited(expr)) {
+      stack.push({expr, false});
+    }
+  };
+  fpush_to_stack(expr);
+  while (stack.size() > 0) {
+    auto node = stack.top().first;
+    if (fcheck_visited(node)) {
+      // if this node was visited through another path
+      // after being added to the stack ignore it.
+      stack.pop();
+    } else if (stack.top().second) {
+      // all the children have already been expanded.
+      // we can just run post order visit on it.
+      fvisit_leaf(node);
+      stack.pop();
+    } else if (const CallNode* op = node.as<CallNode>()) {
+      // mark expanded = true
+      stack.top().second = true;
+      // push the children to the stack in reverse order
+      // to match recursive processing order
+      for (auto it = op->args.rbegin(); it != op->args.rend(); ++it) {
+        fpush_to_stack(*it);
+      }
+      fpush_to_stack(op->op);
+    } else if (const TupleNode* op = node.as<TupleNode>()) {
+      stack.top().second = true;
+      // push the children to the stack in reverse order
+      // to match recursive processing order
+      for (auto it = op->fields.rbegin(); it != op->fields.rend(); ++it) {
+        fpush_to_stack(*it);
+      }
+    } else if (const TupleGetItemNode* op = node.as<TupleGetItemNode>()) {
+      stack.top().second = true;
+      fpush_to_stack(op->tuple);
+    } else {
+      // No need to expand the children; directly run visit.
+      fvisit_leaf(node);
+      stack.pop();
+    }
+  }
+}
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_EXPR_FUNCTOR_H_
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index a22b69c4ed1b..74095a753950 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -33,74 +33,6 @@
 namespace tvm {
 namespace relay {
 
-/*!
- * \brief A function to iteratively traverse dataflow regions of a graph
- *
- * ExpandDataflow manually manages a stack and performs DFS to determine the processing
- * order of nodes in an input graph.
- *
- * If it finds a dataflow node (Call, Tuple, TupleGetItem), it checks if the arguments to that node
- * need to be processed via fcheck_visited. If so, the function pushes those arguments to the stack
- * and continues iteratively to process the top of the stack. When it finds a node that doesn't
- * match the dataflow types, or a node who's inputs have all been processed, it visits the current
- * leaf via fvisit_leaf.
- *
- * This function should be used internally to other classes to implement mixed-mode traversals. The
- * expectation is that fvisit_leaf will perform recursive analysis within mixed-mode traversal if it
- * hits a non-dataflow node.
- *
- * fcheck_visited and fvisit_leaf are templated to encourage compiler inlining.
- */
-template <typename FCheckVisited, typename FVisitLeaf>
-void ExpandDataflow(Expr expr, FCheckVisited fcheck_visited, FVisitLeaf fvisit_leaf) {
-  std::stack<std::pair<Expr, bool>> stack;
-  auto fpush_to_stack = [&fcheck_visited, &stack](const Expr& expr) {
-    // The second state of the stack indicate whether the child has been
-    // expanded in the pre-order.
-    // NOTE: function will be inlined.
-    if (!fcheck_visited(expr)) {
-      stack.push({expr, false});
-    }
-  };
-  fpush_to_stack(expr);
-  while (stack.size() > 0) {
-    auto node = stack.top().first;
-    if (fcheck_visited(node)) {
-      // if this node was visited through another path
-      // after being added to the stack ignore it.
-      stack.pop();
-    } else if (stack.top().second) {
-      // all the children have already been expanded.
-      // we can just run post order visit on it.
-      fvisit_leaf(node);
-      stack.pop();
-    } else if (const CallNode* op = node.as<CallNode>()) {
-      // mark expanded = true
-      stack.top().second = true;
-      // push the children to the stack in reverse order
-      // to match recursive processing order
-      for (auto it = op->args.rbegin(); it != op->args.rend(); ++it) {
-        fpush_to_stack(*it);
-      }
-      fpush_to_stack(op->op);
-    } else if (const TupleNode* op = node.as<TupleNode>()) {
-      stack.top().second = true;
-      // push the children to the stack in reverse order
-      // to match recursive processing order
-      for (auto it = op->fields.rbegin(); it != op->fields.rend(); ++it) {
-        fpush_to_stack(*it);
-      }
-    } else if (const TupleGetItemNode* op = node.as<TupleGetItemNode>()) {
-      stack.top().second = true;
-      fpush_to_stack(op->tuple);
-    } else {
-      // No need to expand the children directly run visit.
-      fvisit_leaf(node);
-      stack.pop();
-    }
-  }
-}
-
 MixedModeVisitor::MixedModeVisitor(int visit_limit) {
   ICHECK(visit_limit > 0) << "Dataflow visit limit must be greater than 0";
   ICHECK(visit_limit < 10) << "Dataflow visit limit must be less than 10";
diff --git a/src/relay/op/algorithm/topk.cc b/src/relay/op/algorithm/topk.cc
index b0e4b5dc6b4e..c1d3e5472743 100644
--- a/src/relay/op/algorithm/topk.cc
+++ b/src/relay/op/algorithm/topk.cc
@@ -36,7 +36,7 @@ bool TopKRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   const TopKAttrs* param = attrs.as<TopKAttrs>();
   ICHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
-  ICHECK(data);
+  if (data == nullptr) return false;
   int ndim = data->shape.size();
   int axis = param->axis;
   if (axis < 0) {
diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc
index cb3ba0030a5b..327b5d1e260a 100644
--- a/src/relay/transforms/type_infer.cc
+++ b/src/relay/transforms/type_infer.cc
@@ -129,6 +129,37 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)>,
   TypeRelationFn tuple_getitem_rel_;
   TypeRelationFn make_tuple_rel_;
 
+  /*! \brief Internal map used for memoization. */
+  std::unordered_map<Expr, Type, ObjectPtrHash, ObjectPtrEqual> memo_;
+
+  void VisitLeaf(const Expr& expr) {
+    if (!memo_.count(expr)) {
+      Type ret = this->DispatchVisitExpr(expr);
+      memo_[expr] = ret;
+    }
+  }
+
+  bool CheckVisited(const Expr& expr) {
+    if (memo_.count(expr)) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  Type DispatchVisitExpr(const Expr& expr) { return ExprFunctor::VisitExpr(expr); }
+
+  Type VisitExpr(const Expr& expr) final {
+    auto fcheck_visited = [this](const Expr& expr) { return this->CheckVisited(expr); };
+    auto fvisit_leaf = [this](const Expr& expr) { return this->VisitLeaf(expr); };
+    if (memo_.count(expr)) {
+      return memo_[expr];
+    } else {
+      ExpandDataflow(expr, fcheck_visited, fvisit_leaf);
+      return memo_[expr];
+    }
+  }
+
   // Perform unification on two types and report the error at the expression
   // or the span of the expression.
   Type Unify(const Type& t1, const Type& t2, const Span& span) {
@@ -546,12 +577,14 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)>,
   }
 };
 
-class TypeInferencer::Resolver : public ExprMutator, PatternMutator {
+class TypeInferencer::Resolver : public MixedModeMutator, PatternMutator {
  public:
   Resolver(const std::unordered_map<Expr, ResolvedTypeInfo, ObjectPtrHash, ObjectPtrEqual>& tmap,
            TypeSolver* solver)
       : tmap_(tmap), solver_(solver) {}
 
+  using MixedModeMutator::VisitExpr_;
+
   Expr VisitExpr_(const VarNode* op) final { return VisitVar(GetRef<Var>(op)); }
 
   Expr VisitExpr_(const ConstantNode* op) final { return AttachCheckedType(op); }
@@ -560,13 +593,15 @@ class TypeInferencer::Resolver : public ExprMutator, PatternMutator {
 
   Expr VisitExpr_(const OpNode* op) final { return ExprMutator::VisitExpr_(op); }
 
-  Expr VisitExpr_(const TupleNode* op) final { return AttachCheckedType(op); }
+  Expr Rewrite_(const TupleNode* op, const Expr& post) final { return AttachCheckedType(op, post); }
 
-  Expr VisitExpr_(const TupleGetItemNode* op) final { return AttachCheckedType(op); }
+  Expr Rewrite_(const TupleGetItemNode* op, const Expr& post) final {
+    return AttachCheckedType(op, post);
+  }
 
   Expr VisitExpr_(const FunctionNode* op) final { return AttachCheckedType(op); }
 
-  Expr VisitExpr_(const CallNode* op) final { return AttachCheckedType(op); }
+  Expr Rewrite_(const CallNode* op, const Expr& post) final { return AttachCheckedType(op, post); }
 
   Expr VisitExpr_(const LetNode* op) final { return AttachCheckedType(op); }
 
@@ -593,7 +628,7 @@ class TypeInferencer::Resolver : public ExprMutator, PatternMutator {
 
   // attach checked type to the mutated node.
   template <typename T>
-  Expr AttachCheckedType(const T* op) {
+  Expr AttachCheckedType(const T* op, const Expr& post = Expr()) {
     auto it = tmap_.find(GetRef<Expr>(op));
     ICHECK(it != tmap_.end());
     Type checked_type = solver_->Resolve(it->second.checked_type);
@@ -606,7 +641,7 @@ class TypeInferencer::Resolver : public ExprMutator, PatternMutator {
           << " check other reported errors for hints of what may of happened.");
     }
 
-    Expr new_e = ExprMutator::VisitExpr_(op);
+    Expr new_e = post.defined() ? post : ExprMutator::VisitExpr_(op);
     // new_call and new_var's code is only going to be valid for VarNode/CallNode.
     // Compiler optimization will likely fold these away for other nodes.
     CallNode* new_call = (std::is_base_of<CallNode, T>::value
@@ -702,8 +737,8 @@ Expr TypeInferencer::Infer(GlobalVar var, Function function) {
   return resolved_expr;
 }
 
-struct AllCheckTypePopulated : ExprVisitor {
-  void VisitExpr(const Expr& e) {
+struct AllCheckTypePopulated : MixedModeVisitor {
+  void DispatchExprVisit(const Expr& e) {
     if (e.as<OpNode>()) {
       return;
     }

From fb0452b14ffb1e50e82c84d85c73609afac33709 Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Fri, 6 Nov 2020 14:38:31 -0800
Subject: [PATCH 141/258] [FIX,RPC] Skip RPC tests when using multiprocessing's
 spawn method (#6858)

The rpc tests are broken when running under pytest with multiprocessing
using spawn. I suspect this is because pytest tests each function in a
separate process and does not import the full module.
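The fix amounts to a module-level skip marker. A minimal sketch of the
guard (the exact marker and reason string live in the diff below):

    import multiprocessing
    import pytest

    # Skip every test in this module unless the fork start method is in
    # use, since functions registered at import time in the parent are
    # not re-registered in spawned server processes.
    pytestmark = pytest.mark.skipif(
        multiprocessing.get_start_method() != "fork",
        reason="spawn start method loses functions registered via tvm.register_func",
    )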
--- tests/python/unittest/test_runtime_rpc.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index efcc25d84edc..e975a1699341 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -29,6 +29,22 @@ from tvm.contrib import utils, cc from tvm.rpc.tracker import Tracker +# tkonolige: The issue as I understand it is this: multiprocessing's spawn +# method launches a new process and then imports the relevant modules. This +# means that all registered functions must exist at the top level scope. In +# this file they are, so all is well when we run this file directly. +# However, when run under pytest, the functions aren't registered on the +# server. I believe this is because pytest is also using multiprocessing to +# run individual functions. Somewhere along the way, the imports are being +# lost, so the server ends up not registering the functions. +pytestmark = pytest.mark.skipif( + multiprocessing.get_start_method() != "fork", + reason=( + "pytest + multiprocessing spawn method causes tvm.register_func to " + "not work on the rpc.Server." + ), +) + @tvm.testing.requires_rpc def test_bigendian_rpc(): From 9a801b424d187c949aef944bc5ebc0a4e83bfafd Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 6 Nov 2020 22:42:16 +0000 Subject: [PATCH 142/258] Add smmla/ummla support in quantized Conv2d (#6802) * Add smmla/ummla support in quantized Conv2d This introduces support for `smmla`/`ummla` instructions in TVM: - Added `is_mmla_available` function in `arm_utils.py` - Added the tiling node + tensorization schedule in `conv2d_gemm.py` - Added the intrinsic support in `tensor_intrin.py` - Added the test-case in `test_topi_conv2d_int8.py` Change-Id: Iff48c77f16fe1e64ecb733da965a879651ce635f * Address review comments and test failures * Fix linting * Rebasing --- python/tvm/topi/arm_cpu/arm_utils.py | 27 ++- python/tvm/topi/arm_cpu/conv2d_gemm.py | 179 ++++++++++++------ python/tvm/topi/arm_cpu/tensor_intrin.py | 106 ++++++++++- .../topi/python/test_topi_conv2d_int8.py | 6 + 4 files changed, 252 insertions(+), 66 deletions(-) diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index 7e0f566b96f4..15d84c20ed23 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -43,12 +43,21 @@ def get_arch_version(target_mattr): def is_dotprod_available(): - """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ + """ Checks whether the hardware has support for udot/sdot instructions. """ target = tvm.target.Target.current(allow_none=False) arch_version = get_arch_version(target.mattr) return arch_version >= 8.4 or ((arch_version in (8.2, 8.3)) and "+dotprod" in target.mattr) +def is_mmla_available(): + """ Checks whether the hardware has support for ummla/smmla instructions. """ + target = tvm.target.Target.current(allow_none=False) + arch_version = get_arch_version(target.mattr) + return arch_version >= 8.6 or ( + (arch_version in (8.2, 8.3, 8.4, 8.5)) and "+i8mm" in target.mattr + ) + + def is_aarch64_arm(): """ Checks whether we are compiling for an AArch64 target. """ target = tvm.target.Target.current(allow_none=False) @@ -63,8 +72,10 @@ def get_tiling_B_interleaved_t(interleave_A): tile computation. 
Please refer to: - - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product - - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + - https://discuss.tvm.apache.org/t/rfc-improve-quantized-convolution-performance-for-armv8-architectures # pylint: disable=line-too-long + - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + - https://discuss.tvm.apache.org/t/rfc-improve-quantized-convolution-through-mmla-instruction + - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h In order to have more information Parameters @@ -77,7 +88,13 @@ def get_tiling_B_interleaved_t(interleave_A): tile_rows_B: the output tile rows of B' tile_cols_B: the output tile columns of B' """ - if is_dotprod_available(): + if is_mmla_available(): + # If smmla/ummla is available, A must be interleaved. + # Each load from B' will contain 8 elements + # and we are loading 12 rows of B' (i.e., 12 columns of B) + tile_rows_B = 12 + tile_cols_B = 8 + elif is_dotprod_available(): # The number of tile rows of B' vary depending on the # strategy: # * If we are interleaving A, then we select 12 columns from B'(i.e., @@ -92,7 +109,7 @@ def get_tiling_B_interleaved_t(interleave_A): # rows of the original matrix B) need to be 4. tile_cols_B = 4 else: - # If dot product is not available, A must be interleaved. In this case + # If no acceleration is available, A must be interleaved. In this case # we load 4 rows of B' (i.e., 4 columns of B). Each of them will contain 16 elements tile_rows_B = 4 tile_cols_B = 16 diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 81326f169260..6a5cb2ae890e 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -28,8 +28,9 @@ gemm_quantized_impl, gemm_acc_4x4_int8_int8_int32, gemm_acc_nx16_int8_int8_int32, + gemm_acc_2x2_int8_int8_int32, ) -from .arm_utils import is_aarch64_arm, is_dotprod_available +from .arm_utils import is_aarch64_arm, is_dotprod_available, is_mmla_available def configure_knobs(cfg, M, K): @@ -130,11 +131,18 @@ def compute_conv2d_gemm_without_weight_transform( # the tile computation. # # Please refer to: - # - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product - # - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + # - https://discuss.tvm.apache.org/t/rfc-improve-quantized-convolution-performance-for-armv8-architectures # pylint: disable=line-too-long + # - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + # - https://discuss.tvm.apache.org/t/rfc-improve-quantized-convolution-through-mmla-instruction + # - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h # In order to have more information # - if is_dotprod_available() and interleave_A: + if is_mmla_available(): + # If smmla/ummla is enabled, we are loading 8 rows from A. 
Each row + # will contain 8 elements + tile_rows_A = 8 + tile_cols_A = 8 + elif is_dotprod_available() and interleave_A: # If dot product has been enabled, and we are interleaving A # tile size should be 8x4 tile_rows_A = 8 @@ -177,24 +185,71 @@ def compute_conv2d_gemm_without_weight_transform( lambda b, x, y, z, w: A[b, z + tile_rows_A * x, w + tile_cols_A * y], name="A_interleaved", ) - # Execute GEMM - C_interleaved = te.compute( - (batches, M_padded // tile_rows_A, N_transformed, tile_rows_A, tile_rows_B), - lambda b, x, y, w, z: te.sum( - A_interleaved[b, x, k // tile_cols_A, w, idxm(k, tile_cols_A)].astype("int32") - * B_interleaved_t[y, k // tile_cols_B, z, idxm(k, tile_cols_B)].astype("int32"), - axis=k, - ), - name="C_interleaved", - ) - # Unpack the result - C = te.compute( - (batches, M, N), - lambda b, x, y: C_interleaved[ - b, x // tile_rows_A, y // tile_rows_B, idxm(x, tile_rows_A), idxm(y, tile_rows_B) - ].astype(out_dtype), - name="C", - ) + if is_mmla_available(): + # Execute GEMM. In the case of mmla, we need to enforce the tiling + # from the compute. This is because mmla is doing a tiled computation + # as well. So we have a big 8x12 tile, with small 2x2 sub-tiles + # generated by mmla. In theory we could make the tile 2x2 and + # fuse and split during scheduling, but this would not work + # because of possible padding + C_interleaved = te.compute( + ( + batches, + M_padded // tile_rows_A, + N_transformed, + tile_rows_A // 2, + tile_rows_B // 2, + 2, + 2, + ), + lambda b, x, y, w, z, s, t: te.sum( + A_interleaved[b, x, k // tile_cols_A, 2 * w + s, idxm(k, tile_cols_A)].astype( + "int32" + ) + * B_interleaved_t[y, k // tile_cols_B, 2 * z + t, idxm(k, tile_cols_B)].astype( + "int32" + ), + axis=k, + ), + name="C_interleaved", + ) + # Unpack the result + C = te.compute( + (batches, M, N), + lambda b, x, y: C_interleaved[ + b, + x // tile_rows_A, + y // tile_rows_B, + idxm(x, tile_rows_A) // 2, + idxm(y, tile_rows_B) // 2, + idxm(idxm(x, tile_rows_A), 2), + idxm(idxm(y, tile_rows_B), 2), + ].astype(out_dtype), + name="C", + ) + else: + # Execute GEMM + C_interleaved = te.compute( + (batches, M_padded // tile_rows_A, N_transformed, tile_rows_A, tile_rows_B), + lambda b, x, y, w, z: te.sum( + A_interleaved[b, x, k // tile_cols_A, w, idxm(k, tile_cols_A)].astype("int32") + * B_interleaved_t[y, k // tile_cols_B, z, idxm(k, tile_cols_B)].astype("int32"), + axis=k, + ), + name="C_interleaved", + ) + # Unpack the result + C = te.compute( + (batches, M, N), + lambda b, x, y: C_interleaved[ + b, + x // tile_rows_A, + y // tile_rows_B, + idxm(x, tile_rows_A), + idxm(y, tile_rows_B), + ].astype(out_dtype), + name="C", + ) zero = tvm.tir.const(0) else: # No need to pack/unpack, execute GEMM directly @@ -255,7 +310,7 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): s[data_im2col].compute_inline() # Computation(through tensorize) - b, xo, yo, xi, yi = C_interleaved.op.axis + b, xo, yo, xi, yi = C_interleaved.op.axis[0:5] outer_gemm, inner_gemm = cfg["reorder_gemm"].apply(s, C_interleaved, [xo, yo]) b_outer_gemm_fused = s[C_interleaved].fuse(b, outer_gemm) @@ -271,40 +326,50 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): k = C_interleaved.op.reduce_axis[0] _, M, N = C.shape - if is_dotprod_available(): - gemm_acc = gemm_acc_4x4_int8_int8_int32(in_type) - xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( - xi, yi, x_factor=8, y_factor=4 - ) - k_outer, k_inner = s[C_interleaved].split(k, 4) - xi_inner_outer, xi_inner_inner = 
s[C_interleaved].split(xi_inner, 4) - s[C_interleaved].reorder( - b_outer_gemm_fused, - inner_gemm, - xi_outer, - yi_outer, - k_outer, - xi_inner_outer, - xi_inner_inner, - yi_inner, - k_inner, - ) - s[C_interleaved].tensorize(xi_inner_inner, gemm_acc) - s[C_interleaved].unroll(xi_inner_outer) - - elif is_aarch64_arm(): - s[C_interleaved].reorder(yi, xi) - K = A_interleaved_input.shape[2] - assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported" - unroll = cfg["gemm_quantized_unroll"].val - interleave = cfg["gemm_quantized_interleave"].val - gemm = gemm_quantized(M, N, K, unroll, interleave, in_type, out_type) - s[C_interleaved].pragma( - b_outer_gemm_fused, - "import_llvm", - gemm_quantized_impl(M, N, K, unroll, interleave, in_type), - ) - s[C_interleaved].tensorize(yi, gemm) + if in_type in ["int8", "uint8"]: + if is_mmla_available(): + gemm_acc = gemm_acc_2x2_int8_int8_int32(in_type) + xi_inner, yi_inner = C_interleaved.op.axis[-2:] + k_outer, k_inner = s[C_interleaved].split(k, 8) + s[C_interleaved].reorder( + b_outer_gemm_fused, inner_gemm, k_outer, xi, yi, xi_inner, yi_inner, k_inner + ) + s[C_interleaved].tensorize(xi_inner, gemm_acc) + s[C_interleaved].unroll(xi) + s[C_interleaved].unroll(yi) + elif is_dotprod_available(): + gemm_acc = gemm_acc_4x4_int8_int8_int32(in_type) + xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( + xi, yi, x_factor=8, y_factor=4 + ) + k_outer, k_inner = s[C_interleaved].split(k, 4) + xi_inner_outer, xi_inner_inner = s[C_interleaved].split(xi_inner, 4) + s[C_interleaved].reorder( + b_outer_gemm_fused, + inner_gemm, + xi_outer, + yi_outer, + k_outer, + xi_inner_outer, + xi_inner_inner, + yi_inner, + k_inner, + ) + s[C_interleaved].tensorize(xi_inner_inner, gemm_acc) + s[C_interleaved].unroll(xi_inner_outer) + + elif is_aarch64_arm(): + s[C_interleaved].reorder(yi, xi) + K = A_interleaved_input.shape[2] + unroll = cfg["gemm_quantized_unroll"].val + interleave = cfg["gemm_quantized_interleave"].val + gemm = gemm_quantized(M, N, K, unroll, interleave, in_type, out_type) + s[C_interleaved].pragma( + b_outer_gemm_fused, + "import_llvm", + gemm_quantized_impl(M, N, K, unroll, interleave, in_type), + ) + s[C_interleaved].tensorize(yi, gemm) # Output transform if out != final_out: diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 1b999dfe4e80..8ccbe0c41298 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -411,6 +411,7 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): intrin : TensorIntrin The ARM uint8/int8 TensorIntrin that can be used in tensorizing schedule """ + assert in_type in ["uint8", "int8"] A = te.placeholder((K // 16, te.var("m"), 16), dtype=in_type, name="A") B = te.placeholder((K // 16, te.var("n"), 16), dtype=in_type, name="B") @@ -627,7 +628,7 @@ def gemm_acc_4x4_int8_int8_int32(dtype): Int8 4x4 matrix multiplication and accumulation using sdot/udot instructions. This function takes two arrays of int8 datatype -- A[4][4] and B[4][4] and produces a 4x4 matrix - which is equal to A*B. + which is equal to A*B'. The pseudo code is as follows. @@ -643,7 +644,6 @@ def gemm_acc_4x4_int8_int8_int32(dtype): } Notes: - * The rows of matrix B are transposed * The tiling strategy is picked to maximize register usage. 
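    A NumPy reference for these tile semantics (an illustrative sketch only;
    the intrinsic itself emits sdot/udot instructions, not a matrix product):

    .. code-block:: python

        import numpy as np
        A = np.ones((4, 4), dtype=np.int8)
        B = np.ones((4, 4), dtype=np.int8)
        C = np.zeros((4, 4), dtype=np.int32)
        # one tile update: C accumulates A * B', widened to int32
        C += A.astype(np.int32) @ B.astype(np.int32).T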
Parameters @@ -656,6 +656,7 @@ def gemm_acc_4x4_int8_int8_int32(dtype): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ + assert dtype in ["uint8", "int8"] # This needs to be a variable number of "rows" since TVM # "thinks" I only need to compute one row because of # padding @@ -755,7 +756,7 @@ def gemm_acc_nx16_int8_int8_int32(dtype, rows): """ Int8 nx16 matrix multiplication and accumulation using sdot/udot instructions This function takes two arrays of int8 datatype -- A[n][4] and - B[4][16] and produces a rowsx16 matrix which is equal to A*B + B[4][16] and produces a rowsx16 matrix which is equal to A*B' The pseudo code is as follows. .. code-block:: c @@ -771,7 +772,6 @@ def gemm_acc_nx16_int8_int8_int32(dtype, rows): } Notes: - * The rows of matrix B are transposed * The tile size of B is 16x4. Since the reduction variable k moves between 0 and 16 we need 4 tiles of B to compute a single row of the output. The first 4 values of k will be fetched from B[0][j][k], the second batch of 4 from B[1][j][k] and so on @@ -789,6 +789,7 @@ def gemm_acc_nx16_int8_int8_int32(dtype, rows): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ + assert dtype in ["uint8", "int8"] A = te.placeholder((rows, 16), dtype, name="A") B = te.placeholder((4, 16, 4), dtype, name="B") dtype_vec = dtype + "x16" @@ -969,6 +970,103 @@ def _instr(index): ) +def gemm_acc_2x2_int8_int8_int32(dtype): + """ + Int8 2x2 matrix multiplication using smmla/ummla instructions + This function takes two arrays of int8 datatype -- A[2][8] and + B[2][8] and produces a 2x2 matrix which is equal to A*B' + The pseudo code is as follows. + + .. code-block:: c + + void mmla_2x2_int8_int8_int32(int8 A[2][8], int8 B[2][8], int32 C[2][2]){ + for (int i = 0; i < 2; i++){ + for (int j = 0; i < 2; i++){ + for (int k = 0; k < 8; k++){ + C[i][j] += A[i][k] * B[j][k] + } + } + } + + Parameters + ---------- + dtype: str, {"uint8", "int8"} + Whether it works on unsigned int or signed int + + Returns + ------- + intrin : TensorIntrin + The Arm TensorIntrin that can be used in tensorizing schedule + """ + assert dtype in ["uint8", "int8"] + A = te.placeholder((2, 8), dtype, name="A") + B = te.placeholder((2, 8), dtype, name="B") + dtype_vec = dtype + "x16" + + k = te.reduce_axis((0, 8), name="k") + C = te.compute( + (2, 2), + lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"), axis=k), + name="C", + ) + + aa_buffer = tvm.tir.decl_buffer( + A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + ) + bb_buffer = tvm.tir.decl_buffer( + B.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1] + ) + cc_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) + + llvm_intrin = "llvm.aarch64.neon.smmla" if dtype == "int8" else "llvm.aarch64.neon.ummla" + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + ib.emit(outs[0].vstore([0, 0], tvm.tir.const(0, "int32x4"))) + return ib.get() + # Load in vec_a the two rows of A + # vec_a = [a, b, c, d, e, f, g, h; + # i, j, k, l, m, n, o, p,] + vec_a = ins[0].vload([0, 0], dtype_vec) + # Load in vec_b the two rows of B + # vec_b = [0, 2, 4, 6, 8, 10, 12, 14; + # 1, 3, 5, 7, 9, 11, 13, 14,] + vec_b = ins[1].vload([0, 0], dtype_vec) + + # Execute the matrix multiplication via (s/u)mmla: + # vec_c = [a*0 + b*2 + c*4 + d*6 +e*8 + f*10 + g*12 + h*14; + # a*1 + b*3 + 
c*5 + d*7 +e*9 + f*11 + g*13 + h*15; + # i*0 + j*2 + k*4 + l*6 +m*8 + n*10 + o*12 + p*14; + # i*1 + j*3 + k*5 + l*7 +m*9 + n*11 + o*13 + p*15] + vec_c = outs[0].vload([0, 0], "int32x4") + vmmla = tvm.tir.call_llvm_intrin( + "int32x4", + llvm_intrin, + tvm.tir.const(3, "uint32"), + vec_c, + vec_a, + vec_b, + ) + # Store the result + ib.emit(outs[0].vstore([0, 0], vmmla)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, + _intrin_func, + binds={A: aa_buffer, B: bb_buffer, C: cc_buffer}, + default_buffer_params=buffer_params, + ) + + def _q_multiply_shift_arm(op): """ Implementation of q_multiply_shift_arm through arm intrinsics diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index e5b0689da008..1bf83eba53ac 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -72,6 +72,12 @@ def compile_conv2d_NHWC_gemm_int8_arm( topi.arm_cpu.compute_conv2d_NHWC_quantized_native, topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, ), + # TODO(giuseros) Need LLVM-11 in order to compile with +i8mm extension + # ( + # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm", + # topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + # topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + # ), ] for device_tuple in devices: From 72bc5a785271a7a146679107510af83c8edac294 Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Sat, 7 Nov 2020 02:41:04 +0100 Subject: [PATCH 143/258] Update search for bitcode files for rocm 3.9 (#6865) rocm 3.9 moved the bitcodes, we adapt to that. As this gives opaque error messages that are hard to debug (loading the module fails with could not initialize shared object but does not tell you about the missing symbols), we tighten the checks at this stage: - we become more strict with missing bitcodes, - we let the linker fail loudly for unresolved symbols. --- python/tvm/contrib/rocm.py | 72 +++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index e69b2558c0b5..4f62f1a8da26 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -73,7 +73,20 @@ def rocm_link(in_file, out_file, lld=None): The lld linker, if not specified, we will try to guess the matched clang version. """ - args = [lld if lld is not None else find_lld()[0], "-shared", in_file, "-o", out_file] + + # if our result has undefined symbols, it will fail to load + # (hipModuleLoad/hipModuleLoadData), but with a somewhat opaque message + # so we have ld.lld check this here. + # If you get a complaint about missing symbols you might want to check the + # list of bitcode files below. 
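+    # (Illustrative note, not part of this patch: the invocation built below
+    #  is equivalent to e.g. `ld.lld --no-undefined -shared in.o -o out.so`,
+    #  so unresolved symbols now fail loudly at link time instead of at
+    #  module load time.)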
+ args = [ + lld if lld is not None else find_lld()[0], + "--no-undefined", + "-shared", + in_file, + "-o", + out_file, + ] proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = proc.communicate() @@ -108,7 +121,7 @@ def callback_rocm_link(obj_bin): @tvm._ffi.register_func("tvm_callback_rocm_bitcode_path") -def callback_rocm_bitcode_path(rocdl_dir="/opt/rocm/lib/"): +def callback_rocm_bitcode_path(rocdl_dir=None): """Utility function to find ROCm device library bitcodes Parameters @@ -118,23 +131,40 @@ def callback_rocm_bitcode_path(rocdl_dir="/opt/rocm/lib/"): The default value is the standard location """ # seems link order matters. - bitcode_files = [ - "oclc_daz_opt_on.amdgcn.bc", - "ocml.amdgcn.bc", - "hc.amdgcn.bc", - "irif.amdgcn.bc", - "ockl.amdgcn.bc", - "oclc_correctly_rounded_sqrt_off.amdgcn.bc", - "oclc_correctly_rounded_sqrt_on.amdgcn.bc", - "oclc_daz_opt_off.amdgcn.bc", - "oclc_finite_only_off.amdgcn.bc", - "oclc_finite_only_on.amdgcn.bc", - "oclc_isa_version_803.amdgcn.bc", - "oclc_isa_version_900.amdgcn.bc", - "oclc_isa_version_906.amdgcn.bc", - "oclc_unsafe_math_off.amdgcn.bc", - "oclc_unsafe_math_on.amdgcn.bc", - "oclc_wavefrontsize64_on.amdgcn.bc", + + if rocdl_dir is None: + if exists("/opt/rocm/amdgcn/bitcode/"): + rocdl_dir = "/opt/rocm/amdgcn/bitcode/" # starting with rocm 3.9 + else: + rocdl_dir = "/opt/rocm/lib/" # until rocm 3.8 + + bitcode_names = [ + "oclc_daz_opt_on", + "ocml", + "hc", + "irif", # this does not exist in rocm 3.9, drop eventually + "ockl", + "oclc_correctly_rounded_sqrt_off", + "oclc_correctly_rounded_sqrt_on", + "oclc_daz_opt_off", + "oclc_finite_only_off", + "oclc_finite_only_on", + "oclc_isa_version_803", # todo (t-vi): an alternative might be to scan for the + "oclc_isa_version_900", # isa version files (if the linker throws out + "oclc_isa_version_906", # the unneeded ones or we filter for the arch we need) + "oclc_unsafe_math_off", + "oclc_unsafe_math_on", + "oclc_wavefrontsize64_on", ] - paths = [join(rocdl_dir, bitcode) for bitcode in bitcode_files] - return tvm.runtime.convert([path for path in paths if exists(path)]) + + bitcode_files = [] + for n in bitcode_names: + p = join(rocdl_dir, n + ".bc") # rocm >= 3.9 + if not exists(p): # rocm <= 3.8 + p = join(rocdl_dir, n + ".amdgcn.bc") + if exists(p): + bitcode_files.append(p) + elif "isa_version" not in n and n not in {"irif"}: + raise RuntimeError("could not find bitcode " + n) + + return tvm.runtime.convert(bitcode_files) From 12ab4559447f906fc8422f54711fe8900d429553 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 6 Nov 2020 18:20:56 -0800 Subject: [PATCH 144/258] making quantization tweaks (#6731) --- python/tvm/relay/quantize/_annotate.py | 43 ++++++++++++++++++++++++++ src/relay/quantize/realize.cc | 36 +++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index b187387a56c2..6c395e257cc7 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -175,6 +175,28 @@ def conv2d_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) +@register_annotate_function("nn.conv1d") +def conv1d_rewrite(ref_call, new_args, ctx): + """Rewrite function for conv1d. Lhs of conv will be quantized to + input field, and rhs of conv will be quantized to weight field. 
+ Output would be in activation field""" + if quantize_context().check_to_skip(ref_call): + return None + + lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) + rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) + + if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: + lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) + + assert rhs_kind is None + rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) + + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + + @register_annotate_function("nn.dense") def dense_rewrite(ref_call, new_args, ctx): """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of @@ -289,6 +311,8 @@ def identity_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.relu", identity_rewrite) register_annotate_function("strided_slice", identity_rewrite) register_annotate_function("nn.avg_pool2d", identity_rewrite) +register_annotate_function("nn.batch_flatten", identity_rewrite) +register_annotate_function("transpose", identity_rewrite) register_annotate_function("annotation.stop_fusion", identity_rewrite) @@ -311,6 +335,25 @@ def pool2d_rewrite(ref_call, new_args, ctx): register_annotate_function("nn.max_pool2d", pool2d_rewrite) +def pool1d_rewrite(ref_call, new_args, ctx): + """Rewrite function for max pool1d""" + if quantize_context().check_to_skip(ref_call): + return None + + expr, x_kind = _get_expr_kind(new_args[0]) + + if x_kind is None: + return None + if x_kind == QAnnotateKind.ACTIVATION: + expr = attach_simulated_quantize(expr, QAnnotateKind.INPUT) + + expr = _forward_op(ref_call, [expr]) + return QAnnotateExpr(expr, QAnnotateKind.INPUT) + + +register_annotate_function("nn.max_pool1d", pool1d_rewrite) + + @register_annotate_function("annotation.cast_hint") def cast_hint_rewrite(ref_call, new_args, ctx): """Rewrite function to force cast""" diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 8db72a3f2b32..2716c6e65f65 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -234,6 +234,37 @@ Expr Conv2dRealize(const Call& ref_call, const Array& new_args, const Obje RELAY_REGISTER_OP("nn.conv2d").set_attr("FQRealizeRewrite", Conv2dRealize); +Expr Conv1dRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { + const QConfig& cfg = QConfig::Current(); + CHECK_EQ(new_args.size(), 2); + if (!new_args[0]->IsInstance() && !new_args[1]->IsInstance()) { + return Expr(nullptr); + } + const auto* lhs = new_args[0].as(); + CHECK(lhs); + const auto* rhs = new_args[1].as(); + CHECK(rhs); + + Expr ldata = lhs->data; + if (lhs->dtype != cfg->dtype_input) { + ldata = Cast(ldata, cfg->dtype_input); + } + Expr rdata = Cast(rhs->data, cfg->dtype_weight); + + const auto ref_attrs = ref_call->attrs.as(); + auto attrs = make_object(); + *attrs = *ref_attrs; + DataType out_dtype = cfg->dtype_activation; + attrs->out_dtype = out_dtype; + + Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args); + Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); + Expr dom_scale = FoldConstantOpt(mul); + return QRealizeIntExpr(ret, dom_scale, out_dtype); +} + +RELAY_REGISTER_OP("nn.conv1d").set_attr("FQRealizeRewrite", Conv1dRealize); + Expr DenseRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const QConfig& cfg = QConfig::Current(); ICHECK_EQ(new_args.size(), 2); @@ -449,6 +480,8 @@ 
RELAY_REGISTER_OP("strided_slice").set_attr("FQRealizeRewrite", RELAY_REGISTER_OP("nn.batch_flatten") .set_attr("FQRealizeRewrite", IdentityRealize); +RELAY_REGISTER_OP("transpose").set_attr("FQRealizeRewrite", IdentityRealize); + RELAY_REGISTER_OP("annotation.stop_fusion") .set_attr("FQRealizeRewrite", IdentityRealize); @@ -469,6 +502,9 @@ Expr CastDtypeInputRealize(const Call& ref_call, const Array& new_args, RELAY_REGISTER_OP("nn.max_pool2d") .set_attr("FQRealizeRewrite", CastDtypeInputRealize); +RELAY_REGISTER_OP("nn.max_pool1d") + .set_attr("FQRealizeRewrite", CastDtypeInputRealize); + Expr AvgPoolRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const QConfig& cfg = QConfig::Current(); ICHECK_EQ(new_args.size(), 1); From d8fb97436a885ba5fd2addfe54bf91c410e98d6c Mon Sep 17 00:00:00 2001 From: Alex Gladkov Date: Fri, 6 Nov 2020 18:53:54 -0800 Subject: [PATCH 145/258] conv1d_transpose speedup. (#6840) Improve performance of transposed convolution by avoiding redundant multiplication by zero values from dilated data. Co-authored-by: Ubuntu --- python/tvm/topi/cuda/conv1d_transpose_ncw.py | 75 +++++++++---------- .../python/test_topi_conv1d_transpose_ncw.py | 4 + 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/python/tvm/topi/cuda/conv1d_transpose_ncw.py b/python/tvm/topi/cuda/conv1d_transpose_ncw.py index 1ddbdcca9b36..58f53eab20ac 100644 --- a/python/tvm/topi/cuda/conv1d_transpose_ncw.py +++ b/python/tvm/topi/cuda/conv1d_transpose_ncw.py @@ -65,29 +65,46 @@ def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype, output_p out_width = (inp_width - 1) * stride + kernel_size - pad_left - pad_right + output_padding pad_left = kernel_size - 1 - pad_left pad_right = kernel_size - 1 - pad_right + output_padding - dilated_width = stride * (inp_width - 1) + 1 - data = te.compute( - (batch, inp_channels, pad_left + dilated_width + pad_right), + padded_width = pad_left + inp_width + pad_right + + padded_data = te.compute( + (batch, inp_channels, padded_width), lambda n, c, x: tvm.tir.if_then_else( - tvm.tir.all( - x >= pad_left, - x < pad_left + dilated_width, - tvm.tir.indexmod(x - pad_left, stride).equal(0), - ), - data[n, c, tvm.tir.indexdiv(x - pad_left, stride)], + tvm.tir.all(x >= pad_left, x < pad_left + inp_width), + data[n, c, x - pad_left], tvm.tir.const(0.0, "float32"), ), name="data_pad", ) - dc = te.reduce_axis((0, inp_channels), name="dc") - dw = te.reduce_axis((0, kernel_size), name="dw") + padded_kernel = te.compute( + (inp_channels, out_channels, kernel_size + stride - 1), + lambda ci, co, k: tvm.tir.if_then_else( + tvm.tir.all(k < kernel_size), + kernel[ci, co, kernel_size - k - 1], + tvm.tir.const(0.0, "float32"), + ), + name="kernel_pad", + ) + + ci = te.reduce_axis((0, inp_channels), name="ci") + k = te.reduce_axis((0, tvm.tir.indexdiv(kernel_size + stride - 1, stride)), name="k") + border = pad_left * (stride - 1) + + # Skip multiplication by 0 values in the input data inserted when stride is greater then 1. 
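+    # (Illustrative example, not part of this patch: with stride=2 the old
+    #  dilated input was [d0, 0, d1, 0, d2, ...]; indexing the undilated
+    #  input at indexdiv(border + w + stride - 1, stride) + k touches only
+    #  the d's, cutting the reduction length by roughly a factor of stride.)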
+ # During multiplication of kernel by padded data: + # Kernel indices are: 0, 1 * stride, 2 * stride, ..., ceil(kernel_size / stride) plus + # data offset mod stride data_out = te.compute( (batch, out_channels, out_width), - lambda b, c, w: te.sum( - data[b, dc, w + dw].astype(out_dtype) - * kernel[dc, c, kernel_size - 1 - dw].astype(out_dtype), - axis=[dc, dw], + lambda b, co, w: te.sum( + padded_data[b, ci, tvm.tir.indexdiv(border + w + stride - 1, stride) + k].astype( + out_dtype + ) + * padded_kernel[ + ci, co, k * stride + tvm.tir.indexmod(stride - w - border, stride) + ].astype(out_dtype), + axis=[ci, k], ), tag="conv1d_transpose_ncw", ) @@ -118,8 +135,8 @@ def schedule_conv1d_transpose_ncw(cfg, outs): def _callback(op): if op.tag == "conv1d_transpose_ncw": - pad_data = op.input_tensors[0] - kernel = op.input_tensors[1] + padded_data = op.input_tensors[0] + padded_kernel = op.input_tensors[1] conv = op.output(0) ##### space definition begin ##### @@ -139,9 +156,6 @@ def _callback(op): ##### space definition end ##### - if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: - s[kernel].compute_inline() - if conv.op in s.outputs: output = conv OL = s.cache_write(conv, "local") @@ -150,10 +164,8 @@ def _callback(op): s[conv].set_scope("local") OL = conv - # create cache stage - s[pad_data].set_scope("shared") - AA = pad_data - WW = s.cache_read(kernel, "shared", [OL]) + s[padded_kernel].compute_inline() + s[padded_data].compute_inline() # tile and bind spatial axes n, f, x = s[output].op.axis @@ -172,9 +184,6 @@ def _callback(op): s[output].bind(tx, te.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tx) - # number of threads - n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] - n_tx = cfg["tile_x"].size[2] # tile reduction axes n, f, x = s[OL].op.axis @@ -182,18 +191,6 @@ def _callback(op): rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc) s[OL].reorder(rco, rcm, rx, rci, n, f, x) - s[AA].compute_at(s[OL], rx) - s[WW].compute_at(s[OL], rx) - - # cooperative fetching - for load in [AA, WW]: - n, f, x = s[load].op.axis - fused = s[load].fuse(f, x) - tz, fused = s[load].split(fused, nparts=n_tz) - tx, fused = s[load].split(fused, nparts=n_tx) - s[load].bind(tz, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) - s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) diff --git a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py index c251283f8011..2b8c486b8cd1 100644 --- a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py +++ b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py @@ -91,9 +91,13 @@ def test_conv1d_transpose_ncw(): verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 2, 256, (0,)) verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 5, 256, (0,)) verify_conv1d_transpose_ncw(1, 1, 1024, 1, 512, 5, 256, (3,)) + verify_conv1d_transpose_ncw(1, 2, 1024, 1, 128, 128, 0, (0,)) + verify_conv1d_transpose_ncw(1, 1, 1024, 2, 128, 128, 0, (0,)) + verify_conv1d_transpose_ncw(1, 1, 1024, 2, 2, 2, 0, (0,)) verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (0, 3), (0,)) verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (1, 3), (0,)) verify_conv1d_transpose_ncw(1, 1, 10, 1, 5, 1, (2, 3), (0,)) + verify_conv1d_transpose_ncw(1, 257, 128, 1, 512, 128, 256, (0,)) if __name__ == "__main__": From 74bc0fd2f05ddd1bdfecfec539f4f8be796f1924 Mon Sep 17 00:00:00 2001 
From: Chris Hoge Date: Fri, 6 Nov 2020 21:07:08 -0800 Subject: [PATCH 146/258] Fix bug in processing script (#6867) The argsort command returns a new array that is the sorted index rather than a new sorted value array. This patch stores the sorted index in a new variable and uses it to reference the predicted values. --- tutorials/get_started/tvmc_command_line_driver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/get_started/tvmc_command_line_driver.py b/tutorials/get_started/tvmc_command_line_driver.py index d844de592035..bcdf03e56875 100644 --- a/tutorials/get_started/tvmc_command_line_driver.py +++ b/tutorials/get_started/tvmc_command_line_driver.py @@ -246,10 +246,10 @@ with np.load(output_file) as data: scores = softmax(data["output_0"]) scores = np.squeeze(scores) - scores = np.argsort(scores)[::-1] + ranks = np.argsort(scores)[::-1] - for i in scores[0:5]: - print("class='%s' with probability=%f" % (labels[i], scores[i])) + for rank in ranks[0:5]: + print("class='%s' with probability=%f" % (labels[rank], scores[rank])) ######################################################################## From a7e1c38c343ae7ab20bf03ca3eef249ac29a8a5d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 7 Nov 2020 09:11:48 -0500 Subject: [PATCH 147/258] [COMMUNITY] New committer -- @mbaret (#6873) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 358d42cf7a19..8328e1c625e2 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -40,6 +40,7 @@ We add tag along with committer name to show areas that they are familiar with. We do encourage everyone to work anything they are interested in. - [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm +- [Matthew Barrett](https://github.com/mbaret): @mbaret - byoc, arm - [Tianqi Chen](https://github.com/tqchen) (PPMC): @tqchen - topi, compiler, relay, docs - [Liangfu Chen](https://github.com/liangfu): @liangfu - vta, chisel, intel FPGA, c runtime - [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm From 7a3e44f1f981f1a4891796b741d68bf30476d680 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sat, 7 Nov 2020 11:10:25 -0800 Subject: [PATCH 148/258] [AutoScheduler] Make SearchTask and ComputeDAG serializable (#6842) * serialize task and dag * fix test * more tests * format * format * format * trigger ci --- python/tvm/auto_scheduler/compute_dag.py | 23 ++++++++++---- python/tvm/auto_scheduler/search_task.py | 29 ++++++++++++++++++ .../unittest/test_auto_scheduler_common.py | 4 +-- .../test_auto_scheduler_compute_dag.py | 30 ++++++++++++++++++- 4 files changed, 77 insertions(+), 9 deletions(-) diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py index d50ff395b679..9390a9c4589a 100755 --- a/python/tvm/auto_scheduler/compute_dag.py +++ b/python/tvm/auto_scheduler/compute_dag.py @@ -21,14 +21,14 @@ import tvm._ffi from tvm.runtime import Object -from tvm.te import PlaceholderOp, ComputeOp +from tvm.runtime._ffi_node_api import LoadJSON, SaveJSON +from tvm.te import ComputeOp, PlaceholderOp +from . import _ffi_api from .loop_state import State, StateObject from .utils import get_const_tuple from .workload_registry import workload_key_to_tensors -from . 
import _ffi_api - @tvm._ffi.register_object("auto_scheduler.ComputeDAG") class ComputeDAG(Object): @@ -63,7 +63,10 @@ def __init__(self, compute_or_sche): elif isinstance(compute_or_sche, list): for item in compute_or_sche: if not isinstance(item, tvm.te.Tensor): - raise ValueError("The input of ComputeDAG should be a list of Tensor") + raise ValueError( + "The input of ComputeDAG should be a list of Tensor, but got %s" + % type(item) + ) compute = compute_or_sche sche = None elif isinstance(compute_or_sche, tvm.te.Schedule): @@ -72,8 +75,10 @@ def __init__(self, compute_or_sche): else: raise ValueError( "Invalid compute type: %s. ComputeDAG expects string, list of Tensor, or Schedule" - % type(compute) + % type(compute_or_sche) ) + self.compute = compute + self.sche = sche self.__init_handle_by_constructor__(_ffi_api.ComputeDAG, compute, sche) def get_init_state(self): @@ -182,3 +187,11 @@ def hash_key(self): str_key = str_key.encode(encoding="utf-8") return hashlib.md5(str_key).hexdigest() + + def __getstate__(self): + return {"compute": SaveJSON(self.compute), "sche": SaveJSON(self.sche)} + + def __setstate__(self, state): + self.compute = LoadJSON(state["compute"]) # pylint: disable=assignment-from-no-return + self.sche = LoadJSON(state["sche"]) # pylint: disable=assignment-from-no-return + self.__init_handle_by_constructor__(_ffi_api.ComputeDAG, self.compute, self.sche) diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py index 92c4f48bf371..7c5021b3f9b7 100644 --- a/python/tvm/auto_scheduler/search_task.py +++ b/python/tvm/auto_scheduler/search_task.py @@ -42,6 +42,35 @@ class SearchTask(Object): """ def __init__(self, dag, workload_key, target, target_host=None, hardware_params=None): + self.dag = dag + self.workload_key = workload_key + self.target = target + self.target_host = target_host + self.hardware_params = hardware_params self.__init_handle_by_constructor__( _ffi_api.SearchTask, dag, workload_key, target, target_host, hardware_params ) + + def __getstate__(self): + return { + "dag": self.dag, + "workload_key": self.workload_key, + "target": self.target, + "target_host": self.target_host, + "hardware_params": self.hardware_params, + } + + def __setstate__(self, state): + self.dag = state["dag"] + self.workload_key = state["workload_key"] + self.target = state["target"] + self.target_host = state["target_host"] + self.hardware_params = state["hardware_params"] + self.__init_handle_by_constructor__( + _ffi_api.SearchTask, + self.dag, + self.workload_key, + self.target, + self.target_host, + self.hardware_params, + ) diff --git a/tests/python/unittest/test_auto_scheduler_common.py b/tests/python/unittest/test_auto_scheduler_common.py index 6a3fe4e82c99..5b7add9733de 100644 --- a/tests/python/unittest/test_auto_scheduler_common.py +++ b/tests/python/unittest/test_auto_scheduler_common.py @@ -161,14 +161,12 @@ def conv2d_winograd_nhwc_auto_scheduler_test( r = KW m = tile_size alpha = m + r - 1 - A, B, G = winograd_transform_matrices(m, r, "float32") + A, B, _ = winograd_transform_matrices(m, r, "float32") H = (H + 2 * HPAD - KH) // HSTR + 1 W = (W + 2 * WPAD - KW) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW - r_kh = te.reduce_axis((0, KH), name="r_kh") - r_kw = te.reduce_axis((0, KW), name="r_kw") kshape = (alpha, alpha, CI, CO) kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight") diff --git a/tests/python/unittest/test_auto_scheduler_compute_dag.py 
b/tests/python/unittest/test_auto_scheduler_compute_dag.py index 2ccedef9e2de..e7774753796c 100644 --- a/tests/python/unittest/test_auto_scheduler_compute_dag.py +++ b/tests/python/unittest/test_auto_scheduler_compute_dag.py @@ -16,6 +16,7 @@ # under the License. """Test ComputeDAG (replay, infer bound)""" +import pickle import tvm from tvm import topi @@ -32,7 +33,7 @@ def test_apply_steps(): dag, s = get_tiled_matmul() dag.print_python_code_from_state(s) sch, tensors = dag.apply_steps_from_state(s) - stmt = tvm.lower(sch, tensors, simple_mode=True) + tvm.lower(sch, tensors, simple_mode=True) def test_infer_bound(): @@ -61,6 +62,7 @@ def test_estimate_flop(): def test_stage_order(): + """Test if the stage order is preserved when recovering a DAG.""" N = 512 A, B, C, D, E = parallel_matmul_auto_scheduler_test(N) sch = te.create_schedule([D.op, E.op]) @@ -87,6 +89,11 @@ def test_stage_order(): elif op.name in ["B", "C"]: assert stage_ops_1[idx + 1].name == "%s.shared" % op.name + # Serialize and deserialize the ComputeDAG constructed by a schedule. + loaded_dag = pickle.loads(pickle.dumps(dag)) + assert str(loaded_dag.get_init_state()) == str(dag.get_init_state()) + assert len(loaded_dag.get_init_state().stage_ops) == len(dag.get_init_state().stage_ops) + # Apply the same schedule to Ansor state and it should have the same stage order dag = auto_scheduler.ComputeDAG([A, B, C, D, E]) state = dag.get_init_state() @@ -105,6 +112,27 @@ def test_stage_order(): for op1, op2 in zip(stage_ops_1, stage_ops_2): assert op1.name == op2.name + # Serialize and deserialize the ComputeDAG constructed by a list of tensor ops. + loaded_dag = pickle.loads(pickle.dumps(dag)) + assert str(loaded_dag.get_init_state()) == str(dag.get_init_state()) + assert len(loaded_dag.get_init_state().stage_ops) == len(dag.get_init_state().stage_ops) + + # Serialize and deserialize the search task. 
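+    # (Note: the round-trip works because SearchTask and ComputeDAG define
+    #  __getstate__/__setstate__ above, rebuilding the underlying C++ objects
+    #  through their FFI constructors on load.)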
+ task = auto_scheduler.SearchTask( + dag, + "test1", + tvm.target.Target("llvm"), + hardware_params=auto_scheduler.HardwareParams(100000, 16, 64), + ) + task2 = pickle.loads(pickle.dumps(task)) + assert str(task.dag.get_init_state()) == str(task2.dag.get_init_state()) + assert len(task.dag.get_init_state().stage_ops) == len(task2.dag.get_init_state().stage_ops) + assert task.workload_key == task2.workload_key + assert str(task.target) == str(task2.target) + assert task.hardware_params.num_cores == task2.hardware_params.num_cores + assert task.hardware_params.vector_unit_bytes == task2.hardware_params.vector_unit_bytes + assert task.hardware_params.cache_line_bytes == task2.hardware_params.cache_line_bytes + if __name__ == "__main__": test_apply_steps() From 5a486db894c00c53dd8de7c24e08726277c47ba8 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Sat, 7 Nov 2020 12:58:57 -0800 Subject: [PATCH 149/258] [BYOC][TRT] Allocate GPU data buffers and transfer data when needed (#6872) * Allocate data buffers for gpu fix * Rename AllocateDeviceBuffer, update docstrings * Remove unneeded cast --- .../contrib/tensorrt/tensorrt_builder.cc | 43 +++++++++++++++---- .../contrib/tensorrt/tensorrt_builder.h | 27 ++++++++++-- .../contrib/tensorrt/tensorrt_runtime.cc | 41 ++++++++++++++---- tests/python/contrib/test_tensorrt.py | 29 ++++++++++--- 4 files changed, 114 insertions(+), 26 deletions(-) diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index d308200eba05..4060b240cf8e 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -37,9 +37,12 @@ namespace tvm { namespace runtime { namespace contrib { -TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, size_t max_workspace_size, - bool use_implicit_batch, bool use_fp16, int batch_size) - : max_workspace_size_(max_workspace_size), +TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, + const std::vector& data_entry, + size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, + int batch_size) + : data_entry_(data_entry), + max_workspace_size_(max_workspace_size), use_implicit_batch_(use_implicit_batch), use_fp16_(use_fp16), batch_size_(batch_size) { @@ -63,7 +66,7 @@ TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, size_t max_workspace_si #endif } -void TensorRTBuilder::AddInput(int nid, const JSONGraphNode& node) { +void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode& node) { auto node_name = node.GetOpName(); auto shapes = node.GetOpShape(); auto dtypes = node.GetOpDataType(); @@ -80,7 +83,8 @@ void TensorRTBuilder::AddInput(int nid, const JSONGraphNode& node) { ICHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported."; auto input_tensor = network_->addInput(name.c_str(), nvinfer1::DataType::kFLOAT, dims); node_output_map_[nid].push_back(TensorRTOpInput(input_tensor)); - network_input_names_.push_back(input_tensor->getName()); + network_input_names_.push_back(name); + entry_id_map_[name] = entry_id + i; } } @@ -94,14 +98,15 @@ void TensorRTBuilder::AddConstant(int nid, const DLTensor* data) { node_output_map_[nid] = {TensorRTOpInput(weight, shape)}; } -void TensorRTBuilder::AddOutput(const JSONGraphNodeEntry& node) { +void TensorRTBuilder::AddOutput(const JSONGraphNodeEntry& node, uint32_t entry_id) { auto it = node_output_map_.find(node.id_); ICHECK(it != node_output_map_.end()) << "Output was not found."; auto out_tensor = 
it->second[node.index_].tensor; std::string name = "tensorrt_output_" + std::to_string(network_output_names_.size()); out_tensor->setName(name.c_str()); network_->markOutput(*out_tensor); - network_output_names_.push_back(out_tensor->getName()); + network_output_names_.push_back(name); + entry_id_map_[name] = entry_id; } void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) { @@ -168,7 +173,16 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { ICHECK_EQ(engine->getNbBindings(), network_input_names_.size() + network_output_names_.size()); nvinfer1::IExecutionContext* context = engine->createExecutionContext(); CleanUp(); - return {engine, context, network_input_names_, network_output_names_}; + + // Allocate I/O buffers on GPU for TVM inputs which are on a different context. + std::vector device_buffers(engine->getNbBindings()); + for (size_t i = 0; i < network_input_names_.size(); ++i) { + AllocateDeviceBuffer(engine, network_input_names_[i], &device_buffers); + } + for (size_t i = 0; i < network_output_names_.size(); ++i) { + AllocateDeviceBuffer(engine, network_output_names_[i], &device_buffers); + } + return {engine, context, network_input_names_, network_output_names_, device_buffers}; } nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, @@ -217,6 +231,19 @@ void TensorRTBuilder::CleanUp() { } } +void TensorRTBuilder::AllocateDeviceBuffer(nvinfer1::ICudaEngine* engine, const std::string& name, + std::vector* device_buffers) { + const uint32_t entry_id = entry_id_map_[name]; + if (data_entry_[entry_id]->ctx.device_type != kDLGPU) { + const int binding_index = engine->getBindingIndex(name.c_str()); + ICHECK_NE(binding_index, -1); + std::vector shape(data_entry_[entry_id]->shape, + data_entry_[entry_id]->shape + data_entry_[entry_id]->ndim); + device_buffers->at(binding_index) = + runtime::NDArray::Empty(shape, data_entry_[entry_id]->dtype, {kDLGPU, 0}); + } +} + } // namespace contrib } // namespace runtime } // namespace tvm diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.h b/src/runtime/contrib/tensorrt/tensorrt_builder.h index efb4d8175650..4926a4d02685 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.h +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.h @@ -25,6 +25,8 @@ #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ +#include + #include #include #include @@ -50,6 +52,8 @@ struct TensorRTEngineAndContext { nvinfer1::IExecutionContext* context; std::vector inputs; std::vector outputs; + /*! \brief GPU buffers for inputs and outputs. */ + std::vector device_buffers; }; /*! @@ -69,15 +73,17 @@ class TensorRTBuilder { * \param use_fp16 Whether to use implicit batch mode (default) * \param batch_size If use_implicit_batch, */ - TensorRTBuilder(TensorRTLogger* logger, size_t max_workspace_size, bool use_implicit_batch, - bool use_fp16, int batch_size); + TensorRTBuilder(TensorRTLogger* logger, const std::vector& data_entry, + size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, + int batch_size); /*! * \brief Add TensorRT input(s) for input node in network definition. * \param nid The input node id. + * \param entry_id The index into data_entry_ for first entry in node. * \param node The input node. */ - void AddInput(int nid, const JSONGraphNode& node); + void AddInput(int nid, uint32_t entry_id, const JSONGraphNode& node); /*! * \brief Add TensorRT weight for input constant in network definition. 
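  // (Editorial summary, not part of this patch: AllocateDeviceBuffer creates
  //  a GPU NDArray only for bindings whose DLTensor lives off-GPU; tensors
  //  already on the GPU are bound to TensorRT directly, with no extra copy.)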
@@ -96,8 +102,9 @@ class TensorRTBuilder { /*! * \brief Mark TensorRT output in network definition. * \param entry The output node entry. + * \param entry_id The output node entry id. */ - void AddOutput(const JSONGraphNodeEntry& entry); + void AddOutput(const JSONGraphNodeEntry& entry, uint32_t entry_id); /*! * \brief Takes network definition and "compiles" a TensorRT engine which can be used for @@ -116,6 +123,12 @@ class TensorRTBuilder { /*! \brief Clean up resources used to create engine. */ void CleanUp(); + /*! \brief Allocate a GPU buffer for input or output DLTensor, only if the context is not GPU + * already. Inputs that are already on the GPU can be passed directly to TensorRT and will not + * need a buffer. */ + void AllocateDeviceBuffer(nvinfer1::ICudaEngine* engine, const std::string& name, + std::vector* device_buffers); + /*! \brief Maps a node to its outputs. */ std::unordered_map> node_output_map_; @@ -133,6 +146,12 @@ class TensorRTBuilder { /*! \brief List of all weights held in memory. */ std::vector trt_weights_; + /*! \brief Input and output tensors from TVM. */ + const std::vector& data_entry_; + + /*! \brief Map TensorRT binding name to index in data_entry_. */ + std::unordered_map entry_id_map_; + /*! \brief Max workspace size in bytes for TRT. */ size_t max_workspace_size_; diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index f183e2f24449..445010321668 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -78,8 +78,6 @@ class TensorRTRuntime : public JSONRuntimeBase { LoadGlobalAttributes(); if (GetCachedEnginesFromDisk()) return; SetupConstants(consts); - BuildEngine(); - CacheEngineToDisk(); } void LoadGlobalAttributes() { @@ -106,9 +104,11 @@ class TensorRTRuntime : public JSONRuntimeBase { #ifdef TVM_GRAPH_RUNTIME_TENSORRT /*! \brief Run inference using built engine. 
*/ void Run() override { + BuildEngine(); auto& engine_and_context = trt_engine_cache_.at(symbol_name_); auto engine = engine_and_context.engine; auto context = engine_and_context.context; + auto& device_buffers = engine_and_context.device_buffers; std::vector bindings(engine->getNbBindings(), nullptr); for (size_t i = 0; i < input_nodes_.size(); ++i) { @@ -119,7 +119,12 @@ class TensorRTRuntime : public JSONRuntimeBase { const std::string name = nodes_[nid].GetOpName() + "_" + std::to_string(j); int binding_index = engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); - bindings[binding_index] = data_entry_[eid]->data; + if (data_entry_[eid]->ctx.device_type == kDLGPU) { + bindings[binding_index] = data_entry_[eid]->data; + } else { + device_buffers[binding_index].CopyFrom(data_entry_[eid]); + bindings[binding_index] = device_buffers[binding_index]->data; + } } } } @@ -129,7 +134,11 @@ class TensorRTRuntime : public JSONRuntimeBase { const std::string& name = engine_and_context.outputs[i]; int binding_index = engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); - bindings[binding_index] = data_entry_[eid]->data; + if (data_entry_[eid]->ctx.device_type == kDLGPU) { + bindings[binding_index] = data_entry_[eid]->data; + } else { + bindings[binding_index] = device_buffers[binding_index]->data; + } } #if TRT_VERSION_GE(6, 0, 1) @@ -141,18 +150,31 @@ class TensorRTRuntime : public JSONRuntimeBase { #else ICHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed."; #endif + + // Copy outputs from GPU buffers if needed. + for (size_t i = 0; i < outputs_.size(); ++i) { + uint32_t eid = EntryID(outputs_[i]); + const std::string& name = engine_and_context.outputs[i]; + int binding_index = engine->getBindingIndex(name.c_str()); + ICHECK_NE(binding_index, -1); + if (data_entry_[eid]->ctx.device_type != kDLGPU) { + device_buffers[binding_index].CopyTo(const_cast(data_entry_[eid])); + } + } } private: /*! - * \brief Build TensorRT engine from JSON representation. + * \brief Build TensorRT engine from JSON representation and cache it. If engine is already built, + * do nothing. */ void BuildEngine() { + if (trt_engine_cache_.count(symbol_name_)) return; DLOG(INFO) << "Building new TensorRT engine for subgraph " << symbol_name_; const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false); batch_size_ = GetBatchSize(); - TensorRTBuilder builder(&logger_, max_workspace_size_, use_implicit_batch_, use_fp16, - batch_size_); + TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_, + use_fp16, batch_size_); // Add inputs and constants. for (size_t i = 0; i < input_nodes_.size(); ++i) { @@ -160,7 +182,7 @@ class TensorRTRuntime : public JSONRuntimeBase { const auto& node = nodes_[nid]; std::string name = node.GetOpName(); if (node.GetOpType() == "input") { - builder.AddInput(nid, node); + builder.AddInput(nid, EntryID(nid, 0), node); } else { ICHECK_EQ(node.GetOpType(), "const"); uint32_t eid = EntryID(nid, 0); @@ -177,12 +199,13 @@ class TensorRTRuntime : public JSONRuntimeBase { // Add outputs. for (size_t i = 0; i < outputs_.size(); ++i) { - builder.AddOutput(outputs_[i]); + builder.AddOutput(outputs_[i], EntryID(outputs_[i])); } // Build engine. trt_engine_cache_[symbol_name_] = builder.BuildEngine(); DLOG(INFO) << "Finished building TensorRT engine for subgraph " << symbol_name_; + CacheEngineToDisk(); } /*! 
\brief If TVM_TENSORRT_CACHE_DIR is set, will check that directory for diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 9faf51f397f3..8e8e54e8650a 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -46,7 +46,7 @@ def skip_runtime_test(): return False -def run_and_verify_func(config): +def run_and_verify_func(config, target="cuda"): """Test a Relay func by compiling, running, and comparing TVM and TRT outputs. Parameters @@ -70,10 +70,11 @@ def run_and_verify_func(config): mod["main"] = f mod, config = tensorrt.partition_for_tensorrt(mod, params) with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): - graph, lib, graph_params = relay.build(mod, "cuda", params=params) + graph, lib, graph_params = relay.build(mod, target, params=params) if skip_runtime_test(): return - mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + ctx = tvm.context(target) + mod = graph_runtime.create(graph, lib, ctx=ctx) mod.set_input(**graph_params) mod.run(**input_dict) results = [mod.get_output(i) for i in range(mod.get_num_outputs())] @@ -82,8 +83,8 @@ def run_and_verify_func(config): mod = tvm.IRModule() mod["main"] = f with tvm.transform.PassContext(opt_level=3): - graph, lib, graph_params = relay.build(mod, "cuda", params=params) - mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + graph, lib, graph_params = relay.build(mod, target, params=params) + mod = graph_runtime.create(graph, lib, ctx=ctx) mod.set_input(**graph_params) mod.run(**input_dict) ref_results = [mod.get_output(i) for i in range(mod.get_num_outputs())] @@ -188,6 +189,23 @@ def test_tensorrt_simple(): results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] +def test_tensorrt_simple_cpu_io(): + def get_graph(): + dtype = "float32" + x_shape = (1, 3, 2, 2) + y_shape = (1, 3, 1, 1) + z_shape = (1, 1, 1, 1) + x = relay.var("x", shape=(x_shape), dtype=dtype) + y = relay.var("y", shape=(y_shape), dtype=dtype) + z = relay.var("z", shape=(z_shape), dtype=dtype) + w = z * (x + y) + out = relay.nn.relu(w) + f = relay.Function([x, y, z], out) + return f, {"x": x_shape, "y": y_shape, "z": z_shape}, ["y"] + + run_and_verify_func(get_graph(), target="llvm") + + def test_tensorrt_not_compatible(): if skip_codegen_test(): return @@ -859,6 +877,7 @@ def test_densenet121(): if __name__ == "__main__": test_tensorrt_not_compatible() test_tensorrt_simple() + test_tensorrt_simple_cpu_io() test_tensorrt_serialize() # Op tests From 19b605efb3bc443cd22722943f67ad705888e8ee Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 7 Nov 2020 19:53:15 -0800 Subject: [PATCH 150/258] register auto-scheduler to more ops (#6879) --- python/tvm/relay/op/strategy/cuda.py | 24 +++++++++++ .../test_auto_scheduler_task_extraction.py | 40 ++++++++++++++----- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 26e9a0060b66..1229a71569d0 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -146,6 +146,10 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): name="conv2d_nchw_winograd.cuda", plevel=5, ) + + strategy.add_auto_scheduler( + wrap_compute_conv2d(topi.nn.conv2d_nchw), name="conv2d_nchw" + ) elif layout == "HWCN": assert kernel_layout == "HWIO" strategy.add_implementation( @@ -271,6 +275,11 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): 
wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.cuda", ) + + strategy.add_auto_scheduler( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.cuda", + ) elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implementation( @@ -278,6 +287,11 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.cuda", ) + + strategy.add_auto_scheduler( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.cuda", + ) else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d @@ -463,6 +477,11 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): name="conv3d_ncdhw_winograd.cuda", plevel=5, ) + + strategy.add_auto_scheduler( + wrap_compute_conv3d(topi.nn.conv3d_ncdhw), + name="conv3d_ncdhw.cuda", + ) else: # layout == "NDHWC": strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ndhwc), @@ -486,6 +505,11 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): plevel=20, ) + strategy.add_auto_scheduler( + wrap_compute_conv3d(topi.nn.conv3d_ndhwc), + name="conv3d_ndhwc.cuda", + ) + if target.kind.name == "cuda" and "cudnn" in target.libs: strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True), diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py b/tests/python/relay/test_auto_scheduler_task_extraction.py index 63d4a6f6a404..9f6ddb652469 100644 --- a/tests/python/relay/test_auto_scheduler_task_extraction.py +++ b/tests/python/relay/test_auto_scheduler_task_extraction.py @@ -28,6 +28,10 @@ def get_network(name, batch_size=1, layout="NHWC"): image_shape = (224, 224, 3) elif layout == "NCHW": image_shape = (3, 224, 224) + elif layout == "NCDHW": + image_shape = (3, 16, 224, 224) + elif layout == "NDHWC": + image_shape = (3, 224, 224, 16) else: raise ValueError("Invalid layout: " + layout) @@ -39,14 +43,14 @@ def get_network(name, batch_size=1, layout="NHWC"): mod, params = relay.testing.resnet.get_workload( num_layers=50, batch_size=batch_size, layout=layout, image_shape=image_shape ) - elif name == "resnet3d-18": - mod, params = relay.testing.resnet_3d.get_workload( - num_layers=18, batch_size=batch_size, layout=layout, image_shape=image_shape - ) elif name == "mobilenet": mod, params = relay.testing.mobilenet.get_workload( batch_size=batch_size, layout=layout, image_shape=image_shape ) + elif name == "resnet3d-18": + mod, params = relay.testing.resnet_3d.get_workload( + num_layers=18, batch_size=batch_size, layout=layout, image_shape=image_shape + ) elif name == "dcgan": mod, params = relay.testing.dcgan.get_workload(batch_size=batch_size, layout=layout) elif name == "mlp": @@ -70,20 +74,34 @@ def get_network(name, batch_size=1, layout="NHWC"): @tvm.testing.requires_cuda def test_task_extraction_cuda(): auto_scheduler.enable_relay_integration() + target = tvm.target.Target("cuda") mod, params = get_network("mlp") - target = tvm.target.Target("cuda") tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - assert len(tasks) == 1 assert sum(task_weights) == 2 - mod, params = get_network("resnet-18") - target = tvm.target.Target("cuda") - tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + for layout in ["NHWC", "NCHW"]: + mod, params = get_network("resnet-18", layout=layout) + tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], 
params, target) + + assert len(tasks) == 21 + assert sum(task_weights) == 22 + + mod, params = get_network("mobilenet", layout=layout) + tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + + assert len(tasks) == 20 + assert sum(task_weights) == 28 + + for layout in ["NCDHW", "NDHWC"]: + mod, params = get_network("resnet3d-18", layout=layout) + tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + + assert len(tasks) == 21 + assert sum(task_weights) == 22 - assert len(tasks) == 21 - assert sum(task_weights) == 22 + auto_scheduler.enable_relay_integration(False) if __name__ == "__main__": From dab8783278640832f811d3756a8f849d95c5abed Mon Sep 17 00:00:00 2001 From: Balint Cristian Date: Sun, 8 Nov 2020 06:54:45 +0200 Subject: [PATCH 151/258] More flexible conv2d_NCHWc_int8 generic operator. (#6714) --- python/tvm/topi/generic/conv2d.py | 12 ++++++------ python/tvm/topi/nn/conv2d.py | 7 ++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index f23cff3bef84..7dd9aed7545d 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -51,7 +51,7 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): num_int8_elements, ) - oc_bn = int32_lanes + oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements ic_bn = 1 for bn in range(oc_bn, 0, -4): if wkl.in_filter % bn == 0: @@ -99,7 +99,7 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements): num_int8_elements, ) - oc_bn = int32_lanes + oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements ic_bn = 1 for bn in range(oc_bn, 0, -4): if wkl.in_filter % bn == 0: @@ -119,7 +119,7 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements): def schedule_conv_NCHWc_cpu_common_int8( - s, cfg, data_vec, kernel_vec, conv_out, last, int32_lanes=16, intrin=None + s, cfg, data_vec, kernel_vec, conv_out, last, int32_lanes=16, int8_elems=4, intrin=None ): """ Defines the schedule for INT8 for Intel and ARM machines @@ -180,7 +180,7 @@ def schedule_conv_NCHWc_cpu_common_int8( ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) assert oc_bn % int32_lanes == 0 - assert ic_bn % 4 == 0 # 4 (u)int8 elements in (u)int32 + assert ic_bn % int8_elems == 0 # (u)int8 elements in (u)int32 oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes) @@ -245,7 +245,7 @@ def schedule_conv_NCHWc_cpu_common_int8( def schedule_conv_NCHWc_cpu_1x1_int8( - s, cfg, data_vec, kernel_vec, conv_out, last, int32_lanes=16, intrin=None + s, cfg, data_vec, kernel_vec, conv_out, last, int32_lanes=16, int8_elems=4, intrin=None ): """ Defines the 1x1 conv schedule for INT8 for Intel and ARM machines @@ -305,7 +305,7 @@ def schedule_conv_NCHWc_cpu_1x1_int8( kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis assert oc_bn % int32_lanes == 0 - assert ic_bn % 4 == 0 # 4 (u)int8 elements in (u)int32 + assert ic_bn % int8_elems == 0 # (u)int8 elements in (u)int32 oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes) diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 2e147fc148de..cd10c757e956 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -505,7 +505,7 @@ def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, ou def conv2d_NCHWc_int8( - data, kernel, stride, padding, dilation, layout, out_layout, 
out_dtype="int32" + data, kernel, stride, padding, dilation, layout, out_layout, out_dtype="int32", n_elems=4 ): """Conv2D operator for nChw[x]c layout. @@ -539,6 +539,9 @@ def conv2d_NCHWc_int8( out_dtype : str output data type + n_elems : int + numer of int8 elements accumulated + Returns ------- output : tvm.te.Tensor @@ -588,7 +591,6 @@ def conv2d_NCHWc_int8( kw = te.reduce_axis((0, kernel_width), name="kw") if groups == 1: - n_elems = 4 ic_outer = te.reduce_axis((0, in_channel // ic_bn), name="ic_outer") ic_f_inner = te.reduce_axis((0, ic_bn // n_elems), name="ic_f_inner") ic_s_inner = te.reduce_axis((0, n_elems), name="ic_s_inner") @@ -611,7 +613,6 @@ def conv2d_NCHWc_int8( tag="conv2d_NCHWc_int8", ) # for int8 group conv support - n_elems = 4 ic_chunk = in_channel // ic_bn ic_outer = te.reduce_axis((0, ic_chunk // groups), name="ic_outer") ic_f_inner = te.reduce_axis((0, ic_bn // n_elems), name="ic_f_inner") From 32885311697a1922327d0950b2f80ea3aaa5524d Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 8 Nov 2020 05:45:05 -0800 Subject: [PATCH 152/258] [AutoScheduler] Fix the occasional crash caused by split memo (#6883) --- .../search_policy/sketch_policy_rules.cc | 6 ++- src/auto_scheduler/search_policy/utils.cc | 42 ++----------------- src/auto_scheduler/search_policy/utils.h | 27 ------------ 3 files changed, 7 insertions(+), 68 deletions(-) diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index 692ace103be3..1c69397833df 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -450,6 +450,7 @@ std::vector> RuleSpecialComputeLocationGPU::Apply( PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const { + SplitFactorizationMemo split_memo; int max_innermost_split_factor = GetIntParam(policy->params, SketchParamKey::max_innermost_split_factor); @@ -470,8 +471,9 @@ PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* p ICHECK(ps->extent); int extent = GetIntImm(ps->extent.value()); - const auto& candidate_lens = policy->split_memo.GetFactorizationSchemes( - extent, ps->lengths.size(), max_innermost_split_factor); + const auto& candidate_lens = split_memo.GetFactorizationSchemes(extent, ps->lengths.size(), + max_innermost_split_factor); + ICHECK(!candidate_lens.empty()); const auto& candidate_lengths = candidate_lens[(*rand_gen)() % candidate_lens.size()]; pstate->transform_steps.Set( diff --git a/src/auto_scheduler/search_policy/utils.cc b/src/auto_scheduler/search_policy/utils.cc index 3e2f7aaed44f..d59df6965776 100644 --- a/src/auto_scheduler/search_policy/utils.cc +++ b/src/auto_scheduler/search_policy/utils.cc @@ -413,55 +413,19 @@ void PruneInvalidState(const SearchTask& task, Array* states) { } /********** SplitFactorizationMemo **********/ - -void SplitFactorizationMemo::ReadWriteLock::GetRead() { - std::unique_lock lock(cv_mutex_); - // Wake up and get the mutex lock if there's no writing thread - cv_.wait(lock, [this]() { return !this->is_writing_; }); - read_count_++; -} - -void SplitFactorizationMemo::ReadWriteLock::GetWrite() { - std::unique_lock lock(cv_mutex_); - // Wake up and get the mutex lock if there's no reading or writing threads - cv_.wait(lock, [this]() { return this->read_count_ == 0 && !this->is_writing_; }); - is_writing_ = true; -} - -void SplitFactorizationMemo::ReadWriteLock::UnlockRead() { - 
std::lock_guard lock(cv_mutex_); - read_count_--; - // Notify the other blocked threads if this is the last reading thread - if (read_count_ == 0) { - cv_.notify_one(); - } -} - -void SplitFactorizationMemo::ReadWriteLock::UnlockWrite() { - std::lock_guard lock(cv_mutex_); - is_writing_ = false; - // Notify the other blocked threads - cv_.notify_one(); -} - const Array>& SplitFactorizationMemo::GetFactorizationSchemes( int extent, int n_lengths, int max_innermost_factor) { QueryKey key = std::make_tuple(extent, n_lengths, max_innermost_factor); - const auto& const_memory = memory_; - lock_.GetRead(); - const auto& it = const_memory.find(key); - const auto& memory_end = const_memory.end(); - lock_.UnlockRead(); - if (it != memory_end) { + const auto& it = memory_.find(key); + if (it != memory_.end()) { return it->second; } - lock_.GetWrite(); tmp_stack_ = Array(n_lengths, Integer()); results_ = &memory_[key]; n_lengths_ = n_lengths; + DfsEnumerate(0, extent, max_innermost_factor); - lock_.UnlockWrite(); return *results_; } diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h index f0c4cbca9ca0..ecc46af5a5de 100644 --- a/src/auto_scheduler/search_policy/utils.h +++ b/src/auto_scheduler/search_policy/utils.h @@ -677,33 +677,6 @@ class SplitFactorizationMemo { private: void DfsEnumerate(int now, int remaining_length, int max_innermost_factor); - /*! - * \brief A simple implementation of read-write lock. - * The guarded block can be read by multiple threads at the same time, while other operations will - * be blocked if one thread is writing. - * \note Writing threads will wait until all reading threads have finshed. If there're multiple - * writing threads, the process order of them is not guaranteed. - */ - class ReadWriteLock { - public: - /*! \brief The method to get the read lock. One thread can process read if there's on other - * writing threads. */ - void GetRead(); - /*! \brief The method to get the write lock. One thread can process write if there's on other - * reading or writing threads. */ - void GetWrite(); - /*! \brief The method to release the read lock. */ - void UnlockRead(); - /*! \brief The method to release the write lock. */ - void UnlockWrite(); - - private: - uint32_t read_count_ = 0; - bool is_writing_ = false; - std::mutex cv_mutex_; - std::condition_variable cv_; - } lock_; - std::unordered_map>> memory_; int n_lengths_; From 6dae2b034b7bb8f07956e1fd8cb3520d1a26cdc7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 8 Nov 2020 06:12:45 -0800 Subject: [PATCH 153/258] [DOC] Improve the order of tutorials within a subsection (#6880) --- docs/README.txt | 5 ++ docs/conf.py | 69 +++++++++++++++++++ .../auto_scheduler/tune_conv2d_layer_cuda.py | 4 +- tutorials/auto_scheduler/tune_matmul_x86.py | 2 +- tutorials/autotvm/tune_conv2d_cuda.py | 3 +- tutorials/autotvm/tune_relay_arm.py | 6 +- tutorials/autotvm/tune_relay_cuda.py | 12 +--- tutorials/autotvm/tune_relay_mobile_gpu.py | 6 +- tutorials/autotvm/tune_relay_x86.py | 6 +- tutorials/autotvm/tune_simple_template.py | 2 +- 10 files changed, 86 insertions(+), 29 deletions(-) diff --git a/docs/README.txt b/docs/README.txt index eeec6d972d68..e409107b78a6 100644 --- a/docs/README.txt +++ b/docs/README.txt @@ -51,3 +51,8 @@ You will need a gpu CI environment. ```bash ./tests/scripts/task_python_docs.sh ``` + +Define the Order of Tutorials +----------------------------- +You can define the order of tutorials with `conf.py::subsection_order` and `conf.py::within_subsection_order`. 
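A minimal sketch of the sort-key trick that `WithinSubsectionOrder` in the conf.py hunk below relies on (the filenames here are made up):

```python
# Listed files map to a "\0%010d" key, which sorts before any real filename;
# unlisted files fall back to plain alphabetical order.
order = ["tune_simple_template.py", "tune_conv2d_cuda.py"]

def sort_key(filename):
    return "\0%010d" % order.index(filename) if filename in order else filename

print(sorted(["tune_conv2d_cuda.py", "a_tutorial.py", "tune_simple_template.py"], key=sort_key))
# ['tune_simple_template.py', 'tune_conv2d_cuda.py', 'a_tutorial.py']
```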
+By default, the tutorials within one subsection are sorted by filename.
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index 5bf2d6bbb75e..e3ddae214e10 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -204,6 +204,74 @@
     ]
 )
 
+# Explicitly define the order within a subsection.
+# The listed files are sorted according to the list.
+# The unlisted files are sorted by filename.
+# The unlisted files always appear after listed files.
+within_subsection_order = {
+    "get_started": [
+        "relay_quick_start.py",
+        "tensor_expr_get_started.py",
+        "tvmc_command_line_driver.py",
+        "cross_compilation_and_rpc.py",
+    ],
+    "frontend": [
+        "from_pytorch.py",
+        "from_tensorflow.py",
+        "from_mxnet.py",
+        "from_onnx.py",
+        "from_keras.py",
+        "from_tflite.py",
+        "from_coreml.py",
+        "from_darknet.py",
+        "from_caffe2.py",
+    ],
+    "language": [
+        "schedule_primitives.py",
+        "reduction.py",
+        "intrin_math.py",
+        "scan.py",
+        "extern_op.py",
+        "tensorize.py",
+        "tuple_inputs.py",
+        "tedd.py",
+    ],
+    "optimize": [
+        "opt_gemm.py",
+        "opt_conv_cuda.py",
+        "opt_conv_tensorcore.py",
+        "opt_matmul_auto_tensorcore.py",
+    ],
+    "autotvm": [
+        "tune_simple_template.py",
+        "tune_conv2d_cuda.py",
+        "tune_relay_cuda.py",
+        "tune_relay_x86.py",
+        "tune_relay_arm.py",
+        "tune_relay_mobile_gpu.py",
+    ],
+    "auto_scheduler": ["tune_matmul_x86.py", "tune_conv2d_layer_cuda.py"],
+}
+
+
+class WithinSubsectionOrder:
+    def __init__(self, src_dir):
+        self.src_dir = src_dir.split("/")[-1]
+
+    def __call__(self, filename):
+        # If the order is provided, use the provided order
+        if (
+            self.src_dir in within_subsection_order
+            and filename in within_subsection_order[self.src_dir]
+        ):
+            index = within_subsection_order[self.src_dir].index(filename)
+            assert index < 1e10
+            return "\0%010d" % index
+
+        # Otherwise, sort by filename
+        return filename
+
+
 sphinx_gallery_conf = {
     "backreferences_dir": "gen_modules/backreferences",
     "doc_module": ("tvm", "numpy"),
@@ -213,6 +281,7 @@
         "numpy": "https://numpy.org/doc/stable",
     },
     "examples_dirs": examples_dirs,
+    "within_subsection_order": WithinSubsectionOrder,
     "gallery_dirs": gallery_dirs,
     "subsection_order": subsection_order,
     "filename_pattern": os.environ.get("TVM_TUTORIAL_EXEC_PATTERN", ".py"),
diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
index 42273bf72891..d1b3c22d2084 100644
--- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
+++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py
@@ -22,8 +22,7 @@
 **Author**: `Lianmin Zheng `_, \
             `Chengfan Jia `_
 
-
-Different from the existing :ref:`autotvm ` which relies on
+Different from the template-based :ref:`autotvm ` which relies on
 manual templates to define the search space, the auto-scheduler does not require any templates.
 Users only need to write the computation declaration without any schedule commands or templates.
 The auto-scheduler can automatically generate a large search space and
@@ -182,7 +181,6 @@
 # and resume the status of search policy and cost model with the log file.
 # In the example below we resume the status and do 5 more trials.
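# (XGBModel.update_from_file below re-trains the cost model from the logged
# measurements, so the resumed search continues from the learned state rather
# than starting from scratch.)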
- cost_model = auto_scheduler.XGBModel() cost_model.update_from_file(log_file) search_policy = auto_scheduler.SketchPolicy( diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 0f2ebe0e09a4..2bd47ded11c8 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -20,7 +20,7 @@ **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_ -Different from the existing :ref:`autotvm ` which relies on +Different from the template-based :ref:`autotvm ` which relies on manual templates to define the search space, the auto-scheduler does not require any templates. Users only need to write the computation declaration without any schedule commands or templates. The auto-scheduler can automatically generate a large search space and diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index b307077905d3..b662bafd73e6 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -53,8 +53,7 @@ import numpy as np import tvm -from tvm import te -from tvm import topi +from tvm import te, topi, testing from tvm.topi.testing import conv2d_nchw_python from tvm import autotvm diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 7514ee708292..c69c7d9eaf8a 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -66,9 +66,7 @@ import numpy as np import tvm -from tvm import te -from tvm import autotvm -from tvm import relay +from tvm import relay, autotvm import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.contrib.utils import tempdir @@ -104,7 +102,7 @@ def get_network(name, batch_size): batch_size=batch_size, version="1.1", dtype=dtype ) elif name == "inception_v3": - input_shape = (1, 3, 299, 299) + input_shape = (batch_size, 3, 299, 299) mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) elif name == "mxnet": # an example for mxnet model diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index f9b89211a066..3dccefef4de9 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -64,12 +64,9 @@ import numpy as np import tvm -from tvm import te -from tvm import autotvm -from tvm import relay +from tvm import relay, autotvm import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib.utils import tempdir import tvm.contrib.graph_runtime as runtime ################################################################# @@ -102,7 +99,7 @@ def get_network(name, batch_size): batch_size=batch_size, version="1.1", dtype=dtype ) elif name == "inception_v3": - input_shape = (1, 3, 299, 299) + input_shape = (batch_size, 3, 299, 299) mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) elif name == "mxnet": # an example for mxnet model @@ -239,11 +236,6 @@ def tune_and_evaluate(tuning_opt): with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, target=target, params=params) - # export library - tmp = tempdir() - filename = "net.tar" - lib.export_library(tmp.relpath(filename)) - # load parameters ctx = tvm.context(str(target), 0) module = runtime.GraphModule(lib["default"](ctx)) diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 
b7fbf89e59aa..3611696996b9 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -65,9 +65,7 @@ import numpy as np import tvm -from tvm import te -from tvm import autotvm -from tvm import relay +from tvm import relay, autotvm import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.contrib.utils import tempdir @@ -103,7 +101,7 @@ def get_network(name, batch_size): batch_size=batch_size, version="1.1", dtype=dtype ) elif name == "inception_v3": - input_shape = (1, 3, 299, 299) + input_shape = (batch_size, 3, 299, 299) mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) elif name == "mxnet": # an example for mxnet model diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index b1b7ca29e46a..5b3d0320f580 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -32,9 +32,7 @@ import numpy as np import tvm -from tvm import te -from tvm import autotvm -from tvm import relay +from tvm import relay, autotvm from tvm.relay import testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner @@ -73,7 +71,7 @@ def get_network(name, batch_size): batch_size=batch_size, version="1.1", dtype=dtype ) elif name == "inception_v3": - input_shape = (1, 3, 299, 299) + input_shape = (batch_size, 3, 299, 299) mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) elif name == "mxnet": # an example for mxnet model diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index b5167b3c72ab..4c5c7dae63f8 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -59,7 +59,7 @@ import numpy as np import tvm -from tvm import te +from tvm import te, testing # the module is called `autotvm` from tvm import autotvm From b846e5008a11b248d877ed3eeb74973f707eb337 Mon Sep 17 00:00:00 2001 From: Honghua Cao <49267856+Beya2019@users.noreply.github.com> Date: Sun, 8 Nov 2020 23:50:42 +0800 Subject: [PATCH 154/258] [RELAY][OP] roi_pool operator alter layout (#6516) Co-authored-by: honghua.cao --- python/tvm/relay/op/vision/_rcnn.py | 44 ++++++++++++++- src/relay/op/vision/rcnn_op.cc | 30 +++++++++-- .../relay/test_pass_convert_op_layout.py | 54 +++++++++++++++++++ 3 files changed, 123 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/vision/_rcnn.py b/python/tvm/relay/op/vision/_rcnn.py index 46eb3cbc2e53..4686974059b4 100644 --- a/python/tvm/relay/op/vision/_rcnn.py +++ b/python/tvm/relay/op/vision/_rcnn.py @@ -69,11 +69,53 @@ def convert_roi_align(attrs, inputs, tinfos, desired_layouts): raise ValueError("Layout %s is not yet supported." % desired_data_layout) +@reg.register_convert_op_layout("vision.roi_pool") +def convert_roi_pool(attrs, inputs, tinfos, desired_layouts): + """Convert Layout pass registration for roi_pool op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current roi_pool + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + tinfos : list of types + List of input and output types + desired_layouts : list of layout strings + List of layouts defining our desired + layout for the data and rois inputs respectively. 
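+        Only the data layout is converted here; rois is a 2-D tensor, so its
+        entry must be left as "default".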
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The transformed expr
+    """
+    # pylint: disable=import-outside-toplevel
+    from tvm import relay
+
+    data, rois = inputs
+    new_attrs = dict(attrs)
+    assert (
+        len(desired_layouts) == 2
+    ), "A desired layout is expected for both of vision.roi_pool's inputs"
+
+    desired_data_layout, desired_rois_layout = map(str, desired_layouts)
+    assert desired_data_layout != "default", "Data layout cannot be default"
+    assert desired_rois_layout == "default", "Rois layout must be default"
+
+    new_attrs["layout"] = desired_data_layout
+    # the rois layout does not change
+    if desired_data_layout in ["NCHW", "NHWC"]:
+        return relay.vision.roi_pool(data, rois, **new_attrs)
+
+    raise ValueError("Layout %s is not yet supported." % desired_data_layout)
+
+
 # roi_pool
 @reg.register_compute("vision.roi_pool")
 def compute_roi_pool(attrs, inputs, _):
     """Compute definition of roi_pool"""
-    assert attrs.layout == "NCHW"
+    assert attrs.layout == "NCHW", "only supports NCHW for now"
     return [
         topi.vision.rcnn.roi_pool_nchw(
             inputs[0],
diff --git a/src/relay/op/vision/rcnn_op.cc b/src/relay/op/vision/rcnn_op.cc
index 8be38d020480..f7bbf378d09c 100644
--- a/src/relay/op/vision/rcnn_op.cc
+++ b/src/relay/op/vision/rcnn_op.cc
@@ -119,14 +119,35 @@ bool ROIPoolRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   ICHECK(roi_pool_attrs);
   ICHECK_EQ(dshape.size(), 4) << "Input data should be 4-D.";
   ICHECK_EQ(rshape.size(), 2) << "Input rois should be 2-D.";
-  ICHECK_EQ(roi_pool_attrs->layout, "NCHW") << "ROI Pool only supports NCHW layout";
 
   // assign output type
-  std::vector<IndexExpr> oshape(
-      {rshape[0], dshape[1], roi_pool_attrs->pooled_size[0], roi_pool_attrs->pooled_size[1]});
+  std::vector<IndexExpr> oshape;
+  if (roi_pool_attrs->layout == "NCHW") {
+    oshape = {rshape[0], dshape[1], roi_pool_attrs->pooled_size[0], roi_pool_attrs->pooled_size[1]};
+  } else if (roi_pool_attrs->layout == "NHWC") {
+    oshape = {rshape[0], roi_pool_attrs->pooled_size[0], roi_pool_attrs->pooled_size[1], dshape[3]};
+  } else {
+    LOG(FATAL) << "vision.roi_pool does not support " << roi_pool_attrs->layout << " layout";
+  }
+
   reporter->Assign(types[2], TensorType(oshape, data->dtype));
   return true;
 }
 
+template <typename T>
+Array<Array<Layout> > ROIPoolInferCorrectLayout(const Attrs& attrs,
+                                                const Array<Layout>& new_in_layouts,
+                                                const Array<Layout>& old_in_layouts,
+                                                const Array<tvm::relay::Type>& old_in_types) {
+  // NOTE: Discard "const" qualifier here.
+  T* params = const_cast<T*>(attrs.as<T>());
+  Layout data_layout = params->layout;
+
+  // Layout inference needs to define the layout for all inputs and output data layouts.
+  // For roi_pool, the second input is a 2-D tensor with shape [num_roi, 5].
+  // So, we set the layout as "N5".
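+  // The pair below returns one layout per input ({data_layout, "N5"}) and a
+  // single output layout ({data_layout}), so a layout_transform is only ever
+  // inserted for the data tensor.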
+ return Array >{{data_layout, Layout("N5")}, {data_layout}}; +} + Expr MakeROIPool(Expr data, Expr rois, Array pooled_size, double spatial_scale, String layout) { auto attrs = make_object(); @@ -153,7 +174,8 @@ RELAY_REGISTER_OP("vision.roi_pool") .add_argument("data", "Tensor", "The input tensor.") .add_argument("rois", "Tensor", "The input rois") .set_support_level(5) - .add_type_rel("ROIPool", ROIPoolRel); + .add_type_rel("ROIPool", ROIPoolRel) + .set_attr("FInferCorrectLayout", ROIPoolInferCorrectLayout); TVM_REGISTER_NODE_TYPE(ProposalAttrs); diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py index 1fc5d39b9486..7fc896a72905 100644 --- a/tests/python/relay/test_pass_convert_op_layout.py +++ b/tests/python/relay/test_pass_convert_op_layout.py @@ -966,6 +966,59 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) +def test_conv_roi_pool_convert_layout(): + def before(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight1 = relay.var("weight1", shape=(64, 64, 3, 3)) + y = relay.nn.conv2d( + x, + weight1, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + ) + rois = relay.var("rois", shape=(32, 5)) + y = relay.vision.roi_pool( + y, rois, pooled_size=(14, 14), spatial_scale=0.0625, layout="NCHW" + ) + y = relay.Function(analysis.free_vars(y), y) + return y + + def expected(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight1 = relay.var("weight1", shape=(64, 64, 3, 3)) + x = relay.layout_transform(x, "NCHW", "NHWC") + weight1 = relay.layout_transform(weight1, "OIHW", "HWIO") + y = relay.nn.conv2d( + x, + weight1, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + rois = relay.var("rois", shape=(32, 5)) + y = relay.vision.roi_pool( + y, rois, pooled_size=(14, 14), spatial_scale=0.0625, layout="NHWC" + ) + ret = relay.layout_transform(y, "NHWC", "NCHW") + y = relay.Function(analysis.free_vars(ret), ret) + return y + + a = before() + desired_layouts = { + "nn.conv2d": ["NHWC", "HWIO"], + "vision.roi_pool": ["NHWC", "default"], + } + a = run_opt_pass(a, transform.ConvertLayout(desired_layouts)) + b = run_opt_pass(expected(), transform.InferType()) + + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + + def test_default_keyword(): """ Check that the default keyword selects correct TVM default layout. """ @@ -1253,6 +1306,7 @@ def expected(): test_conv_convert_kernel_layout() test_conv_transpose_convert_layout() test_conv_roi_align_convert_layout() + test_conv_roi_pool_convert_layout() test_conv_strided_slice_convert_layout() test_default_keyword() test_different_ops_convert_layout() From 7a1332154d684c0ea8c0571baa6aafcaf672e64f Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 8 Nov 2020 08:55:14 -0800 Subject: [PATCH 155/258] Do not show meta-data when printing IRModule (#6881) --- python/tvm/ir/module.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/tvm/ir/module.py b/python/tvm/ir/module.py index 352f8aaf04b6..f8b6ff295339 100644 --- a/python/tvm/ir/module.py +++ b/python/tvm/ir/module.py @@ -251,8 +251,7 @@ def import_from_std(self, file_to_import): return tvm.relay.transform.InferType()(self) def __str__(self): - # TODO(jroesch): why does this hang sometimes? 
- return self.astext() + return _ffi_api.PrettyPrint(self) def __repr__(self): return self.astext() From 0135dc1b4d986e696debe280af7f7568e1f8a9ab Mon Sep 17 00:00:00 2001 From: alter-xp Date: Mon, 9 Nov 2020 23:15:42 +0800 Subject: [PATCH 156/258] TF frontend: add rint op (#6818) * TF frontend: add rint op * Added negative numbers to the test --- python/tvm/relay/frontend/tensorflow.py | 1 + .../python/frontend/tensorflow/test_forward.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index c6079b4535c4..642e680f47f7 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -2439,6 +2439,7 @@ def _impl(inputs, attr, params, mod): "ResizeNearestNeighbor": _resize("nearest_neighbor"), "ReverseV2": _reverse_v2(), "RightShift": AttrCvt("right_shift"), + "Rint": AttrCvt("round"), "Round": AttrCvt("round"), "Rsqrt": _rsqrt(), "Select": _where(), diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 5f849ac9ac93..93bfd0cbaf83 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -3602,6 +3602,23 @@ def _test_forward_softsign(shape): _test_forward_softsign([2, 5, 2, 5]) +def test_forward_rint(): + """test operator rint """ + + def _test_forward_rint(shape): + tf.disable_eager_execution() + np_data = np.random.uniform(-100, 100, size=shape).astype(np.float32) + tf.reset_default_graph() + in_data = tf.placeholder(tf.float32, shape, name="in_data") + tf.math.rint(in_data, name="rint") + compare_tf_with_tvm([np_data], ["in_data:0"], "rint:0") + + _test_forward_rint([100]) + _test_forward_rint([1, 100]) + _test_forward_rint([1, 10, 10]) + _test_forward_rint([2, 5, 2, 5]) + + def test_forward_negative(): """test tf operator Neg """ np_data = np.random.uniform(-100, 255, size=(224, 224, 3)).astype(np.float32) From e3856764b6c4e1f35cdf85815d92f214a803d876 Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Tue, 10 Nov 2020 00:41:38 +0800 Subject: [PATCH 157/258] [Relay][TF] Keep node name in span (#6885) --- python/tvm/relay/expr.py | 14 ++++++++++---- python/tvm/relay/expr_functor.py | 4 ++-- python/tvm/relay/frontend/tensorflow.py | 21 +++++++++++++++++++-- src/printer/relay_text_printer.cc | 14 +++++++++++++- src/printer/text_printer.h | 1 + src/relay/ir/expr.cc | 8 ++++---- src/relay/transforms/fuse_ops.cc | 2 +- tests/python/relay/test_ir_text_printer.py | 18 ++++++++++++++++++ 8 files changed, 68 insertions(+), 14 deletions(-) diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 6d304648fa1c..7b6e4b4ccf80 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -185,10 +185,13 @@ class Tuple(ExprWithOp): ---------- fields : List[tvm.relay.Expr] The fields in the tuple. + + span: Optional[tvm.relay.Span] + Span that points to original source code """ - def __init__(self, fields): - self.__init_handle_by_constructor__(_ffi_api.Tuple, fields) + def __init__(self, fields, span=None): + self.__init_handle_by_constructor__(_ffi_api.Tuple, fields, span) def __getitem__(self, index): if index >= len(self): @@ -251,12 +254,15 @@ class Call(ExprWithOp): type_args: Optional[List[tvm.relay.Type]] The additional type arguments, this is only used in advanced usecase of template functions. 
+ + span: Optional[tvm.relay.Span] + Span that points to original source code """ - def __init__(self, op, args, attrs=None, type_args=None): + def __init__(self, op, args, attrs=None, type_args=None, span=None): if not type_args: type_args = [] - self.__init_handle_by_constructor__(_ffi_api.Call, op, args, attrs, type_args) + self.__init_handle_by_constructor__(_ffi_api.Call, op, args, attrs, type_args, span) @tvm._ffi.register_object("relay.Let") diff --git a/python/tvm/relay/expr_functor.py b/python/tvm/relay/expr_functor.py index 0a37e4d4393c..40a116ab0b43 100644 --- a/python/tvm/relay/expr_functor.py +++ b/python/tvm/relay/expr_functor.py @@ -213,7 +213,7 @@ def visit_let(self, let): def visit_call(self, call): new_fn = self.visit(call.op) new_args = [self.visit(arg) for arg in call.args] - return Call(new_fn, new_args, call.attrs) + return Call(new_fn, new_args, call.attrs, call.type_args, call.span) def visit_var(self, var): return var @@ -225,7 +225,7 @@ def visit_if(self, ite): return If(self.visit(ite.cond), self.visit(ite.true_branch), self.visit(ite.false_branch)) def visit_tuple(self, tup): - return Tuple([self.visit(field) for field in tup.fields]) + return Tuple([self.visit(field) for field in tup.fields], tup.span) def visit_tuple_getitem(self, op): tuple_value = self.visit(op.tuple_value) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 642e680f47f7..4a7a7da307fc 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -3402,7 +3402,7 @@ def _partition_call_operator(self, inputs, attr): return ret def _convert_operator( - self, op_name, inputs, attrs, graph, identity_list=None, convert_map=None + self, op_name, node_name, inputs, attrs, identity_list=None, convert_map=None ): """Convert from Tensorflow operator to relay operator. 
The converter must specify conversions explicitly for incompatible name, and @@ -3441,6 +3441,23 @@ def _convert_operator( sym = self._partition_call_operator(inputs, attrs) else: raise NotImplementedError("Operator {} not implemented.".format(op_name)) + + sym = self._set_span(sym, node_name) + + return sym + + @staticmethod + def _set_span(sym, node_name): + span = tvm.relay.Span(tvm.relay.SourceName(node_name), 0, 0, 0, 0) + if isinstance(sym, _expr.Call): + sym = _expr.Call(sym.op, sym.args, sym.attrs, sym.type_args, span) + elif isinstance(sym, _expr.TupleWrapper): + tuple_value = sym.tuple_value + if isinstance(tuple_value, _expr.Call): + tuple_value = _expr.Call( + tuple_value.op, tuple_value.args, tuple_value.attrs, tuple_value.type_args, span + ) + sym = _expr.TupleWrapper(tuple_value, sym.size) return sym def _licm_construct(self, loop_name, node_name): @@ -3577,7 +3594,7 @@ def _backtrack_construct(self, node_name): actual_input = self._licm_construct(plname, iname) inputs[i] = actual_input - op = self._convert_operator(node.op, inputs, attr, self._graph) + op = self._convert_operator(node.op, node.name, inputs, attr) if isinstance(op, np.ndarray): self._params[node.name] = tvm.nd.array(op) diff --git a/src/printer/relay_text_printer.cc b/src/printer/relay_text_printer.cc index 4132ab14ff29..da4f8cadfb3d 100644 --- a/src/printer/relay_text_printer.cc +++ b/src/printer/relay_text_printer.cc @@ -489,7 +489,11 @@ Doc RelayTextPrinter::VisitExpr_(const CallNode* op) { // don't print as a call if it's a 0-arity cons return doc; } else { - return doc << "(" << Doc::Concat(args) << ")"; + doc << "(" << Doc::Concat(args) << ")"; + if (op->span.defined()) { + doc << " /* " << PrintSpan(op->span) << " */"; + } + return doc; } } @@ -840,6 +844,14 @@ std::vector RelayTextPrinter::PrintFuncAttrs(const Attrs& attrs) { return docs; } +Doc RelayTextPrinter::PrintSpan(const Span& span) { + Doc doc; + const auto* span_node = span.as(); + ICHECK(span_node); + doc << span_node->source_name->name; + return doc; +} + TVM_REGISTER_GLOBAL("ir.TextPrinter").set_body_typed([](ObjectRef node) { auto text = AsText(node, false, nullptr); return text; diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index e519969d6a4b..9a24fe65b4b1 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -74,6 +74,7 @@ class RelayTextPrinter : public ExprFunctor, Doc PrintFinal(const ObjectRef& node); std::vector PrintCallAttrs(const Attrs& attrs, const Expr& op); std::vector PrintFuncAttrs(const Attrs& attrs); + Doc PrintSpan(const Span& span); Doc Print(const ObjectRef& node, bool meta = false, bool try_inline = false); diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index f2e0b363eb2b..89d1f1ab0f11 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -73,8 +73,8 @@ Tuple::Tuple(tvm::Array fields, Span span) { TVM_REGISTER_NODE_TYPE(TupleNode); -TVM_REGISTER_GLOBAL("relay.ir.Tuple").set_body_typed([](tvm::Array fields) { - return Tuple(fields); +TVM_REGISTER_GLOBAL("relay.ir.Tuple").set_body_typed([](tvm::Array fields, Span span) { + return Tuple(fields, span); }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) @@ -121,8 +121,8 @@ Call::Call(Expr op, Array args, Attrs attrs, Array type_args, Span s TVM_REGISTER_NODE_TYPE(CallNode); TVM_REGISTER_GLOBAL("relay.ir.Call") - .set_body_typed([](Expr op, Array args, Attrs attrs, Array type_args) { - return Call(op, args, attrs, type_args); + .set_body_typed([](Expr op, Array args, Attrs attrs, Array type_args, Span span) { + 
return Call(op, args, attrs, type_args, span); }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index 8023305f3f64..29f3bfa0a17e 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -870,7 +870,7 @@ class FuseMutator : private ExprMutator { auto* ret_group = gmap_.at(call)->FindRoot(); Array new_args = GetNewArguments(call->args, ret_group); - auto new_call = Call(call->op, new_args, call->attrs, call->type_args); + auto new_call = Call(call->op, new_args, call->attrs, call->type_args, call->span); if (ret_group->root_ref == call) { // This is the root of the group diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py index 6c2f7166f446..4a3569aca2ec 100644 --- a/tests/python/relay/test_ir_text_printer.py +++ b/tests/python/relay/test_ir_text_printer.py @@ -250,6 +250,24 @@ def test_null_attribute(): assert "TestAttribute=(nullptr)" in txt +def test_span(): + x = relay.var("x", shape=(3, 2)) + y = relay.var("y") + one = relay.const(10e10, dtype="float32") + z = relay.add(x, one) + z = relay.Call( + z.op, z.args, z.attrs, z.type_args, relay.Span(relay.SourceName("Add0"), 0, 0, 0, 0) + ) + z = relay.add(z, z) + z = relay.Call( + z.op, z.args, z.attrs, z.type_args, relay.Span(relay.SourceName("Add1"), 0, 0, 0, 0) + ) + f = relay.Function([x, y], z) + txt = astext(f) + assert "Add0" in txt + assert "Add1" in txt + + if __name__ == "__main__": import sys From 769778131964316a0cf456bc181d094d1e6c1fbd Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 10 Nov 2020 10:56:48 +0900 Subject: [PATCH 158/258] [TVMC] add cl support in tvmc runner (#6831) * [TVMC] add cl support in tvmc runner * Cleanup comment and asssert device type in else case --- python/tvm/driver/tvmc/runner.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py index a4abe8c31f56..dec0e9842a37 100644 --- a/python/tvm/driver/tvmc/runner.py +++ b/python/tvm/driver/tvmc/runner.py @@ -47,10 +47,10 @@ def add_run_parser(subparsers): parser.set_defaults(func=drive_run) # TODO --device needs to be extended and tested to support other targets, - # like 'cl', 'webgpu', etc (@leandron) + # like 'webgpu', etc (@leandron) parser.add_argument( "--device", - choices=["cpu", "gpu"], + choices=["cpu", "gpu", "cl"], default="cpu", help="target device to run the compiled module. 
Defaults to 'cpu'", ) @@ -361,7 +361,13 @@ def run_module( # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron) logger.debug("device is %s", device) - ctx = session.cpu() if device == "cpu" else session.gpu() + if device == "gpu": + ctx = session.gpu() + elif device == "cl": + ctx = session.cl() + else: + assert device == "cpu" + ctx = session.cpu() if profile: logger.debug("creating runtime with profiling enabled") From cb5c12b14aaad47ae1ba3e810efd04340c9194cf Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Wed, 11 Nov 2020 04:52:08 +0800 Subject: [PATCH 159/258] [GCC] Fix GCC8.1 and GCC8.2 template dispatch compilation issue (#6893) * update * [GCC] Fix GCC8.1 and GCC8.2 template dispatch compilation issue --- include/tvm/ir/attrs.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h index afb8ef0730e0..13bfd715cdfb 100644 --- a/include/tvm/ir/attrs.h +++ b/include/tvm/ir/attrs.h @@ -413,6 +413,12 @@ inline void SetIntValue(T* ptr, const TVMArgValue& val) { } } +// Workaround for GCC8.1 / GCC8.2 +template <> +inline void SetValue(DataType* ptr, const TVMArgValue& val) { + *ptr = val.operator DataType(); +} + template <> inline void SetValue(std::string* ptr, const TVMArgValue& val) { if (String::CanConvertFrom(val)) { From 59044432d1d0abdf8c99ecdc64e3de239b9abca0 Mon Sep 17 00:00:00 2001 From: Light-of-Hers <39763024+Light-of-Hers@users.noreply.github.com> Date: Wed, 11 Nov 2020 04:52:43 +0800 Subject: [PATCH 160/258] Fix bug of generate-unmatched-brackets in CodeGenC::PrintSSAAssign (#6887) --- src/target/source/codegen_c.cc | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index ca9b80564cd9..417b7a2db508 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -139,10 +139,12 @@ void CodeGenC::PrintExpr(const PrimExpr& n, std::ostream& os) { // NOLINT(*) } } +static bool CheckOutermostBracketMatch(const std::string& s); + void CodeGenC::PrintSSAAssign(const std::string& target, const std::string& src, DataType t) { PrintType(t, stream); stream << ' ' << target << " = "; - if (src.length() > 3 && src[0] == '(' && src[src.length() - 1] == ')') { + if (CheckOutermostBracketMatch(src)) { stream << src.substr(1, src.length() - 2); } else { stream << src; @@ -973,5 +975,23 @@ void CodeGenC::PrintVecElemLoadExpr(DataType t, int i, const std::string& value, return; } +static bool CheckOutermostBracketMatch(const std::string& s) { + if (!s.empty() && s.front() == '(' && s.back() == ')') { + size_t len = s.size(); + int n_unmatched = 0; + for (size_t i = 0; i < len; ++i) { + if (s[i] == '(') { + n_unmatched++; + } else if (s[i] == ')') { + n_unmatched--; + } + if (n_unmatched == 0) { + return i == len - 1; + } + } + } + return false; +} + } // namespace codegen } // namespace tvm From 0266721342492fca60b4ff34a32b71056dcaaf7f Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Tue, 10 Nov 2020 16:09:53 -0800 Subject: [PATCH 161/258] [TIR] Add spans to all ExprNodes (#6860) --- include/tvm/ir/expr.h | 21 +-- include/tvm/tir/buffer.h | 10 +- include/tvm/tir/expr.h | 74 +++++---- include/tvm/tir/function.h | 3 +- include/tvm/tir/stmt.h | 57 ++++--- include/tvm/tir/var.h | 19 ++- python/tvm/tir/expr.py | 268 +++++++++++++++++++++++--------- python/tvm/tir/function.py | 14 +- python/tvm/tir/stmt.py | 107 +++++++++---- src/ir/expr.cc | 20 ++- src/target/llvm/codegen_llvm.cc | 2 +- 
src/tir/ir/buffer.cc | 9 +- src/tir/ir/expr.cc | 225 +++++++++++++++++---------- src/tir/ir/function.cc | 7 +- src/tir/ir/stmt.cc | 119 ++++++++------ src/tir/op/op.cc | 4 + 16 files changed, 635 insertions(+), 324 deletions(-) diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h index ffb225c512cd..1c470fae51ee 100644 --- a/include/tvm/ir/expr.h +++ b/include/tvm/ir/expr.h @@ -45,6 +45,12 @@ using tvm::runtime::String; */ class BaseExprNode : public Object { public: + /*! + * \brief Span that points to the original source code. + * Reserved debug information. + */ + mutable Span span; + static constexpr const char* _type_key = "BaseExpr"; static constexpr const bool _type_has_method_sequal_reduce = true; static constexpr const bool _type_has_method_shash_reduce = true; @@ -135,11 +141,6 @@ class PrimExpr : public BaseExpr { */ class RelayExprNode : public BaseExprNode { public: - /*! - * \brief Span that points to the original source code. - * Reserved debug information. - */ - mutable Span span; /*! * \brief Stores the result of type inference(type checking). * @@ -263,8 +264,9 @@ class IntImm : public PrimExpr { * \brief Constructor. * \param dtype The data type of the value. * \param value The internal value. + * \param span The location of this object in the source code. */ - TVM_DLL IntImm(DataType dtype, int64_t value); + TVM_DLL IntImm(DataType dtype, int64_t value, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(IntImm, PrimExpr, IntImmNode); }; @@ -307,8 +309,9 @@ class FloatImm : public PrimExpr { * \brief Constructor. * \param dtype The data type of the value. * \param value The internal value. + * \param span The location in the source code. */ - TVM_DLL FloatImm(DataType dtype, double value); + TVM_DLL FloatImm(DataType dtype, double value, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(FloatImm, PrimExpr, FloatImmNode); }; @@ -321,7 +324,7 @@ class FloatImm : public PrimExpr { */ class Bool : public IntImm { public: - explicit Bool(bool value) : IntImm(DataType::Bool(), value) {} + explicit Bool(bool value, Span span = Span()) : IntImm(DataType::Bool(), value, span) {} Bool operator!() const { return Bool((*this)->value == 0); } operator bool() const { return (*this)->value != 0; } @@ -358,7 +361,7 @@ class Integer : public IntImm { /*! * \brief Construct integer from int value. */ - Integer(int value) : IntImm(DataType::Int(32), value) {} // NOLINT(*) + Integer(int value, Span span = Span()) : IntImm(DataType::Int(32), value, span) {} // NOLINT(*) /*! * \brief Construct integer from int imm. * \param other The other value. diff --git a/include/tvm/tir/buffer.h b/include/tvm/tir/buffer.h index e150ff38041b..69741bbdca62 100644 --- a/include/tvm/tir/buffer.h +++ b/include/tvm/tir/buffer.h @@ -77,6 +77,11 @@ class BufferNode : public Object { int offset_factor; /*! \brief buffer type */ BufferType buffer_type; + /*! + * \brief Span that points to the original source code. + * Reserved debug information. + */ + mutable Span span; /*! \brief constructor */ BufferNode() {} @@ -135,7 +140,7 @@ class Buffer : public ObjectRef { // A default value will be picked. TVM_DLL Buffer(Var ptr, DataType dtype, Array shape, Array strides, PrimExpr elem_offset, String name, String scope, int data_alignment, - int offset_factor, BufferType buffer_type); + int offset_factor, BufferType buffer_type, Span span = Span()); /*! 
* \brief Return a new buffer that is equivalent with current one @@ -183,11 +188,12 @@ class Buffer : public ObjectRef { * \param shape The shape of the buffer, * \param dtype The content data type. * \param name The name of the buffer + * \param span The location of this object in the source code. * \return The created buffer. * \sa Buffer for complete constructor. */ TVM_DLL Buffer decl_buffer(Array shape, DataType dtype = DataType::Float(32), - String name = "buffer"); + String name = "buffer", Span span = Span()); /*! * \brief Base node for data producers. diff --git a/include/tvm/tir/expr.h b/include/tvm/tir/expr.h index eee0deecdc70..f2ae58554ab1 100644 --- a/include/tvm/tir/expr.h +++ b/include/tvm/tir/expr.h @@ -74,7 +74,7 @@ class StringImmNode : public PrimExprNode { */ class StringImm : public PrimExpr { public: - TVM_DLL StringImm(String value); + TVM_DLL StringImm(String value, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(StringImm, PrimExpr, StringImmNode); }; @@ -111,7 +111,7 @@ class CastNode : public PrimExprNode { */ class Cast : public PrimExpr { public: - TVM_DLL Cast(DataType dtype, PrimExpr value); + TVM_DLL Cast(DataType dtype, PrimExpr value, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Cast, PrimExpr, CastNode); }; @@ -158,7 +158,7 @@ class AddNode : public BinaryOpNode { */ class Add : public PrimExpr { public: - TVM_DLL Add(PrimExpr a, PrimExpr b); + TVM_DLL Add(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Add, PrimExpr, AddNode); }; @@ -174,7 +174,7 @@ class SubNode : public BinaryOpNode { */ class Sub : public PrimExpr { public: - TVM_DLL Sub(PrimExpr a, PrimExpr b); + TVM_DLL Sub(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Sub, PrimExpr, SubNode); }; @@ -190,7 +190,7 @@ class MulNode : public BinaryOpNode { */ class Mul : public PrimExpr { public: - TVM_DLL Mul(PrimExpr a, PrimExpr b); + TVM_DLL Mul(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Mul, PrimExpr, MulNode); }; @@ -209,7 +209,7 @@ class DivNode : public BinaryOpNode { */ class Div : public PrimExpr { public: - TVM_DLL Div(PrimExpr a, PrimExpr b); + TVM_DLL Div(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Div, PrimExpr, DivNode); }; @@ -228,7 +228,7 @@ class ModNode : public BinaryOpNode { */ class Mod : public PrimExpr { public: - TVM_DLL Mod(PrimExpr a, PrimExpr b); + TVM_DLL Mod(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Mod, PrimExpr, ModNode); }; @@ -244,7 +244,7 @@ class FloorDivNode : public BinaryOpNode { */ class FloorDiv : public PrimExpr { public: - TVM_DLL FloorDiv(PrimExpr a, PrimExpr b); + TVM_DLL FloorDiv(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(FloorDiv, PrimExpr, FloorDivNode); }; @@ -260,7 +260,7 @@ class FloorModNode : public BinaryOpNode { */ class FloorMod : public PrimExpr { public: - TVM_DLL FloorMod(PrimExpr a, PrimExpr b); + TVM_DLL FloorMod(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(FloorMod, PrimExpr, FloorModNode); }; @@ -276,7 +276,7 @@ class MinNode : public BinaryOpNode { */ class Min : public PrimExpr { public: - TVM_DLL Min(PrimExpr a, PrimExpr b); + TVM_DLL Min(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Min, PrimExpr, MinNode); }; @@ -292,7 +292,7 @@ class MaxNode : public BinaryOpNode { */ class Max : public PrimExpr { public: - TVM_DLL Max(PrimExpr a, PrimExpr b); + TVM_DLL Max(PrimExpr a, PrimExpr b, Span 
span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Max, PrimExpr, MaxNode); }; @@ -339,7 +339,7 @@ class EQNode : public CmpOpNode { */ class EQ : public PrimExpr { public: - TVM_DLL EQ(PrimExpr a, PrimExpr b); + TVM_DLL EQ(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(EQ, PrimExpr, EQNode); }; @@ -355,7 +355,7 @@ class NENode : public CmpOpNode { */ class NE : public PrimExpr { public: - TVM_DLL NE(PrimExpr a, PrimExpr b); + TVM_DLL NE(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(NE, PrimExpr, NENode); }; @@ -371,7 +371,7 @@ class LTNode : public CmpOpNode { */ class LT : public PrimExpr { public: - TVM_DLL LT(PrimExpr a, PrimExpr b); + TVM_DLL LT(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(LT, PrimExpr, LTNode); }; @@ -387,7 +387,7 @@ struct LENode : public CmpOpNode { */ class LE : public PrimExpr { public: - TVM_DLL LE(PrimExpr a, PrimExpr b); + TVM_DLL LE(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(LE, PrimExpr, LENode); }; @@ -403,7 +403,7 @@ class GTNode : public CmpOpNode { */ class GT : public PrimExpr { public: - TVM_DLL GT(PrimExpr a, PrimExpr b); + TVM_DLL GT(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(GT, PrimExpr, GTNode); }; @@ -419,7 +419,7 @@ class GENode : public CmpOpNode { */ class GE : public PrimExpr { public: - TVM_DLL GE(PrimExpr a, PrimExpr b); + TVM_DLL GE(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(GE, PrimExpr, GENode); }; @@ -457,7 +457,7 @@ class AndNode : public PrimExprNode { */ class And : public PrimExpr { public: - TVM_DLL And(PrimExpr a, PrimExpr b); + TVM_DLL And(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(And, PrimExpr, AndNode); }; @@ -495,7 +495,7 @@ class OrNode : public PrimExprNode { */ class Or : public PrimExpr { public: - TVM_DLL Or(PrimExpr a, PrimExpr b); + TVM_DLL Or(PrimExpr a, PrimExpr b, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Or, PrimExpr, OrNode); }; @@ -529,7 +529,7 @@ class NotNode : public PrimExprNode { */ class Not : public PrimExpr { public: - TVM_DLL Not(PrimExpr a); + TVM_DLL Not(PrimExpr a, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Not, PrimExpr, NotNode); }; @@ -578,7 +578,7 @@ class SelectNode : public PrimExprNode { */ class Select : public PrimExpr { public: - TVM_DLL Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value); + TVM_DLL Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Select, PrimExpr, SelectNode); }; @@ -627,7 +627,7 @@ class BufferLoadNode : public PrimExprNode { */ class BufferLoad : public PrimExpr { public: - TVM_DLL explicit BufferLoad(Buffer buffer, Array indices); + TVM_DLL explicit BufferLoad(Buffer buffer, Array indices, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(BufferLoad, PrimExpr, BufferLoadNode); }; @@ -674,7 +674,7 @@ class ProducerLoadNode : public PrimExprNode { */ class ProducerLoad : public PrimExpr { public: - TVM_DLL explicit ProducerLoad(DataProducer producer, Array indices); + TVM_DLL explicit ProducerLoad(DataProducer producer, Array indices, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(ProducerLoad, PrimExpr, ProducerLoadNode); }; @@ -732,7 +732,8 @@ class LoadNode : public PrimExprNode { */ class Load : public PrimExpr { public: - TVM_DLL Load(DataType dtype, Var buffer_var, PrimExpr index, PrimExpr predicate); + TVM_DLL Load(DataType dtype, Var buffer_var, 
PrimExpr index, PrimExpr predicate, + Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Load, PrimExpr, LoadNode); }; @@ -783,7 +784,7 @@ class RampNode : public PrimExprNode { */ class Ramp : public PrimExpr { public: - TVM_DLL Ramp(PrimExpr base, PrimExpr stride, int lanes); + TVM_DLL Ramp(PrimExpr base, PrimExpr stride, int lanes, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Ramp, PrimExpr, RampNode); }; @@ -821,7 +822,7 @@ class BroadcastNode : public PrimExprNode { */ class Broadcast : public PrimExpr { public: - TVM_DLL Broadcast(PrimExpr value, int lanes); + TVM_DLL Broadcast(PrimExpr value, int lanes, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Broadcast, PrimExpr, BroadcastNode); }; @@ -866,7 +867,7 @@ class LetNode : public PrimExprNode { */ class Let : public PrimExpr { public: - TVM_DLL Let(Var var, PrimExpr value, PrimExpr body); + TVM_DLL Let(Var var, PrimExpr value, PrimExpr body, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Let, PrimExpr, LetNode); }; @@ -911,7 +912,7 @@ class CallNode : public PrimExprNode { */ class Call : public PrimExpr { public: - TVM_DLL Call(DataType dtype, RelayExpr op, Array args); + TVM_DLL Call(DataType dtype, RelayExpr op, Array args, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Call, PrimExpr, CallNode); }; @@ -953,9 +954,9 @@ class ShuffleNode : public PrimExprNode { */ class Shuffle : public PrimExpr { public: - TVM_DLL Shuffle(Array vectors, Array indices); - TVM_DLL static PrimExpr Concat(Array vectors); - TVM_DLL static PrimExpr ExtractElement(PrimExpr vector, int index); + TVM_DLL Shuffle(Array vectors, Array indices, Span span = Span()); + TVM_DLL static PrimExpr Concat(Array vectors, Span span = Span()); + TVM_DLL static PrimExpr ExtractElement(PrimExpr vector, int index, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Shuffle, PrimExpr, ShuffleNode); }; @@ -981,6 +982,11 @@ class CommReducerNode : public Object { Array identity_element; /*! \brief Function call operator to combine a and b */ Array operator()(Array a, Array b) const; + /*! + * \brief Span that points to the original source code. + * Reserved debug information. + */ + mutable Span span; void VisitAttrs(AttrVisitor* v) { v->Visit("lhs", &lhs); @@ -1014,7 +1020,7 @@ class CommReducerNode : public Object { class CommReducer : public ObjectRef { public: TVM_DLL CommReducer(Array lhs, Array rhs, Array result, - Array identity_element); + Array identity_element, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(CommReducer, ObjectRef, CommReducerNode); }; @@ -1077,7 +1083,7 @@ class ReduceNode : public PrimExprNode { class Reduce : public PrimExpr { public: TVM_DLL Reduce(CommReducer combiner, Array src, Array rdom, PrimExpr condition, - int value_index, Array init); + int value_index, Array init, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Reduce, PrimExpr, ReduceNode); }; @@ -1106,7 +1112,7 @@ class AnyNode : public PrimExprNode { */ class Any : public PrimExpr { public: - TVM_DLL Any(); + TVM_DLL Any(Span span = Span()); TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Any, PrimExpr, AnyNode); }; diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index caddd99eeb2c..64dbb5cf8ec3 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -140,10 +140,11 @@ class PrimFunc : public BaseFunc { * \param ret_type The return type of the function. * \param buffer_map The buffer map for parameter buffer unpacking. * \param attrs Additional function attributes. 
diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index caddd99eeb2c..64dbb5cf8ec3 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -140,10 +140,11 @@ class PrimFunc : public BaseFunc { * \param ret_type The return type of the function. * \param buffer_map The buffer map for parameter buffer unpacking. * \param attrs Additional function attributes. + * \param span The location of this object in the source code. */ TVM_DLL PrimFunc(Array<tir::Var> params, Stmt body, Type ret_type = VoidType(), Map<tir::Var, Buffer> buffer_map = Map<tir::Var, Buffer>(), - DictAttrs attrs = NullValue<DictAttrs>()); + DictAttrs attrs = NullValue<DictAttrs>(), Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(PrimFunc, BaseFunc, PrimFuncNode); TVM_DEFINE_OBJECT_REF_COW_METHOD(PrimFuncNode); diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index 16800d57bda8..661c30110062 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -37,6 +37,15 @@ namespace tir { /*! \brief Base node of all statements. */ class StmtNode : public Object { public: + /*! + * \brief Span that points to the original source code. + * Reserved debug information. + */ + mutable Span span; + + StmtNode() = default; + explicit StmtNode(Span span) : span(span) {} + static constexpr const char* _type_key = "tir.Stmt"; static constexpr const bool _type_has_method_sequal_reduce = true; static constexpr const bool _type_has_method_shash_reduce = true; @@ -89,7 +98,7 @@ class LetStmtNode : public StmtNode { */ class LetStmt : public Stmt { public: - TVM_DLL LetStmt(Var var, PrimExpr value, Stmt body); + TVM_DLL LetStmt(Var var, PrimExpr value, Stmt body, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(LetStmt, Stmt, LetStmtNode); }; @@ -144,7 +153,7 @@ class AttrStmtNode : public StmtNode { */ class AttrStmt : public Stmt { public: - TVM_DLL AttrStmt(ObjectRef node, String attr_key, PrimExpr value, Stmt body); + TVM_DLL AttrStmt(ObjectRef node, String attr_key, PrimExpr value, Stmt body, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(AttrStmt, Stmt, AttrStmtNode); }; @@ -191,7 +200,7 @@ class AssertStmtNode : public StmtNode { */ class AssertStmt : public Stmt { public: - TVM_DLL AssertStmt(PrimExpr condition, PrimExpr message, Stmt body); + TVM_DLL AssertStmt(PrimExpr condition, PrimExpr message, Stmt body, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(AssertStmt, Stmt, AssertStmtNode); }; @@ -254,7 +263,8 @@ class StoreNode : public StmtNode { */ class Store : public Stmt { public: - TVM_DLL Store(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate); + TVM_DLL Store(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate, + Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Store, Stmt, StoreNode); }; @@ -305,7 +315,8 @@ class BufferStoreNode : public StmtNode { */ class BufferStore : public Stmt { public: - TVM_DLL explicit BufferStore(Buffer buffer, PrimExpr value, Array<PrimExpr> indices); + TVM_DLL explicit BufferStore(Buffer buffer, PrimExpr value, Array<PrimExpr> indices, + Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(BufferStore, Stmt, BufferStoreNode); }; @@ -352,8 +363,9 @@ class BufferRealizeNode : public StmtNode { } BufferRealizeNode() = default; - BufferRealizeNode(Buffer buffer, Array<Range> bounds, PrimExpr condition, Stmt body) - : buffer(buffer), bounds(bounds), condition(condition), body(body) {} + BufferRealizeNode(Buffer buffer, Array<Range> bounds, PrimExpr condition, Stmt body, + Span span = Span()) + : StmtNode(span), buffer(buffer), bounds(bounds), condition(condition), body(body) {} static constexpr const char* _type_key = "tir.BufferRealize"; TVM_DECLARE_FINAL_OBJECT_INFO(BufferRealizeNode, StmtNode); @@ -365,7 +377,8 @@ class BufferRealizeNode : public StmtNode { */ class BufferRealize : public Stmt { public: - TVM_DLL explicit BufferRealize(Buffer buffer, Array<Range> bounds, PrimExpr condition, Stmt body); + TVM_DLL explicit BufferRealize(Buffer buffer, Array<Range> bounds, PrimExpr condition, Stmt body, +
Span span = Span()); TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(BufferRealize, Stmt, BufferRealizeNode); }; @@ -416,7 +429,8 @@ class ProducerStoreNode : public StmtNode { */ class ProducerStore : public Stmt { public: - TVM_DLL ProducerStore(DataProducer producer, PrimExpr value, Array<PrimExpr> indices); + TVM_DLL ProducerStore(DataProducer producer, PrimExpr value, Array<PrimExpr> indices, + Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(ProducerStore, Stmt, ProducerStoreNode); }; @@ -472,7 +486,8 @@ class ProducerRealizeNode : public StmtNode { */ class ProducerRealize : public Stmt { public: - TVM_DLL ProducerRealize(DataProducer producer, Region bounds, PrimExpr condition, Stmt body); + TVM_DLL ProducerRealize(DataProducer producer, Region bounds, PrimExpr condition, Stmt body, + Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(ProducerRealize, Stmt, ProducerRealizeNode); }; @@ -540,7 +555,7 @@ class AllocateNode : public StmtNode { class Allocate : public Stmt { public: TVM_DLL Allocate(Var buffer_var, DataType dtype, Array<PrimExpr> extents, PrimExpr condition, - Stmt body); + Stmt body, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(Allocate, Stmt, AllocateNode); }; @@ -579,8 +594,9 @@ class SeqStmt : public Stmt { /*! * \brief Construct SeqStmt. * \param seq The sequence. + * \param span The location of this object in the source code. */ - TVM_DLL explicit SeqStmt(Array<Stmt> seq); + TVM_DLL explicit SeqStmt(Array<Stmt> seq, Span span = Span()); /*! \return get the size of the sequence */ size_t size() const { return operator->()->size(); } @@ -678,7 +694,8 @@ class IfThenElseNode : public StmtNode { */ class IfThenElse : public Stmt { public: - TVM_DLL IfThenElse(PrimExpr condition, Stmt then_case, Stmt else_case = Stmt()); + TVM_DLL IfThenElse(PrimExpr condition, Stmt then_case, Stmt else_case = Stmt(), + Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(IfThenElse, Stmt, IfThenElseNode); }; @@ -712,9 +729,9 @@ class EvaluateNode : public StmtNode { */ class Evaluate : public Stmt { public: - TVM_DLL explicit Evaluate(PrimExpr value); + TVM_DLL explicit Evaluate(PrimExpr value, Span span = Span()); - explicit Evaluate(int value) : Evaluate(PrimExpr(value)) {} + explicit Evaluate(int value, Span span = Span()) : Evaluate(PrimExpr(value), span) {} TVM_DEFINE_OBJECT_REF_METHODS(Evaluate, Stmt, EvaluateNode); }; @@ -799,7 +816,7 @@ class ForNode : public StmtNode { class For : public Stmt { public: TVM_DLL For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAPI device_api, - Stmt body); + Stmt body, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(For, Stmt, ForNode); }; @@ -829,7 +846,8 @@ class PrefetchNode : public StmtNode { } PrefetchNode() = default; - PrefetchNode(Buffer buffer, Array<Range> bounds) : buffer(buffer), bounds(bounds) {} + PrefetchNode(Buffer buffer, Array<Range> bounds, Span span = Span()) + : StmtNode(span), buffer(buffer), bounds(bounds) {} static constexpr const char* _type_key = "tir.Prefetch"; TVM_DECLARE_FINAL_OBJECT_INFO(PrefetchNode, StmtNode); @@ -841,7 +859,7 @@ class PrefetchNode : public StmtNode { */ class Prefetch : public Stmt { public: - TVM_DLL explicit Prefetch(Buffer buffer, Array<Range> bounds); + TVM_DLL explicit Prefetch(Buffer buffer, Array<Range> bounds, Span span = Span()); TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Prefetch, Stmt, PrefetchNode); }; @@ -973,9 +991,10 @@ inline bool IsPragmaKey(const std::string& attr_key) { /*! * \brief Create a type annotation expression * \param dtype The data type + * \param span The location of this object in the source code. * \return Expr an expression with dtype. */ -TVM_DLL PrimExpr TypeAnnotation(DataType dtype); +TVM_DLL PrimExpr TypeAnnotation(DataType dtype, Span span = Span()); // overload printing of for type. TVM_DLL std::ostream& operator<<(std::ostream& os, ForType for_type);
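stmt.h follows the same convention as expr.h, with one structural addition: StmtNode itself now stores a `mutable Span span` and offers a span-taking base constructor, so node classes such as BufferRealizeNode and PrefetchNode simply forward to `StmtNode(span)`. A short sketch of the Python mirror, under the same `tvm.ir.Span`/`tvm.ir.SourceName` assumption as above:

    import tvm
    from tvm import tir

    span = tvm.ir.Span(tvm.ir.SourceName("model.py"), 42, 42, 0, 8)

    v = tir.Var("x", "int32")
    # An Evaluate statement tagged with its (hypothetical) source location,
    # wrapped in a LetStmt that carries the same span.
    inner = tir.Evaluate(tir.IntImm("int32", 0), span)
    body = tir.LetStmt(v, tir.IntImm("int32", 1), inner, span)

As with expressions, the span argument is optional; `tir.LetStmt(v, value, inner)` behaves exactly as before.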
diff --git a/include/tvm/tir/var.h b/include/tvm/tir/var.h index f1651c118010..a2240939ddea 100644 --- a/include/tvm/tir/var.h +++ b/include/tvm/tir/var.h @@ -91,14 +91,17 @@ class Var : public PrimExpr { * \brief Constructor * \param name_hint variable name * \param dtype data type + * \param span The location of this object in the source code. */ - TVM_DLL explicit Var(String name_hint = "v", DataType dtype = DataType::Int(32)); + TVM_DLL explicit Var(String name_hint = "v", DataType dtype = DataType::Int(32), + Span span = Span()); /*! * \brief Constructor which provides a more detailed type annotation. * \param name_hint variable name. * \param type_annotation The type annotation. + * \param span The location of this object in the source code. */ - TVM_DLL explicit Var(String name_hint, Type type_annotation); + TVM_DLL explicit Var(String name_hint, Type type_annotation, Span span = Span()); /*! * \brief Make a new copy of var with same type, append suffix * \param suffix The suffix to be appended. @@ -138,8 +141,10 @@ class SizeVar : public Var { * \brief constructor * \param name_hint variable name * \param t data type + * \param span The location of this object in the source code. */ - TVM_DLL explicit SizeVar(String name_hint = "s", DataType t = DataType::Int(32)); + TVM_DLL explicit SizeVar(String name_hint = "s", DataType t = DataType::Int(32), + Span span = Span()); /*! * \brief Get pointer to the internal value. * \return the corresponding Variable. @@ -246,6 +251,11 @@ class IterVarNode : public Object { * set this if this is binded already to a known thread tag. */ String thread_tag; + /*! + * \brief Span that points to the original source code. + * Reserved debug information. + */ + mutable Span span; void VisitAttrs(AttrVisitor* v) { v->Visit("dom", &dom); @@ -278,7 +288,8 @@ class IterVarNode : public Object { */ class IterVar : public ObjectRef { public: - TVM_DLL IterVar(Range dom, Var var, IterVarType iter_type, String thread_tag = ""); + TVM_DLL IterVar(Range dom, Var var, IterVarType iter_type, String thread_tag = "", + Span span = Span()); /*! * \return the corresponding var in the IterVar. */
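Spans on Var, SizeVar, and IterVar are worth calling out separately: variables are created once and shared across a whole PrimFunc, so a location recorded at creation follows the variable to every use site. A sketch of the corresponding Python surface; the `DataPar` iteration-type constant and `tvm.ir.Range` are existing APIs, while the file name and bounds are assumptions for illustration:

    import tvm
    from tvm import tir

    span = tvm.ir.Span(tvm.ir.SourceName("model.py"), 7, 7, 0, 16)
    x = tir.Var("x", "int32", span)
    # A data-parallel IterVar over [0, 16), carrying the same span.
    iv = tir.IterVar(tvm.ir.Range(0, 16), x, tir.IterVar.DataPar, "", span)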
""" # pylint: disable=super-init-not-called - def __init__(self, name, dtype): - self.__init_handle_by_constructor__(_ffi_api.SizeVar, name, dtype) + def __init__(self, name, dtype, span=None): + self.__init_handle_by_constructor__(_ffi_api.SizeVar, name, dtype, span) @tvm._ffi.register_object("tir.IterVar") @@ -370,6 +376,9 @@ class IterVar(Object, ExprOp): thread_tag : str The thread type tag. + span : Optional[Span] + The location of this itervar in the source code. + See Also -------- te.thread_axis: Create thread axis IterVar. @@ -386,7 +395,7 @@ class IterVar(Object, ExprOp): Parallelized = 7 Tensorized = 8 - def __init__(self, dom, var, iter_type, thread_tag=""): + def __init__(self, dom, var, iter_type, thread_tag="", span=None): if dom is not None: if isinstance(dom, (list, tuple)): if len(dom) != 2: @@ -399,7 +408,7 @@ def __init__(self, dom, var, iter_type, thread_tag=""): name = var if var is not None else "iter" dtype = "int32" if dom is None else dom.extent.dtype var = Var(name, dtype=dtype) if not isinstance(var, Var) else var - self.__init_handle_by_constructor__(_ffi_api.IterVar, dom, var, iter_type, thread_tag) + self.__init_handle_by_constructor__(_ffi_api.IterVar, dom, var, iter_type, thread_tag, span) @tvm._ffi.register_object("tir.CommReducer") @@ -419,11 +428,14 @@ class CommReducer(Object): identity_element : List[PrimExpr] The identity elements. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, lhs, rhs, result, identity_element): + def __init__(self, lhs, rhs, result, identity_element, span=None): self.__init_handle_by_constructor__( - _ffi_api.CommReducer, lhs, rhs, result, identity_element + _ffi_api.CommReducer, lhs, rhs, result, identity_element, span ) @@ -450,11 +462,14 @@ class Reduce(PrimExprWithOp): init : list of Expr The initial value for output. This can be an int, float or ProducerLoad + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, combiner, src, rdom, condition, value_index, init=None): + def __init__(self, combiner, src, rdom, condition, value_index, init=None, span=None): self.__init_handle_by_constructor__( - _ffi_api.Reduce, combiner, src, rdom, condition, value_index, init + _ffi_api.Reduce, combiner, src, rdom, condition, value_index, init, span ) @@ -469,10 +484,13 @@ class FloatImm(ConstExpr): value : float The constant value. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, dtype, value): - self.__init_handle_by_constructor__(tvm.ir._ffi_api.FloatImm, dtype, value) + def __init__(self, dtype, value, span=None): + self.__init_handle_by_constructor__(tvm.ir._ffi_api.FloatImm, dtype, value, span) @tvm._ffi.register_object @@ -486,10 +504,13 @@ class IntImm(ConstExpr): value : int The constant value. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, dtype, value): - self.__init_handle_by_constructor__(tvm.ir._ffi_api.IntImm, dtype, value) + def __init__(self, dtype, value, span=None): + self.__init_handle_by_constructor__(tvm.ir._ffi_api.IntImm, dtype, value, span) def __hash__(self): return self.value @@ -518,10 +539,13 @@ class StringImm(ConstExpr): ---------- value : str The value of the function. + + span : Optional[Span] + The location of this itervar in the source code. 
""" - def __init__(self, value): - self.__init_handle_by_constructor__(_ffi_api.StringImm, value) + def __init__(self, value, span=None): + self.__init_handle_by_constructor__(_ffi_api.StringImm, value, span) def __eq__(self, other): if isinstance(other, ConstExpr): @@ -545,10 +569,13 @@ class Cast(PrimExprWithOp): value : PrimExpr The value of the function. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, dtype, value): - self.__init_handle_by_constructor__(_ffi_api.Cast, dtype, value) + def __init__(self, dtype, value, span=None): + self.__init_handle_by_constructor__(_ffi_api.Cast, dtype, value, span) @tvm._ffi.register_object("tir.Add") @@ -562,10 +589,13 @@ class Add(BinaryOpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.Add, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.Add, a, b, span) @tvm._ffi.register_object("tir.Sub") @@ -579,10 +609,13 @@ class Sub(BinaryOpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.Sub, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.Sub, a, b, span) @tvm._ffi.register_object("tir.Mul") @@ -596,10 +629,13 @@ class Mul(BinaryOpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.Mul, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.Mul, a, b, span) @tvm._ffi.register_object("tir.Div") @@ -613,10 +649,13 @@ class Div(BinaryOpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.Div, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.Div, a, b, span) @tvm._ffi.register_object("tir.Mod") @@ -630,10 +669,13 @@ class Mod(BinaryOpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.Mod, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.Mod, a, b, span) @tvm._ffi.register_object("tir.FloorDiv") @@ -647,10 +689,13 @@ class FloorDiv(BinaryOpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.FloorDiv, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.FloorDiv, a, b, span) @tvm._ffi.register_object("tir.FloorMod") @@ -664,10 +709,13 @@ class FloorMod(BinaryOpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. 
""" - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.FloorMod, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.FloorMod, a, b, span) @tvm._ffi.register_object("tir.Min") @@ -681,10 +729,13 @@ class Min(BinaryOpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.Min, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.Min, a, b, span) @tvm._ffi.register_object("tir.Max") @@ -698,10 +749,13 @@ class Max(BinaryOpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.Max, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.Max, a, b, span) @tvm._ffi.register_object("tir.EQ") @@ -715,10 +769,13 @@ class EQ(CmpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.EQ, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.EQ, a, b, span) @tvm._ffi.register_object("tir.NE") @@ -732,10 +789,13 @@ class NE(CmpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.NE, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.NE, a, b, span) @tvm._ffi.register_object("tir.LT") @@ -749,10 +809,13 @@ class LT(CmpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.LT, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.LT, a, b, span) @tvm._ffi.register_object("tir.LE") @@ -766,10 +829,13 @@ class LE(CmpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.LE, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.LE, a, b, span) @tvm._ffi.register_object("tir.GT") @@ -783,10 +849,13 @@ class GT(CmpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.GT, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.GT, a, b, span) @tvm._ffi.register_object("tir.GE") @@ -800,10 +869,13 @@ class GE(CmpExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.GE, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.GE, a, b, span) @tvm._ffi.register_object("tir.And") @@ -817,10 +889,13 @@ class And(LogicalExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. 
""" - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.And, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.And, a, b, span) @tvm._ffi.register_object("tir.Or") @@ -834,10 +909,13 @@ class Or(LogicalExpr): b : PrimExpr The right hand operand. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a, b): - self.__init_handle_by_constructor__(_ffi_api.Or, a, b) + def __init__(self, a, b, span=None): + self.__init_handle_by_constructor__(_ffi_api.Or, a, b, span) @tvm._ffi.register_object("tir.Not") @@ -848,10 +926,13 @@ class Not(LogicalExpr): ---------- a : PrimExpr The input value + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, a): - self.__init_handle_by_constructor__(_ffi_api.Not, a) + def __init__(self, a, span=None): + self.__init_handle_by_constructor__(_ffi_api.Not, a, span) @tvm._ffi.register_object("tir.Select") @@ -876,10 +957,14 @@ class Select(PrimExprWithOp): false_value : PrimExpr The value to take when condition is false. + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, condition, true_value, false_value): - self.__init_handle_by_constructor__(_ffi_api.Select, condition, true_value, false_value) + def __init__(self, condition, true_value, false_value, span=None): + self.__init_handle_by_constructor__( + _ffi_api.Select, condition, true_value, false_value, span + ) @tvm._ffi.register_object("tir.Load") @@ -899,11 +984,17 @@ class Load(PrimExprWithOp): predicate : PrimExpr The load predicate. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, dtype, buffer_var, index, predicate=None): - args = [] if predicate is None else [predicate] - self.__init_handle_by_constructor__(_ffi_api.Load, dtype, buffer_var, index, *args) + def __init__(self, dtype, buffer_var, index, predicate=None, span=None): + if predicate is None: + predicate = _ffi_api.const_true(dtype) + self.__init_handle_by_constructor__( + _ffi_api.Load, dtype, buffer_var, index, predicate, span + ) @tvm._ffi.register_object("tir.BufferLoad") @@ -917,10 +1008,13 @@ class BufferLoad(PrimExprWithOp): indices : List[PrimExpr] The buffer indices. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, buffer, indices): - self.__init_handle_by_constructor__(_ffi_api.BufferLoad, buffer, indices) + def __init__(self, buffer, indices, span=None): + self.__init_handle_by_constructor__(_ffi_api.BufferLoad, buffer, indices, span) @tvm._ffi.register_object("tir.ProducerLoad") @@ -934,10 +1028,13 @@ class ProducerLoad(PrimExprWithOp): indices : List[PrimExpr] The buffer indices. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, producer, indices): - self.__init_handle_by_constructor__(_ffi_api.ProducerLoad, producer, indices) + def __init__(self, producer, indices, span=None): + self.__init_handle_by_constructor__(_ffi_api.ProducerLoad, producer, indices, span) @tvm._ffi.register_object("tir.Ramp") @@ -954,10 +1051,13 @@ class Ramp(PrimExprWithOp): lanes : int The lanes of the expression. + + span : Optional[Span] + The location of this itervar in the source code. 
""" - def __init__(self, base, stride, lanes): - self.__init_handle_by_constructor__(_ffi_api.Ramp, base, stride, lanes) + def __init__(self, base, stride, lanes, span=None): + self.__init_handle_by_constructor__(_ffi_api.Ramp, base, stride, lanes, span) @tvm._ffi.register_object("tir.Broadcast") @@ -971,10 +1071,13 @@ class Broadcast(PrimExprWithOp): lanes : int The lanes of the expression. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, value, lanes): - self.__init_handle_by_constructor__(_ffi_api.Broadcast, value, lanes) + def __init__(self, value, lanes, span=None): + self.__init_handle_by_constructor__(_ffi_api.Broadcast, value, lanes, span) @tvm._ffi.register_object("tir.Shuffle") @@ -988,10 +1091,13 @@ class Shuffle(PrimExprWithOp): indices : Array of indices The indices + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, vectors, indices): - self.__init_handle_by_constructor__(_ffi_api.Shuffle, vectors, indices) + def __init__(self, vectors, indices, span=None): + self.__init_handle_by_constructor__(_ffi_api.Shuffle, vectors, indices, span) class CallEffectKind: @@ -1020,9 +1126,12 @@ class Call(PrimExprWithOp): args : list of Expr The input arguments to the call + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, dtype, op, args): + def __init__(self, dtype, op, args, span=None): if isinstance(op, str): if not op.startswith("tir."): raise ValueError( @@ -1034,7 +1143,7 @@ def __init__(self, dtype, op, args): % op ) op = Op.get(op) - self.__init_handle_by_constructor__(_ffi_api.Call, dtype, op, args) + self.__init_handle_by_constructor__(_ffi_api.Call, dtype, op, args, span) @tvm._ffi.register_object("tir.Let") @@ -1051,15 +1160,22 @@ class Let(PrimExprWithOp): body : PrimExpr The body expression. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, var, value, body): - self.__init_handle_by_constructor__(_ffi_api.Let, var, value, body) + def __init__(self, var, value, body, span=None): + self.__init_handle_by_constructor__(_ffi_api.Let, var, value, body, span) @tvm._ffi.register_object("tir.Any") class Any(PrimExpr): - """Any node.""" + """Any node. + + span : Optional[Span] + The location of this itervar in the source code. + """ - def __init__(self): - self.__init_handle_by_constructor__(_ffi_api.Any) + def __init__(self, span=None): + self.__init_handle_by_constructor__(_ffi_api.Any, span) diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py index b02ebba18765..79d18d8970b5 100644 --- a/python/tvm/tir/function.py +++ b/python/tvm/tir/function.py @@ -45,9 +45,12 @@ class PrimFunc(BaseFunc): attrs: Optional[tvm.Attrs] Attributes of the function, can be None + + span : Optional[Span] + The location of this itervar in the source code. 
""" - def __init__(self, params, body, ret_type=None, buffer_map=None, attrs=None): + def __init__(self, params, body, ret_type=None, buffer_map=None, attrs=None, span=None): param_list = [] buffer_map = {} if buffer_map is None else buffer_map for x in params: @@ -62,10 +65,10 @@ def __init__(self, params, body, ret_type=None, buffer_map=None, attrs=None): raise TypeError("params can only contain Var or Buffer") self.__init_handle_by_constructor__( - _ffi_api.PrimFunc, param_list, body, ret_type, buffer_map, attrs + _ffi_api.PrimFunc, param_list, body, ret_type, buffer_map, attrs, span ) - def with_body(self, new_body): + def with_body(self, new_body, span=None): """Create a new PrimFunc with the same set signatures but a new body. Parameters @@ -73,9 +76,12 @@ def with_body(self, new_body): new_body : Stmt The new body. + span : Optional[Span] + The location of this itervar in the source code. + Returns ------- new_func : PrimFunc The created new function. """ - return PrimFunc(self.params, new_body, self.ret_type, self.buffer_map, self.attrs) + return PrimFunc(self.params, new_body, self.ret_type, self.buffer_map, self.attrs, span) diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py index 573bc0e7d970..cba4ce337b1d 100644 --- a/python/tvm/tir/stmt.py +++ b/python/tvm/tir/stmt.py @@ -50,10 +50,13 @@ class LetStmt(Stmt): body : Stmt The body statement. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, var, value, body): - self.__init_handle_by_constructor__(_ffi_api.LetStmt, var, value, body) + def __init__(self, var, value, body, span=None): + self.__init_handle_by_constructor__(_ffi_api.LetStmt, var, value, body, span) @tvm._ffi.register_object("tir.AssertStmt") @@ -70,10 +73,13 @@ class AssertStmt(Stmt): body : Stmt The body statement. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, condition, message, body): - self.__init_handle_by_constructor__(_ffi_api.AssertStmt, condition, message, body) + def __init__(self, condition, message, body, span=None): + self.__init_handle_by_constructor__(_ffi_api.AssertStmt, condition, message, body, span) @tvm._ffi.register_object("tir.For") @@ -99,6 +105,9 @@ class For(Stmt): body : Stmt The body statement. + + span : Optional[Span] + The location of this itervar in the source code. """ Serial = 0 @@ -106,9 +115,9 @@ class For(Stmt): Vectorized = 2 Unrolled = 3 - def __init__(self, loop_var, min_val, extent, for_type, device_api, body): + def __init__(self, loop_var, min_val, extent, for_type, device_api, body, span=None): self.__init_handle_by_constructor__( - _ffi_api.For, loop_var, min_val, extent, for_type, device_api, body + _ffi_api.For, loop_var, min_val, extent, for_type, device_api, body, span ) @@ -129,11 +138,17 @@ class Store(Stmt): predicate : PrimExpr The store predicate. + + span : Optional[Span] + The location of this itervar in the source code. 
""" - def __init__(self, buffer_var, value, index, predicate=None): - args = [] if predicate is None else [predicate] - self.__init_handle_by_constructor__(_ffi_api.Store, buffer_var, value, index, *args) + def __init__(self, buffer_var, value, index, predicate=None, span=None): + if predicate is None: + predicate = _ffi_api.const_true(value.dtype) + self.__init_handle_by_constructor__( + _ffi_api.Store, buffer_var, value, index, predicate, span + ) @tvm._ffi.register_object("tir.BufferStore") @@ -150,10 +165,13 @@ class BufferStore(Stmt): indices : List[PrimExpr] The indices location to be stored. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, buffer, value, indices): - self.__init_handle_by_constructor__(_ffi_api.BufferStore, buffer, value, indices) + def __init__(self, buffer, value, indices, span=None): + self.__init_handle_by_constructor__(_ffi_api.BufferStore, buffer, value, indices, span) @tvm._ffi.register_object("tir.BufferRealize") @@ -173,10 +191,15 @@ class BufferRealize(Stmt): body : Stmt The body of the statement. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, buffer, bounds, condition, body): - self.__init_handle_by_constructor__(_ffi_api.BufferRealize, buffer, bounds, condition, body) + def __init__(self, buffer, bounds, condition, body, span=None): + self.__init_handle_by_constructor__( + _ffi_api.BufferRealize, buffer, bounds, condition, body, span + ) @tvm._ffi.register_object("tir.ProducerStore") @@ -193,10 +216,13 @@ class ProducerStore(Stmt): indices : list of Expr The index arguments of the store. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, producer, value, indices): - self.__init_handle_by_constructor__(_ffi_api.ProducerStore, producer, value, indices) + def __init__(self, producer, value, indices, span=None): + self.__init_handle_by_constructor__(_ffi_api.ProducerStore, producer, value, indices, span) @tvm._ffi.register_object("tir.Allocate") @@ -219,11 +245,14 @@ class Allocate(Stmt): body : Stmt The body statement. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, buffer_var, dtype, extents, condition, body): + def __init__(self, buffer_var, dtype, extents, condition, body, span=None): self.__init_handle_by_constructor__( - _ffi_api.Allocate, buffer_var, dtype, extents, condition, body + _ffi_api.Allocate, buffer_var, dtype, extents, condition, body, span ) @@ -244,10 +273,13 @@ class AttrStmt(Stmt): body : Stmt The body statement. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, node, attr_key, value, body): - self.__init_handle_by_constructor__(_ffi_api.AttrStmt, node, attr_key, value, body) + def __init__(self, node, attr_key, value, body, span=None): + self.__init_handle_by_constructor__(_ffi_api.AttrStmt, node, attr_key, value, body, span) @tvm._ffi.register_object("tir.ProducerRealize") @@ -267,11 +299,14 @@ class ProducerRealize(Stmt): body : Stmt The realize body + + span : Optional[Span] + The location of this itervar in the source code. 
""" - def __init__(self, producer, bounds, condition, body): + def __init__(self, producer, bounds, condition, body, span=None): self.__init_handle_by_constructor__( - _ffi_api.ProducerRealize, producer, bounds, condition, body + _ffi_api.ProducerRealize, producer, bounds, condition, body, span ) @@ -283,10 +318,13 @@ class SeqStmt(Stmt): ---------- seq : List[Stmt] The statements + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, seq): - self.__init_handle_by_constructor__(_ffi_api.SeqStmt, seq) + def __init__(self, seq, span=None): + self.__init_handle_by_constructor__(_ffi_api.SeqStmt, seq, span) def __getitem__(self, i): return self.seq[i] @@ -309,10 +347,15 @@ class IfThenElse(Stmt): else_case : Stmt The statement to execute if condition is false. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, condition, then_case, else_case): - self.__init_handle_by_constructor__(_ffi_api.IfThenElse, condition, then_case, else_case) + def __init__(self, condition, then_case, else_case, span=None): + self.__init_handle_by_constructor__( + _ffi_api.IfThenElse, condition, then_case, else_case, span + ) @tvm._ffi.register_object("tir.Evaluate") @@ -323,10 +366,13 @@ class Evaluate(Stmt): ---------- value : PrimExpr The expression to be evalued. + + span : Optional[Span] + The location of this itervar in the source code. """ - def __init__(self, value): - self.__init_handle_by_constructor__(_ffi_api.Evaluate, value) + def __init__(self, value, span=None): + self.__init_handle_by_constructor__(_ffi_api.Evaluate, value, span) @tvm._ffi.register_object("tir.Prefetch") @@ -340,10 +386,13 @@ class Prefetch(Stmt): bounds : list of Range The bounds to be prefetched. + + span : Optional[Span] + The location of this itervar in the source code. 
""" - def __init__(self, buffer, bounds): - self.__init_handle_by_constructor__(_ffi_api.Prefetch, buffer, bounds) + def __init__(self, buffer, bounds, span=None): + self.__init_handle_by_constructor__(_ffi_api.Prefetch, buffer, bounds, span) def stmt_seq(*args): diff --git a/src/ir/expr.cc b/src/ir/expr.cc index 67e5cea93011..0b7049ec212b 100644 --- a/src/ir/expr.cc +++ b/src/ir/expr.cc @@ -55,20 +55,23 @@ PrimExpr PrimExpr::FromObject_(ObjectRef ref) { return Downcast(ref); } -IntImm::IntImm(DataType dtype, int64_t value) { - ICHECK(dtype.is_scalar()) << "ValueError: IntImm can only take scalar."; - ICHECK(dtype.is_int() || dtype.is_uint()) << "ValueError: IntImm supports only int or uint type."; +IntImm::IntImm(DataType dtype, int64_t value, Span span) { + ICHECK(dtype.is_scalar()) << "ValueError: IntImm can only take scalar, but " << dtype + << " was supplied."; + ICHECK(dtype.is_int() || dtype.is_uint()) + << "ValueError: IntImm supports only int or uint type, but " << dtype << " was supplied."; if (dtype.is_uint()) { ICHECK_GE(value, 0U); } ObjectPtr node = make_object(); node->dtype = dtype; node->value = value; + node->span = span; data_ = std::move(node); } -TVM_REGISTER_GLOBAL("ir.IntImm").set_body_typed([](DataType dtype, int64_t value) { - return IntImm(dtype, value); +TVM_REGISTER_GLOBAL("ir.IntImm").set_body_typed([](DataType dtype, int64_t value, Span span) { + return IntImm(dtype, value, span); }); TVM_REGISTER_NODE_TYPE(IntImmNode); @@ -83,16 +86,17 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) } }); -FloatImm::FloatImm(DataType dtype, double value) { +FloatImm::FloatImm(DataType dtype, double value, Span span) { ICHECK_EQ(dtype.lanes(), 1) << "ValueError: FloatImm can only take scalar."; ObjectPtr node = make_object(); node->dtype = dtype; node->value = value; + node->span = span; data_ = std::move(node); } -TVM_REGISTER_GLOBAL("ir.FloatImm").set_body_typed([](DataType dtype, double value) { - return FloatImm(dtype, value); +TVM_REGISTER_GLOBAL("ir.FloatImm").set_body_typed([](DataType dtype, double value, Span span) { + return FloatImm(dtype, value, span); }); TVM_REGISTER_NODE_TYPE(FloatImmNode); diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index fd55f2418628..faa483d019c0 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -1154,7 +1154,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const BroadcastNode* op) { } void CodeGenLLVM::VisitStmt_(const StoreNode* op) { - ICHECK(is_one(op->predicate)); + ICHECK(is_one(op->predicate)) << op->predicate; DataType t = op->value.dtype(); bool is_volatile = volatile_buf_.count(op->buffer_var.get()); llvm::Value* buffer = MakeValue(op->buffer_var); diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 08b2224e9912..7db49093e596 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -45,9 +45,9 @@ Array SimplifyArray(arith::Analyzer* ana, Array array) { return array; } -Buffer decl_buffer(Array shape, DataType dtype, String name) { - return Buffer(Var(name, PointerType(PrimType(dtype))), dtype, shape, Array(), - PrimExpr(), name, "", 0, 0, kDefault); +Buffer decl_buffer(Array shape, DataType dtype, String name, Span span) { + return Buffer(Var(name, PointerType(PrimType(dtype)), span), dtype, shape, Array(), + PrimExpr(), name, "", 0, 0, kDefault, span); } // Split the given expression w.r.t the add operator @@ -382,7 +382,7 @@ PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lane Buffer::Buffer(Var data, DataType dtype, Array 
shape, Array<PrimExpr> strides, PrimExpr elem_offset, String name, String scope, int data_alignment, - int offset_factor, BufferType buffer_type) { + int offset_factor, BufferType buffer_type, Span span) { ICHECK(IsPointerType(data->type_annotation, dtype)) << "Buffer data field expect to have the right pointer type annotation" << " annotation=" << data->type_annotation << ", dtype=" << dtype; @@ -416,6 +416,7 @@ Buffer::Buffer(Var data, DataType dtype, Array<PrimExpr> shape, Array<PrimExpr> n->strides.push_back(Var("stride", n->shape[i].dtype())); } } + n->span = std::move(span); data_ = std::move(n); } diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc index 825bac86919c..2d2a29943383 100644 --- a/src/tir/ir/expr.cc +++ b/src/tir/ir/expr.cc @@ -34,7 +34,7 @@ namespace tvm { namespace tir { #define TVM_DEFINE_BINOP_CONSTRUCTOR(Name) \ - Name::Name(PrimExpr a, PrimExpr b) { \ + Name::Name(PrimExpr a, PrimExpr b, Span span) { \ using T = Name::ContainerType; \ ICHECK(a.defined()) << "ValueError: a is undefined\n"; \ ICHECK(b.defined()) << "ValueError: b is undefined\n"; \ @@ -43,11 +43,12 @@ namespace tir { node->dtype = a.dtype(); \ node->a = std::move(a); \ node->b = std::move(b); \ + node->span = std::move(span); \ data_ = std::move(node); \ } #define TVM_DEFINE_CMPOP_CONSTRUCTOR(Name) \ - Name::Name(PrimExpr a, PrimExpr b) { \ + Name::Name(PrimExpr a, PrimExpr b, Span span) { \ using T = Name::ContainerType; \ ICHECK(a.defined()) << "ValueError: a is undefined\n"; \ ICHECK(b.defined()) << "ValueError: b is undefined\n"; \ @@ -56,22 +57,25 @@ namespace tir { node->dtype = DataType::Bool(a.dtype().lanes()); \ node->a = std::move(a); \ node->b = std::move(b); \ + node->span = std::move(span); \ data_ = std::move(node); \ } // Var -Var::Var(String name_hint, DataType dtype) { +Var::Var(String name_hint, DataType dtype, Span span) { auto n = make_object<VarNode>(); n->name_hint = std::move(name_hint); n->dtype = std::move(dtype); + n->span = std::move(span); data_ = std::move(n); } -Var::Var(String name_hint, Type type_annotation) { +Var::Var(String name_hint, Type type_annotation, Span span) { auto n = make_object<VarNode>(); n->name_hint = std::move(name_hint); n->dtype = GetRuntimeDataType(type_annotation); n->type_annotation = std::move(type_annotation); + n->span = std::move(span); data_ = std::move(n); } @@ -87,11 +91,12 @@ Var Var::copy_with_suffix(const String& suffix) const { return Var(new_ptr); } -TVM_REGISTER_GLOBAL("tir.Var").set_body_typed([](String name_hint, runtime::TVMArgValue type) { +TVM_REGISTER_GLOBAL("tir.Var").set_body_typed([](String name_hint, runtime::TVMArgValue type, + Span span) { if (type.IsObjectRef<Type>()) { - return Var(name_hint, type.operator Type()); + return Var(name_hint, type.operator Type(), span); } else { - return Var(name_hint, type.operator DataType()); + return Var(name_hint, type.operator DataType(), span); } }); @@ -106,15 +111,16 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // SizeVar -SizeVar::SizeVar(String name_hint, DataType dtype) { +SizeVar::SizeVar(String name_hint, DataType dtype, Span span) { auto n = make_object<SizeVarNode>(); n->name_hint = std::move(name_hint); n->dtype = std::move(dtype); + n->span = std::move(span); data_ = std::move(n); } -TVM_REGISTER_GLOBAL("tir.SizeVar").set_body_typed([](String s, DataType t) { - return SizeVar(s, t); +TVM_REGISTER_GLOBAL("tir.SizeVar").set_body_typed([](String s, DataType t, Span span) { + return SizeVar(s, t, span); }); TVM_REGISTER_NODE_TYPE(SizeVarNode); @@ -126,18 +132,19 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // IterVar
-IterVar::IterVar(Range dom, Var var, IterVarType t, String thread_tag) { +IterVar::IterVar(Range dom, Var var, IterVarType t, String thread_tag, Span span) { ObjectPtr<IterVarNode> n = make_object<IterVarNode>(); n->dom = dom; n->var = var; n->iter_type = t; n->thread_tag = thread_tag; + n->span = std::move(span); data_ = std::move(n); } TVM_REGISTER_GLOBAL("tir.IterVar") - .set_body_typed([](Range dom, Var var, int iter_type, String thread_tag) { - return IterVar(dom, var, static_cast<IterVarType>(iter_type), thread_tag); + .set_body_typed([](Range dom, Var var, int iter_type, String thread_tag, Span span) { + return IterVar(dom, var, static_cast<IterVarType>(iter_type), thread_tag, span); }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) @@ -159,14 +166,17 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) TVM_REGISTER_NODE_TYPE(IterVarNode); // StringImm -StringImm::StringImm(String value) { +StringImm::StringImm(String value, Span span) { ObjectPtr<StringImmNode> node = make_object<StringImmNode>(); node->dtype = DataType::Handle(); node->value = std::move(value); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.StringImm").set_body_typed([](String value) { return StringImm(value); }); +TVM_REGISTER_GLOBAL("tir.StringImm").set_body_typed([](String value, Span span) { + return StringImm(value, span); +}); TVM_REGISTER_NODE_TYPE(StringImmNode); @@ -177,17 +187,18 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Cast -Cast::Cast(DataType t, PrimExpr value) { +Cast::Cast(DataType t, PrimExpr value, Span span) { ICHECK(value.defined()); ICHECK_EQ(t.lanes(), value.dtype().lanes()); ObjectPtr<CastNode> node = make_object<CastNode>(); node->dtype = t; node->value = std::move(value); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.Cast").set_body_typed([](DataType dtype, PrimExpr value) { - return Cast(dtype, value); +TVM_REGISTER_GLOBAL("tir.Cast").set_body_typed([](DataType dtype, PrimExpr value, Span span) { + return Cast(dtype, value, span); }); TVM_REGISTER_NODE_TYPE(CastNode); @@ -203,7 +214,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Add TVM_DEFINE_BINOP_CONSTRUCTOR(Add); -TVM_REGISTER_GLOBAL("tir.Add").set_body_typed([](PrimExpr a, PrimExpr b) { return Add(a, b); }); +TVM_REGISTER_GLOBAL("tir.Add").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return Add(a, b, span); +}); TVM_REGISTER_NODE_TYPE(AddNode); @@ -220,7 +233,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Sub TVM_DEFINE_BINOP_CONSTRUCTOR(Sub); -TVM_REGISTER_GLOBAL("tir.Sub").set_body_typed([](PrimExpr a, PrimExpr b) { return Sub(a, b); }); +TVM_REGISTER_GLOBAL("tir.Sub").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return Sub(a, b, span); +}); TVM_REGISTER_NODE_TYPE(SubNode); @@ -237,7 +252,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Mul TVM_DEFINE_BINOP_CONSTRUCTOR(Mul); -TVM_REGISTER_GLOBAL("tir.Mul").set_body_typed([](PrimExpr a, PrimExpr b) { return Mul(a, b); }); +TVM_REGISTER_GLOBAL("tir.Mul").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return Mul(a, b, span); +}); TVM_REGISTER_NODE_TYPE(MulNode); @@ -254,7 +271,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Div TVM_DEFINE_BINOP_CONSTRUCTOR(Div); -TVM_REGISTER_GLOBAL("tir.Div").set_body_typed([](PrimExpr a, PrimExpr b) { return Div(a, b); }); +TVM_REGISTER_GLOBAL("tir.Div").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return Div(a, b, span); +}); TVM_REGISTER_NODE_TYPE(DivNode); @@ -271,7 +290,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Mod TVM_DEFINE_BINOP_CONSTRUCTOR(Mod);
-TVM_REGISTER_GLOBAL("tir.Mod").set_body_typed([](PrimExpr a, PrimExpr b) { return Mod(a, b); }); +TVM_REGISTER_GLOBAL("tir.Mod").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return Mod(a, b, span); +}); TVM_REGISTER_NODE_TYPE(ModNode); @@ -288,8 +309,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // FloorDiv TVM_DEFINE_BINOP_CONSTRUCTOR(FloorDiv); -TVM_REGISTER_GLOBAL("tir.FloorDiv").set_body_typed([](PrimExpr a, PrimExpr b) { - return FloorDiv(a, b); +TVM_REGISTER_GLOBAL("tir.FloorDiv").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return FloorDiv(a, b, span); }); TVM_REGISTER_NODE_TYPE(FloorDivNode); @@ -303,8 +324,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // FloorMod TVM_DEFINE_BINOP_CONSTRUCTOR(FloorMod); -TVM_REGISTER_GLOBAL("tir.FloorMod").set_body_typed([](PrimExpr a, PrimExpr b) { - return FloorMod(a, b); +TVM_REGISTER_GLOBAL("tir.FloorMod").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return FloorMod(a, b, span); }); TVM_REGISTER_NODE_TYPE(FloorModNode); @@ -318,7 +339,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Min TVM_DEFINE_BINOP_CONSTRUCTOR(Min); -TVM_REGISTER_GLOBAL("tir.Min").set_body_typed([](PrimExpr a, PrimExpr b) { return Min(a, b); }); +TVM_REGISTER_GLOBAL("tir.Min").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return Min(a, b, span); +}); TVM_REGISTER_NODE_TYPE(MinNode); @@ -335,7 +358,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Max TVM_DEFINE_BINOP_CONSTRUCTOR(Max); -TVM_REGISTER_GLOBAL("tir.Max").set_body_typed([](PrimExpr a, PrimExpr b) { return Max(a, b); }); +TVM_REGISTER_GLOBAL("tir.Max").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return Max(a, b, span); +}); TVM_REGISTER_NODE_TYPE(MaxNode); @@ -352,7 +377,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // EQ TVM_DEFINE_CMPOP_CONSTRUCTOR(EQ); -TVM_REGISTER_GLOBAL("tir.EQ").set_body_typed([](PrimExpr a, PrimExpr b) { return EQ(a, b); }); +TVM_REGISTER_GLOBAL("tir.EQ").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return EQ(a, b, span); +}); TVM_REGISTER_NODE_TYPE(EQNode); @@ -369,7 +396,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // NE TVM_DEFINE_CMPOP_CONSTRUCTOR(NE); -TVM_REGISTER_GLOBAL("tir.NE").set_body_typed([](PrimExpr a, PrimExpr b) { return NE(a, b); }); +TVM_REGISTER_GLOBAL("tir.NE").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return NE(a, b, span); +}); TVM_REGISTER_NODE_TYPE(NENode); @@ -386,7 +415,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // LT TVM_DEFINE_CMPOP_CONSTRUCTOR(LT); -TVM_REGISTER_GLOBAL("tir.LT").set_body_typed([](PrimExpr a, PrimExpr b) { return LT(a, b); }); +TVM_REGISTER_GLOBAL("tir.LT").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return LT(a, b, span); +}); TVM_REGISTER_NODE_TYPE(LTNode); @@ -403,7 +434,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // LE TVM_DEFINE_CMPOP_CONSTRUCTOR(LE); -TVM_REGISTER_GLOBAL("tir.LE").set_body_typed([](PrimExpr a, PrimExpr b) { return LE(a, b); }); +TVM_REGISTER_GLOBAL("tir.LE").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return LE(a, b, span); +}); TVM_REGISTER_NODE_TYPE(LENode); @@ -420,7 +453,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // GT TVM_DEFINE_CMPOP_CONSTRUCTOR(GT); -TVM_REGISTER_GLOBAL("tir.GT").set_body_typed([](PrimExpr a, PrimExpr b) { return GT(a, b); }); +TVM_REGISTER_GLOBAL("tir.GT").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return GT(a, b, span); +}); TVM_REGISTER_NODE_TYPE(GTNode); @@ -437,7 +472,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // GE 
TVM_DEFINE_CMPOP_CONSTRUCTOR(GE); -TVM_REGISTER_GLOBAL("tir.GE").set_body_typed([](PrimExpr a, PrimExpr b) { return GE(a, b); }); +TVM_REGISTER_GLOBAL("tir.GE").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return GE(a, b, span); +}); TVM_REGISTER_NODE_TYPE(GENode); @@ -452,7 +489,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // And -And::And(PrimExpr a, PrimExpr b) { +And::And(PrimExpr a, PrimExpr b, Span span) { ICHECK(a.defined()) << "ValueError: a is undefined"; ICHECK(b.defined()) << "ValueError: b is undefined"; ICHECK(a.dtype().is_bool()); @@ -463,10 +500,13 @@ And::And(PrimExpr a, PrimExpr b) { node->dtype = DataType::Bool(a.dtype().lanes()); node->a = std::move(a); node->b = std::move(b); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.And").set_body_typed([](PrimExpr a, PrimExpr b) { return And(a, b); }); +TVM_REGISTER_GLOBAL("tir.And").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return And(a, b, span); +}); TVM_REGISTER_NODE_TYPE(AndNode); @@ -481,7 +521,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Or -Or::Or(PrimExpr a, PrimExpr b) { +Or::Or(PrimExpr a, PrimExpr b, Span span) { ICHECK(a.defined()) << "ValueError: a is undefined"; ICHECK(b.defined()) << "ValueError: b is undefined"; ICHECK(a.dtype().is_bool()); @@ -492,10 +532,13 @@ Or::Or(PrimExpr a, PrimExpr b) { node->dtype = DataType::Bool(a.dtype().lanes()); node->a = std::move(a); node->b = std::move(b); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.Or").set_body_typed([](PrimExpr a, PrimExpr b) { return Or(a, b); }); +TVM_REGISTER_GLOBAL("tir.Or").set_body_typed([](PrimExpr a, PrimExpr b, Span span) { + return Or(a, b, span); +}); TVM_REGISTER_NODE_TYPE(OrNode); @@ -510,17 +553,18 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Not -Not::Not(PrimExpr a) { +Not::Not(PrimExpr a, Span span) { ICHECK(a.defined()) << "ValueError: a is undefined"; ICHECK(a.dtype().is_bool()); ObjectPtr node = make_object(); node->dtype = DataType::Bool(a.dtype().lanes()); node->a = std::move(a); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.Not").set_body_typed([](PrimExpr a) { return Not(a); }); +TVM_REGISTER_GLOBAL("tir.Not").set_body_typed([](PrimExpr a, Span span) { return Not(a, span); }); TVM_REGISTER_NODE_TYPE(NotNode); @@ -532,7 +576,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Select -Select::Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value) { +Select::Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value, Span span) { ICHECK(condition.defined()) << "ValueError: condition is undefined"; ICHECK(true_value.defined()) << "ValueError: true_value is undefined"; ICHECK(false_value.defined()) << "ValueError: true_value is undefined"; @@ -545,12 +589,13 @@ Select::Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value) { node->condition = std::move(condition); node->true_value = std::move(true_value); node->false_value = std::move(false_value); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_GLOBAL("tir.Select") - .set_body_typed([](PrimExpr condition, PrimExpr true_value, PrimExpr false_value) { - return Select(condition, true_value, false_value); + .set_body_typed([](PrimExpr condition, PrimExpr true_value, PrimExpr false_value, Span span) { + return Select(condition, true_value, false_value, span); }); TVM_REGISTER_NODE_TYPE(SelectNode); @@ -568,7 +613,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) 
}); // Load -Load::Load(DataType dtype, Var buffer_var, PrimExpr index, PrimExpr predicate) { +Load::Load(DataType dtype, Var buffer_var, PrimExpr index, PrimExpr predicate, Span span) { ICHECK(buffer_var.defined()); ICHECK(predicate.defined()); ICHECK(index.defined()); @@ -580,6 +625,7 @@ Load::Load(DataType dtype, Var buffer_var, PrimExpr index, PrimExpr predicate) { node->buffer_var = std::move(buffer_var); node->index = std::move(index); node->predicate = std::move(predicate); + node->span = std::move(span); data_ = std::move(node); } @@ -587,9 +633,11 @@ Load::Load(DataType dtype, Var buffer_var, PrimExpr index, PrimExpr predicate) { TVM_REGISTER_GLOBAL("tir.Load").set_body([](TVMArgs args, TVMRetValue* ret) { DataType t = args[0]; if (args.size() == 3) { - *ret = Load(t, args[1], args[2], const_true(t.lanes())); + *ret = Load(t, args[1], args[2], const_true(t.lanes()), Span()); + } else if (args.size() == 4) { + *ret = Load(t, args[1], args[2], args[3], Span()); } else { - *ret = Load(t, args[1], args[2], args[3]); + *ret = Load(t, args[1], args[2], args[3], args[4]); } }); @@ -608,7 +656,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Ramp -Ramp::Ramp(PrimExpr base, PrimExpr stride, int lanes) { +Ramp::Ramp(PrimExpr base, PrimExpr stride, int lanes, Span span) { ICHECK(base.defined()); ICHECK(stride.defined()); ICHECK(base.dtype().is_scalar()); @@ -621,12 +669,14 @@ Ramp::Ramp(PrimExpr base, PrimExpr stride, int lanes) { node->base = base; node->stride = stride; node->lanes = lanes; + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.Ramp").set_body_typed([](PrimExpr base, PrimExpr stride, int lanes) { - return Ramp(base, stride, lanes); -}); +TVM_REGISTER_GLOBAL("tir.Ramp") + .set_body_typed([](PrimExpr base, PrimExpr stride, int lanes, Span span) { + return Ramp(base, stride, lanes, span); + }); TVM_REGISTER_NODE_TYPE(RampNode); @@ -641,7 +691,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Broadcast -Broadcast::Broadcast(PrimExpr value, int lanes) { +Broadcast::Broadcast(PrimExpr value, int lanes, Span span) { ICHECK(value.defined()); ICHECK(value.dtype().is_scalar()); ICHECK_GT(lanes, 1); @@ -650,11 +700,12 @@ Broadcast::Broadcast(PrimExpr value, int lanes) { node->dtype = value.dtype().with_lanes(lanes); node->value = std::move(value); node->lanes = lanes; + node->span = std::move(span); data_ = node; } -TVM_REGISTER_GLOBAL("tir.Broadcast").set_body_typed([](PrimExpr value, int lanes) { - return Broadcast(value, lanes); +TVM_REGISTER_GLOBAL("tir.Broadcast").set_body_typed([](PrimExpr value, int lanes, Span span) { + return Broadcast(value, lanes, span); }); TVM_REGISTER_NODE_TYPE(BroadcastNode); @@ -668,7 +719,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Let -Let::Let(Var var, PrimExpr value, PrimExpr body) { +Let::Let(Var var, PrimExpr value, PrimExpr body, Span span) { ICHECK(value.defined()); ICHECK(body.defined()); ICHECK_EQ(value.dtype(), var.dtype()); @@ -678,11 +729,13 @@ Let::Let(Var var, PrimExpr value, PrimExpr body) { node->var = std::move(var); node->value = std::move(value); node->body = std::move(body); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.Let").set_body_typed([](Var var, PrimExpr value, PrimExpr body) { - return Let(var, value, body); +TVM_REGISTER_GLOBAL("tir.Let").set_body_typed([](Var var, PrimExpr value, PrimExpr body, + Span span) { + return Let(var, value, body, span); }); TVM_REGISTER_NODE_TYPE(LetNode); @@ -698,7 +751,7 @@ 
TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Call -Call::Call(DataType dtype, RelayExpr op, Array<PrimExpr> args) { +Call::Call(DataType dtype, RelayExpr op, Array<PrimExpr> args, Span span) { for (size_t i = 0; i < args.size(); ++i) { ICHECK(args[i].defined()); } @@ -707,11 +760,12 @@ node->dtype = dtype; node->op = std::move(op); node->args = std::move(args); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_GLOBAL("tir.Call") - .set_body_typed([](DataType type, RelayExpr op, Array<ObjectRef> args) { + .set_body_typed([](DataType type, RelayExpr op, Array<ObjectRef> args, Span span) { Array<PrimExpr> prim_expr_args; for (const auto& it : args) { ICHECK(it->IsInstance<runtime::StringObj>() || it->IsInstance<PrimExprNode>()); @@ -721,7 +775,7 @@ TVM_REGISTER_GLOBAL("tir.Call") prim_expr_args.push_back(Downcast<PrimExpr>(it)); } } - return Call(type, op, prim_expr_args); + return Call(type, op, prim_expr_args, span); }); TVM_REGISTER_NODE_TYPE(CallNode); @@ -746,7 +800,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Shuffle -Shuffle::Shuffle(Array<PrimExpr> vectors, Array<PrimExpr> indices) { +Shuffle::Shuffle(Array<PrimExpr> vectors, Array<PrimExpr> indices, Span span) { ICHECK_NE(vectors.size(), 0U); ICHECK_NE(indices.size(), 0U); @@ -763,10 +817,11 @@ Shuffle::Shuffle(Array<PrimExpr> vectors, Array<PrimExpr> indices) { node->dtype = base_type.with_lanes(static_cast<int>(indices.size())); node->vectors = std::move(vectors); node->indices = std::move(indices); + node->span = std::move(span); data_ = node; } -PrimExpr Shuffle::Concat(Array<PrimExpr> vectors) { +PrimExpr Shuffle::Concat(Array<PrimExpr> vectors, Span span) { ICHECK_NE(vectors.size(), 0); if (vectors.size() == 1) { return vectors[0]; } @@ -778,16 +833,16 @@ indices.push_back(IntImm(DataType::Int(32), index++)); } } - return Shuffle(vectors, indices); + return Shuffle(vectors, indices, span); } -PrimExpr Shuffle::ExtractElement(PrimExpr vector, int index) { - return Shuffle({vector}, {Integer(index)}); +PrimExpr Shuffle::ExtractElement(PrimExpr vector, int index, Span span) { + return Shuffle({vector}, {Integer(index)}, span); } TVM_REGISTER_GLOBAL("tir.Shuffle") - .set_body_typed([](Array<PrimExpr> vectors, Array<PrimExpr> indices) { - return Shuffle(vectors, indices); + .set_body_typed([](Array<PrimExpr> vectors, Array<PrimExpr> indices, Span span) { + return Shuffle(vectors, indices, span); }); TVM_REGISTER_NODE_TYPE(ShuffleNode); @@ -814,12 +869,13 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // CommReducer CommReducer::CommReducer(Array<Var> lhs, Array<Var> rhs, Array<PrimExpr> result, - Array<PrimExpr> identity_element) { + Array<PrimExpr> identity_element, Span span) { auto node = make_object<CommReducerNode>(); node->lhs = lhs; node->rhs = rhs; node->result = result; node->identity_element = identity_element; + node->span = std::move(span); data_ = std::move(node); } @@ -839,8 +895,8 @@ Array<PrimExpr> CommReducerNode::operator()(Array<PrimExpr> a, Array<PrimExpr> b TVM_REGISTER_GLOBAL("tir.CommReducer") .set_body_typed([](Array<Var> lhs, Array<Var> rhs, Array<PrimExpr> result, - Array<PrimExpr> identity_element) { - return CommReducer(lhs, rhs, result, identity_element); + Array<PrimExpr> identity_element, Span span) { + return CommReducer(lhs, rhs, result, identity_element, span); }); TVM_REGISTER_GLOBAL("tir.CommReducerCombine") @@ -857,7 +913,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Reduce Reduce::Reduce(CommReducer combiner, Array<PrimExpr> source, Array<IterVar> axis, - PrimExpr condition, int value_index, Array<PrimExpr> init) { + PrimExpr condition, int value_index, Array<PrimExpr> init, Span span) { for (size_t i = 0; i < axis.size(); ++i) { ICHECK_EQ(axis[i]->iter_type, kCommReduce) << "Can only take axis created by reduce_axis"; } @@ -884,13 +940,14
@@ Reduce::Reduce(CommReducer combiner, Array source, Array axis n->axis = std::move(axis); n->condition = condition; n->value_index = value_index; + n->span = std::move(span); data_ = std::move(n); } TVM_REGISTER_GLOBAL("tir.Reduce") .set_body_typed([](CommReducer combiner, Array source, Array axis, - PrimExpr condition, int value_index, Array init) { - return Reduce(combiner, source, axis, condition, value_index, init); + PrimExpr condition, int value_index, Array init, Span span) { + return Reduce(combiner, source, axis, condition, value_index, init, span); }); TVM_REGISTER_NODE_TYPE(ReduceNode); @@ -908,13 +965,14 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Any -Any::Any() { +Any::Any(Span span) { auto n = make_object(); n->dtype = DataType::Int(32); + n->span = std::move(span); data_ = std::move(n); } -TVM_REGISTER_GLOBAL("tir.Any").set_body_typed([]() { return Any(); }); +TVM_REGISTER_GLOBAL("tir.Any").set_body_typed([](Span span) { return Any(span); }); TVM_REGISTER_NODE_TYPE(AnyNode); @@ -922,17 +980,19 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { p->stream << "?"; }); // BufferLoad -BufferLoad::BufferLoad(Buffer buffer, Array indices) { +BufferLoad::BufferLoad(Buffer buffer, Array indices, Span span) { ObjectPtr node = make_object(); node->dtype = buffer->dtype; node->buffer = std::move(buffer); node->indices = std::move(indices); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.BufferLoad").set_body_typed([](Buffer buffer, Array indices) { - return BufferLoad(buffer, indices); -}); +TVM_REGISTER_GLOBAL("tir.BufferLoad") + .set_body_typed([](Buffer buffer, Array indices, Span span) { + return BufferLoad(buffer, indices, span); + }); TVM_REGISTER_NODE_TYPE(BufferLoadNode); @@ -950,17 +1010,18 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // ProducerLoad -ProducerLoad::ProducerLoad(DataProducer producer, Array indices) { +ProducerLoad::ProducerLoad(DataProducer producer, Array indices, Span span) { ObjectPtr node = make_object(); node->dtype = producer->GetDataType(); node->producer = std::move(producer); node->indices = std::move(indices); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_GLOBAL("tir.ProducerLoad") - .set_body_typed([](DataProducer producer, Array indices) { - return ProducerLoad(producer, indices); + .set_body_typed([](DataProducer producer, Array indices, Span span) { + return ProducerLoad(producer, indices, span); }); TVM_REGISTER_NODE_TYPE(ProducerLoadNode); diff --git a/src/tir/ir/function.cc b/src/tir/ir/function.cc index 1149e039cae4..ef7f4f8e16dd 100644 --- a/src/tir/ir/function.cc +++ b/src/tir/ir/function.cc @@ -30,7 +30,7 @@ namespace tir { // Get the function type of a PrimFunc PrimFunc::PrimFunc(Array params, Stmt body, Type ret_type, - Map buffer_map, DictAttrs attrs) { + Map buffer_map, DictAttrs attrs, Span span) { // Assume void-return type for now // TODO(tvm-team) consider type deduction from body. 
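The hunks above thread an optional Span through every TIR expression constructor and its FFI registration, so a node built from parsed source can remember where it came from. A minimal Python sketch of what this enables, assuming the Python bindings forward the new trailing argument (the file name and positions are illustrative only):

    import tvm
    from tvm import tir
    from tvm.ir import Span, SourceName

    # Hypothetical location: (source, line, end_line, column, end_column).
    span = Span(SourceName("model.py"), 10, 10, 4, 20)
    x = tir.Var("x", "float32")
    let = tir.Let(x, tir.FloatImm("float32", 1.0), x + x, span)
    assert let.span is not None  # the location survives on the node
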
if (!ret_type.defined()) { @@ -43,6 +43,7 @@ PrimFunc::PrimFunc(Array params, Stmt body, Type ret_type, n->buffer_map = std::move(buffer_map); n->attrs = std::move(attrs); n->checked_type_ = n->func_type_annotation(); + n->span = std::move(span); data_ = std::move(n); } @@ -73,8 +74,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) TVM_REGISTER_GLOBAL("tir.PrimFunc") .set_body_typed([](Array params, Stmt body, Type ret_type, - Map buffer_map, DictAttrs attrs) { - return PrimFunc(params, body, ret_type, buffer_map, attrs); + Map buffer_map, DictAttrs attrs, Span span) { + return PrimFunc(params, body, ret_type, buffer_map, attrs, span); }); } // namespace tir diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index dbbc99c3abed..86960d9bd999 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -29,7 +29,7 @@ namespace tvm { namespace tir { // LetStmt -LetStmt::LetStmt(Var var, PrimExpr value, Stmt body) { +LetStmt::LetStmt(Var var, PrimExpr value, Stmt body, Span span) { ICHECK(value.defined()); ICHECK(body.defined()); ICHECK_EQ(value.dtype(), var.dtype()); @@ -38,12 +38,14 @@ LetStmt::LetStmt(Var var, PrimExpr value, Stmt body) { node->var = std::move(var); node->value = std::move(value); node->body = std::move(body); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.LetStmt").set_body_typed([](Var var, PrimExpr value, Stmt body) { - return LetStmt(var, value, body); -}); +TVM_REGISTER_GLOBAL("tir.LetStmt") + .set_body_typed([](Var var, PrimExpr value, Stmt body, Span span) { + return LetStmt(var, value, body, span); + }); TVM_REGISTER_NODE_TYPE(LetStmtNode); @@ -58,18 +60,19 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // AttrStmt -AttrStmt::AttrStmt(ObjectRef node, String attr_key, PrimExpr value, Stmt body) { +AttrStmt::AttrStmt(ObjectRef node, String attr_key, PrimExpr value, Stmt body, Span span) { auto n = make_object(); n->node = node; n->attr_key = std::move(attr_key); n->value = std::move(value); n->body = std::move(body); + n->span = std::move(span); data_ = std::move(n); } TVM_REGISTER_GLOBAL("tir.AttrStmt") - .set_body_typed([](ObjectRef node, String attr_key, PrimExpr value, Stmt body) { - return AttrStmt(node, attr_key, value, body); + .set_body_typed([](ObjectRef node, String attr_key, PrimExpr value, Stmt body, Span span) { + return AttrStmt(node, attr_key, value, body, span); }); TVM_REGISTER_NODE_TYPE(AttrStmtNode); @@ -87,7 +90,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // AssertStmt -AssertStmt::AssertStmt(PrimExpr condition, PrimExpr message, Stmt body) { +AssertStmt::AssertStmt(PrimExpr condition, PrimExpr message, Stmt body, Span span) { ICHECK(condition.defined()); ICHECK(message.dtype() == DataType::Int(32) || message.as()) << "TypeError: AssertStmt message must be an int or string:" << message << "\n"; @@ -96,18 +99,19 @@ AssertStmt::AssertStmt(PrimExpr condition, PrimExpr message, Stmt body) { node->condition = std::move(condition); node->message = std::move(message); node->body = std::move(body); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_NODE_TYPE(AssertStmtNode); TVM_REGISTER_GLOBAL("tir.AssertStmt") - .set_body_typed([](PrimExpr condition, ObjectRef message, Stmt body) { + .set_body_typed([](PrimExpr condition, ObjectRef message, Stmt body, Span span) { if (const auto* str = message.as()) { auto msg = StringImm(str->data); - return AssertStmt(condition, msg, body); + return AssertStmt(condition, msg, body, span); } else { - return AssertStmt(condition, Downcast(message), 
body); + return AssertStmt(condition, Downcast(message), body, span); } }); @@ -125,7 +129,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // For For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAPI device_api, - Stmt body) { + Stmt body, Span span) { ICHECK(min.defined()); ICHECK(extent.defined()); ICHECK(min.dtype().is_scalar()); @@ -140,13 +144,15 @@ For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAP node->for_type = for_type; node->device_api = device_api; node->body = std::move(body); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_GLOBAL("tir.For").set_body_typed([](Var loop_var, PrimExpr min, PrimExpr extent, - int for_type, int device_api, Stmt body) { + int for_type, int device_api, Stmt body, + Span span) { return For(loop_var, min, extent, static_cast(for_type), - static_cast(device_api), body); + static_cast(device_api), body, span); }); TVM_REGISTER_NODE_TYPE(ForNode); @@ -188,7 +194,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Store -Store::Store(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate) { +Store::Store(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate, Span span) { ICHECK(value.defined()); ICHECK(index.defined()); ICHECK(predicate.defined()); @@ -200,15 +206,18 @@ Store::Store(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate) node->value = std::move(value); node->index = std::move(index); node->predicate = std::move(predicate); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_GLOBAL("tir.Store").set_body([](TVMArgs args, TVMRetValue* ret) { PrimExpr value = args[1]; if (args.size() == 3) { - *ret = Store(args[0], value, args[2], const_true(value.dtype().lanes())); + *ret = Store(args[0], value, args[2], const_true(value.dtype().lanes()), Span()); + } else if (args.size() == 4) { + *ret = Store(args[0], value, args[2], args[3], Span()); } else { - *ret = Store(args[0], value, args[2], args[3]); + *ret = Store(args[0], value, args[2], args[3], args[4]); } }); @@ -230,17 +239,19 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // ProducerStore -ProducerStore::ProducerStore(DataProducer producer, PrimExpr value, Array indices) { +ProducerStore::ProducerStore(DataProducer producer, PrimExpr value, Array indices, + Span span) { ObjectPtr node = make_object(); node->producer = std::move(producer); node->value = std::move(value); node->indices = std::move(indices); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_GLOBAL("tir.ProducerStore") - .set_body_typed([](DataProducer producer, PrimExpr value, Array indices) { - return ProducerStore(producer, value, indices); + .set_body_typed([](DataProducer producer, PrimExpr value, Array indices, Span span) { + return ProducerStore(producer, value, indices, span); }); TVM_REGISTER_NODE_TYPE(ProducerStoreNode); @@ -262,7 +273,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Allocate Allocate::Allocate(Var buffer_var, DataType dtype, Array extents, PrimExpr condition, - Stmt body) { + Stmt body, Span span) { // TODO(tvm-team): Add invariant check to make sure // IsPointerPType(buffer_var->type_annotation, dtype) // once we fix the allocate tvm script printing. 
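The tir.Store registration above keeps backward compatibility by dispatching on argument count: with three arguments the predicate defaults to const_true(value.dtype.lanes()) and the span to Span(); with four, only the span is defaulted. A small sketch of the Python-side effect (the buffer variable and dtypes are illustrative):

    import tvm
    from tvm import tir

    buf = tir.Var("A", "handle")        # the buffer variable
    val = tir.FloatImm("float32", 1.0)
    idx = tir.IntImm("int32", 0)
    st = tir.Store(buf, val, idx)       # predicate and span are defaulted
    st2 = tir.Store(buf, val, idx, tir.IntImm("bool", 1))  # explicit predicate
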
@@ -280,6 +291,7 @@ Allocate::Allocate(Var buffer_var, DataType dtype, Array extents, Prim node->extents = std::move(extents); node->condition = std::move(condition); node->body = std::move(body); + node->span = std::move(span); data_ = std::move(node); } @@ -300,7 +312,9 @@ int32_t AllocateNode::constant_allocation_size(const Array& extents) { TVM_REGISTER_GLOBAL("tir.Allocate") .set_body_typed([](Var buffer_var, DataType type, Array extents, PrimExpr condition, - Stmt body) { return Allocate(buffer_var, type, extents, condition, body); }); + Stmt body, Span span) { + return Allocate(buffer_var, type, extents, condition, body, span); + }); TVM_REGISTER_NODE_TYPE(AllocateNode); @@ -324,7 +338,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // ProducerRealize ProducerRealize::ProducerRealize(DataProducer producer, Region bounds, PrimExpr condition, - Stmt body) { + Stmt body, Span span) { for (size_t i = 0; i < bounds.size(); ++i) { ICHECK(bounds[i]->min.defined()); ICHECK(bounds[i]->extent.defined()); @@ -340,12 +354,14 @@ ProducerRealize::ProducerRealize(DataProducer producer, Region bounds, PrimExpr node->bounds = std::move(bounds); node->condition = std::move(condition); node->body = std::move(body); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_GLOBAL("tir.ProducerRealize") - .set_body_typed([](DataProducer producer, Region bounds, PrimExpr condition, Stmt body) { - return ProducerRealize(producer, bounds, condition, body); + .set_body_typed([](DataProducer producer, Region bounds, PrimExpr condition, Stmt body, + Span span) { + return ProducerRealize(producer, bounds, condition, body, span); }); TVM_REGISTER_NODE_TYPE(ProducerRealizeNode); @@ -379,13 +395,14 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Prefetch -Prefetch::Prefetch(Buffer buffer, Array bounds) { - data_ = make_object(buffer, bounds); +Prefetch::Prefetch(Buffer buffer, Array bounds, Span span) { + data_ = make_object(buffer, bounds, span); } -TVM_REGISTER_GLOBAL("tir.Prefetch").set_body_typed([](Buffer buffer, Array bounds) { - return Prefetch(buffer, bounds); -}); +TVM_REGISTER_GLOBAL("tir.Prefetch") + .set_body_typed([](Buffer buffer, Array bounds, Span span) { + return Prefetch(buffer, bounds, span); + }); TVM_REGISTER_NODE_TYPE(PrefetchNode); @@ -406,14 +423,15 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // SeqStmt -SeqStmt::SeqStmt(Array seq) { +SeqStmt::SeqStmt(Array seq, Span span) { auto node = make_object(); node->seq = std::move(seq); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.SeqStmt").set_body_typed([](Array seq) { - return SeqStmt(std::move(seq)); +TVM_REGISTER_GLOBAL("tir.SeqStmt").set_body_typed([](Array seq, Span span) { + return SeqStmt(std::move(seq), span); }); TVM_REGISTER_NODE_TYPE(SeqStmtNode); @@ -427,7 +445,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // IfThenElse -IfThenElse::IfThenElse(PrimExpr condition, Stmt then_case, Stmt else_case) { +IfThenElse::IfThenElse(PrimExpr condition, Stmt then_case, Stmt else_case, Span span) { ICHECK(condition.defined()); ICHECK(then_case.defined()); // else_case may be null. 
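The statement constructors receive the same treatment as the expressions: each gains a trailing Span that is moved into the node. A hedged sketch, again assuming the Python bindings expose the parameter:

    import tvm
    from tvm import tir
    from tvm.ir import Span, SourceName

    span = Span(SourceName("model.py"), 3, 3, 0, 0)  # illustrative location
    nop = tir.Evaluate(tir.IntImm("int32", 0), span)
    branch = tir.IfThenElse(tir.IntImm("bool", 1), nop, None, span)  # else may be None
    body = tir.SeqStmt([nop, branch], span)
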
@@ -435,14 +453,15 @@ IfThenElse::IfThenElse(PrimExpr condition, Stmt then_case, Stmt else_case) { node->condition = std::move(condition); node->then_case = std::move(then_case); node->else_case = std::move(else_case); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_NODE_TYPE(IfThenElseNode); TVM_REGISTER_GLOBAL("tir.IfThenElse") - .set_body_typed([](PrimExpr condition, Stmt then_case, Stmt else_case) { - return IfThenElse(condition, then_case, else_case); + .set_body_typed([](PrimExpr condition, Stmt then_case, Stmt else_case, Span span) { + return IfThenElse(condition, then_case, else_case, span); }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) @@ -477,15 +496,18 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // Evaluate -Evaluate::Evaluate(PrimExpr value) { +Evaluate::Evaluate(PrimExpr value, Span span) { ICHECK(value.defined()); ObjectPtr node = make_object(); node->value = std::move(value); + node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.Evaluate").set_body_typed([](PrimExpr value) { return Evaluate(value); }); +TVM_REGISTER_GLOBAL("tir.Evaluate").set_body_typed([](PrimExpr value, Span span) { + return Evaluate(value, span); +}); TVM_REGISTER_NODE_TYPE(EvaluateNode); @@ -498,17 +520,18 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // BufferStore -BufferStore::BufferStore(Buffer buffer, PrimExpr value, Array indices) { +BufferStore::BufferStore(Buffer buffer, PrimExpr value, Array indices, Span span) { ObjectPtr node = make_object(); node->buffer = std::move(buffer); node->value = std::move(value); node->indices = std::move(indices); + node->span = std::move(span); data_ = std::move(node); } TVM_REGISTER_GLOBAL("tir.BufferStore") - .set_body_typed([](Buffer buffer, PrimExpr value, Array indices) { - return BufferStore(buffer, value, indices); + .set_body_typed([](Buffer buffer, PrimExpr value, Array indices, Span span) { + return BufferStore(buffer, value, indices, span); }); TVM_REGISTER_NODE_TYPE(BufferStoreNode); @@ -529,14 +552,14 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // BufferRealize -BufferRealize::BufferRealize(Buffer buffer, Array bounds, PrimExpr condition, Stmt body) { - data_ = make_object(buffer, bounds, condition, body); +BufferRealize::BufferRealize(Buffer buffer, Array bounds, PrimExpr condition, Stmt body, + Span span) { + data_ = make_object(buffer, bounds, condition, body, span); } TVM_REGISTER_GLOBAL("tir.BufferRealize") - .set_body_typed([](Buffer buffer, Array bounds, PrimExpr condition, Stmt body) { - return BufferRealize(buffer, bounds, condition, body); - }); + .set_body_typed([](Buffer buffer, Array bounds, PrimExpr condition, Stmt body, + Span span) { return BufferRealize(buffer, bounds, condition, body, span); }); TVM_REGISTER_NODE_TYPE(BufferRealizeNode); @@ -568,9 +591,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << "}\n"; }); -PrimExpr TypeAnnotation(DataType dtype) { +PrimExpr TypeAnnotation(DataType dtype, Span span) { static auto op = Op::Get("tir.type_annotation"); - return tir::Call(dtype, op, {}); + return tir::Call(dtype, op, {}, span); } TVM_REGISTER_OP("tir.type_annotation") diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index 71321d2a3b02..1a6df556876d 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -922,4 +922,8 @@ TVM_REGISTER_GLOBAL("tir._OpIfThenElse") return if_then_else(cond, true_value, false_value); }); +TVM_REGISTER_GLOBAL("tir.const_true").set_body_typed([](DataType t) { + return const_true(t.lanes()); +}); + } // namespace tvm From 
746ad51d534ceaa575c4a4d7ca5c56896e21bcf7 Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Tue, 10 Nov 2020 18:20:33 -0800
Subject: [PATCH 162/258] [AutoScheduler] Improve tuning with random cost model
 (#6835)

* fix

* more fix

* fix

* revert

* format

* Update sketch_policy.cc

* increase measure trial to avoid flaky
---
 .../tvm/auto_scheduler/relay_integration.py   |  4 +-
 .../search_policy/sketch_policy.cc            | 58 +++++++++++--------
 .../test_auto_scheduler_search_policy.py      |  2 +-
 3 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index 24a4c44ba432..c8a4ed5ac9d2 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -72,9 +72,9 @@ def extract_tasks(mod, params, target, target_host=None, hardware_params=None):
     from tvm import relay

     if isinstance(target, str):
-        target = Target(target)
+        target = tvm.target.Target(target)
     if isinstance(target_host, str):
-        target_host = Target(target_host)
+        target_host = tvm.target.Target(target_host)

     # Run the compiler to collect all TOPI calls during compilation.
     env = TracingEnvironment(TracingMode.EXTRACT_TASK)
diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
index e4e186bc11d7..6360c72e3e9e 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -264,7 +264,6 @@ Array<State> SketchPolicyNode::SearchOneRound(int num_random_states, Array<Stat
       static_cast<int>(
           GetDoubleParam(params, SketchParamKey::SampleInitPopulation::use_measured_ratio) *
           population));
-  bool is_cost_model_reasonable = !program_cost_model->IsInstance<RandomModelNode>();

   // 1. Generate sketches
   if (sketch_cache_.empty()) {
@@ -274,23 +273,17 @@ Array<State> SketchPolicyNode::SearchOneRound(int num_random_states, Array<Stat
   // 2. Sample the init population
   Array<State> init_population = SampleInitPopulation(sketch_cache_);

-  // 3. Perform evolutionary search if a cost model is utilized. Otherwise,
-  // just return some random states.
-  if (is_cost_model_reasonable) {
-    // Also insert already measured good states to the initial population
-    std::vector<int> indices = Argsort(measured_states_throughputs_);
-    for (int i = 0; i < num_use_measured; i++) {
-      init_population.push_back(measured_states_vector_[indices[i]]);
-    }
-    // Sample some random states for eps-greedy
-    if (num_random_states > 0 && random_states != nullptr) {
-      *random_states = RandomSampleStates(init_population, &rand_gen, num_random_states);
-    }
-    return EvolutionarySearch(init_population, num_measure_per_iter_ * 2);
-  } else {
-    PruneInvalidState(search_task, &init_population);
-    return RandomSampleStates(init_population, &rand_gen, num_measure_per_iter_ * 2);
-  }
+  // 3. Perform evolutionary search.
+  // Also insert already measured good states to the initial population
+  std::vector<int> indices = Argsort(measured_states_throughputs_);
+  for (int i = 0; i < num_use_measured; i++) {
+    init_population.push_back(measured_states_vector_[indices[i]]);
+  }
+  // Sample some random states for eps-greedy
+  if (num_random_states > 0 && random_states != nullptr) {
+    *random_states = RandomSampleStates(init_population, &rand_gen, num_random_states);
+  }
+  return EvolutionarySearch(init_population, num_measure_per_iter_ * 2);
 }

 Array<State> SketchPolicyNode::GenerateSketches() {
@@ -378,6 +371,7 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches
   }

   auto tic_begin = std::chrono::high_resolution_clock::now();

+  std::unordered_set<std::string> explored_state_strs;
   size_t iter = 1;
   size_t target_size = min_population;
   size_t unchange_cnt = 0;
@@ -421,10 +415,13 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches
     std::vector<float> pop_scores;
     pop_scores.reserve(cand_states.size());
     cand_states = search_task->compute_dag.InferBound(cand_states);
+    PruneInvalidState(search_task, &cand_states);
     program_cost_model->Predict(search_task, cand_states, &pop_scores);

     for (size_t i = 0; i < cand_states.size(); i++) {
-      if (pop_scores[i] > -1e10) {
+      const auto state_str = cand_states[i].ToStr();
+      if (pop_scores[i] > -1e10 && explored_state_strs.count(state_str) == 0) {
+        explored_state_strs.insert(state_str);
         out_states.push_back(std::move(cand_states[i]));
         unchange_cnt = 0;  // Reset the counter once we found a valid state
       } else {
@@ -449,7 +446,7 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches
       if (target_size > 1) {
         target_size /= 2;
         StdCout(verbose) << "#Target has been reduced to " << target_size
-                         << " due to too many failures";
+                         << " due to too many failures or duplications" << std::endl;
       }
       unchange_cnt = 0;
     }
@@ -471,8 +468,15 @@ Array<State> SketchPolicyNode::EvolutionarySearch(const Array<State>& init_popul
   auto tic_begin = std::chrono::high_resolution_clock::now();

   size_t population = GetIntParam(params, SketchParamKey::EvolutionarySearch::population);
-  int num_iters = GetIntParam(params, SketchParamKey::EvolutionarySearch::num_iters);
   double mutation_prob = GetDoubleParam(params, SketchParamKey::EvolutionarySearch::mutation_prob);
+  int num_iters = GetIntParam(params, SketchParamKey::EvolutionarySearch::num_iters);
+
+  bool is_cost_model_reasonable = !program_cost_model->IsInstance<RandomModelNode>();
+  if (!is_cost_model_reasonable && num_iters > 3) {
+    num_iters = 3;
+    StdCout(verbose) << "GA iteration number has been adjusted to " << num_iters
+                     << " due to random cost model" << std::endl;
+  }
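With the restructuring above, evolutionary search always runs; a random cost model now only caps the GA iteration count at 3 and relies on the new state deduplication. A usage sketch against the auto_scheduler API of this era (the matmul workload and trial count are illustrative, not part of the patch):

    import tvm
    from tvm import auto_scheduler, te

    @auto_scheduler.register_workload
    def matmul(N, M, K):
        A = te.placeholder((N, K), name="A")
        B = te.placeholder((K, M), name="B")
        k = te.reduce_axis((0, K), name="k")
        C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
        return [A, B, C]

    task = auto_scheduler.create_task(matmul, (128, 128, 128), tvm.target.Target("llvm"))
    # RandomModel is exactly the case the iteration cap above targets.
    policy = auto_scheduler.SketchPolicy(task, program_cost_model=auto_scheduler.RandomModel())
    options = auto_scheduler.TuningOptions(num_measure_trials=10)
    sch, args = auto_scheduler.auto_schedule(task, search_policy=policy, tuning_options=options)
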
  // Two ping pong buffers to avoid copy.
  Array<State> states_buf1{init_population}, states_buf2;
@@ -493,7 +497,7 @@ Array<State> SketchPolicyNode::EvolutionarySearch(const Array<State>& init_popul
   // auxiliary global variables
   std::vector<float> pop_scores;
   std::vector<double> pop_selection_probs;
-  float max_score = 0.0;
+  float max_score = -1e-10;
   pop_scores.reserve(population);
   pop_selection_probs.reserve(population);
   std::uniform_real_distribution<> dis(0.0, 1.0);
@@ -541,9 +545,15 @@ Array<State> SketchPolicyNode::EvolutionarySearch(const Array<State>& init_popul

     // Print statistical information
     if (k % 5 == 0 || k == num_iters) {
-      StdCout(verbose) << "GA Iter: " << k << std::fixed << std::setprecision(4)
-                       << "\tMax score: " << max_score << "\tMin score: " << heap.front().second
-                       << "\t#Pop: " << pnow->size() << "\t#M+: " << mutation_success_ct / (k + 1)
+      StdCout(verbose) << "GA Iter: " << k;
+      if (!heap.empty()) {
+        StdCout(verbose) << std::fixed << std::setprecision(4) << "\tMax score: " << max_score
+                         << std::fixed << std::setprecision(4)
+                         << "\tMin score: " << heap.front().second;
+      } else {
+        StdCout(verbose) << "\tMax score: N/A\tMin score: N/A";
+      }
+      StdCout(verbose) << "\t#Pop: " << heap.size() << "\t#M+: " << mutation_success_ct / (k + 1)
                        << "\t#M-: " << mutation_fail_ct / (k + 1) << std::endl;
     }
     if (k == num_iters) {
diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py
index 5329f3d50685..6493c246d406 100644
--- a/tests/python/unittest/test_auto_scheduler_search_policy.py
+++ b/tests/python/unittest/test_auto_scheduler_search_policy.py
@@ -37,7 +37,7 @@ def search_common(
     seed=random.randint(1, 1 << 30),
     runner="local",
     cost_model=auto_scheduler.RandomModel(),
-    num_measure_trials=2,
+    num_measure_trials=10,
     init_search_callbacks=None,
 ):
     print("Test %s schedule search with the default search policy" % (target))

From 2b40d3f7d15a23a59301a862615cb14ab2d64d8e Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Wed, 11 Nov 2020 15:09:05 -0800
Subject: [PATCH 163/258] fix (#6902)

---
 cmake/modules/contrib/TensorRT.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmake/modules/contrib/TensorRT.cmake b/cmake/modules/contrib/TensorRT.cmake
index 1536d23205a7..24a8241a2229 100644
--- a/cmake/modules/contrib/TensorRT.cmake
+++ b/cmake/modules/contrib/TensorRT.cmake
@@ -18,6 +18,9 @@
 # TensorRT Codegen only. This can be enabled independently of USE_TENSORRT_RUNTIME to enable
 # compilation of TensorRT modules without requiring TensorRT to be installed. The compiled modules
 # will only be able to be executed using a TVM built with USE_TENSORRT_RUNTIME=ON.
+
+include (FindPackageHandleStandardArgs)
+
 if(USE_TENSORRT_CODEGEN)
   message(STATUS "Build with TensorRT codegen")
   file(GLOB COMPILER_TENSORRT_SRCS src/relay/backend/contrib/tensorrt/*.cc)

From c94d2c5626c4802e5b6e7694e6653c677d8b29e6 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart
Date: Wed, 11 Nov 2020 16:58:23 -0700
Subject: [PATCH 164/258] Dynamic gpu tests, add dynamic strided slice to topi
 (#6870)

* enable GPU tests for dynamic ops

* strided-slice can't do 0-sized output tensors, remove test

* move dynamic strided slice into topi

* add python interface to topi dynamic strided slice
  add python interface, tests

* autoformat

* fix bad copy/paste

* fix doc string

* disable topk on gpu for now, remove invalid slice test
---
 include/tvm/topi/transform.h                  | 34 +++++++++++++++
 python/tvm/topi/transform.py                  | 16 +++++++-
 python/tvm/topi/utils.py                      | 36 ++++++++++++++++
 src/relay/op/dyn/tensor/transform.cc          | 24 +---------
 src/topi/transform.cc                         |  4 ++
 .../relay/dyn/test_dynamic_op_level10.py      | 33 +++++++++++++--
 .../relay/dyn/test_dynamic_op_level2.py       | 13 +++---
 .../relay/dyn/test_dynamic_op_level3.py       | 15 +++----
 .../relay/dyn/test_dynamic_op_level4.py       |  4 +-
 .../relay/dyn/test_dynamic_op_level5.py       |  3 +-
 .../relay/dyn/test_dynamic_op_level6.py       |  2 +-
 tests/python/relay/test_op_level4.py          |  4 +-
 .../python/topi/python/test_topi_transform.py | 41 +++++++++++++++++++
 13 files changed, 174 insertions(+), 55 deletions(-)

diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index e3cc2f61f57b..9fe3fb10822b 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -550,6 +550,40 @@ inline Array<Tensor> split(const Tensor& x, Array<Integer> split_indices, int a
   return result;
 }

+/*!
+ * \brief strided_slice of a tensor with dynamic begin/end/stride
+ *
+ * \param x The input tensor
+ * \param begin The indices to begin with in the slicing
+ * \param end Indices indicating end of the slice
+ * \param strides Specifies the stride values, it can be negative
+ * in that case, the input tensor will be reversed in that particular axis
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor whose op member is the dynamic strided_slice operation
+ */
+inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& begin,
+                                        const te::Tensor& end, const te::Tensor& strides,
+                                        std::string name = "T_strided_slice_dynamic",
+                                        std::string tag = topi::kInjective) {
+  int64_t src_tensor_dim = x->shape.size();
+  Array<PrimExpr> out_shape;
+  for (int64_t i = 0; i < src_tensor_dim; ++i) {
+    out_shape.push_back(tvm::tir::Var("dim"));
+  }
+  return te::compute(
+      out_shape,
+      [&](const Array<tvm::tir::Var>& indices) {
+        Array<PrimExpr> real_indices;
+        for (int32_t i = 0; i < src_tensor_dim; ++i) {
+          real_indices.push_back(indices[i] * strides(i) + begin(i));
+        }
+        return x(real_indices);
+      },
+      name, tag);
+}
+
 /*!
  * \brief strided_slice of a tensor
  *
diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py
index b4a7d1c414da..cdf9ce5c9275 100644
--- a/python/tvm/topi/transform.py
+++ b/python/tvm/topi/transform.py
@@ -22,7 +22,7 @@
 from tvm import topi
 from . import cpp
 from .
import tag -from .utils import within_index, make_idx +from .utils import within_index, make_idx, const_vector def expand_dims(a, axis, num_newaxis=1): @@ -200,6 +200,20 @@ def strided_slice(a, begin, end, strides=None, slice_mode="end"): ------- ret : tvm.te.Tensor """ + if ( + isinstance(begin, tvm.te.Tensor) + or isinstance(end, tvm.te.Tensor) + or isinstance(strides, tvm.te.Tensor) + ): + if not isinstance(begin, tvm.te.Tensor): + begin = const_vector(begin) + if not isinstance(end, tvm.te.Tensor): + end = const_vector(end) + if strides is None: + strides = [1] * begin.shape[0].value + if not isinstance(strides, tvm.te.Tensor): + strides = const_vector(strides) + return cpp.dynamic_strided_slice(a, begin, end, strides) if strides is None: strides = [] return cpp.strided_slice(a, begin, end, strides, slice_mode) diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py index ea08f3a94fad..a5df788d38cb 100644 --- a/python/tvm/topi/utils.py +++ b/python/tvm/topi/utils.py @@ -18,6 +18,8 @@ """Common topi utilities""" from __future__ import absolute_import as _abs from numbers import Integral +import numpy as np + import tvm from tvm import te @@ -188,6 +190,40 @@ def get_const_tuple(in_tuple): return tuple(ret) +def const_vector(vector, name="const_vector"): + """convert a const numpy 1-dimensional vector to tvm tensor + + Parameters + ---------- + vector: numpy.ndarray + Const input array + name: str, optional + The name of output op + + Returns + ------- + tensor: Tensor + The created tensor + """ + if not isinstance(vector, np.ndarray): + vector = np.array(vector) + row = vector.shape[0] + dtype = str(vector.dtype) + idxm = tvm.tir.indexmod + + def select_array(i): + now = tvm.tir.const(0.0, dtype) + for ii in range(row): + now = tvm.tir.Select( + tvm.tir.all(idxm(i, row) == ii), + tvm.tir.const(vector[ii], dtype), + now, + ) + return now + + return te.compute(vector.shape, select_array, name=name) + + def get_float_tuple(in_tuple): """Verifies input tuple is FloatImm, returns tuple of float. 
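const_vector above is what lets topi.strided_slice accept a mix of Python lists and te.Tensor arguments: any tensor among begin/end/strides routes the call to the new dynamic implementation, and remaining lists are lifted into 1-D tensors. A sketch of the dynamic path, mirroring the test added at the end of this patch (shapes are illustrative):

    import tvm
    from tvm import te, topi

    A = te.placeholder((3, 4, 3), name="A")
    begin = te.placeholder((3,), name="begin", dtype="int64")
    end = te.placeholder((3,), name="end", dtype="int64")
    strides = te.placeholder((3,), name="strides", dtype="int64")
    # One te.Tensor is enough to select cpp.dynamic_strided_slice; the output
    # shape stays symbolic until begin/end/strides values are supplied.
    B = topi.strided_slice(A, begin, end, strides)
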
diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc index 119eba3da188..a609e701c49f 100644 --- a/src/relay/op/dyn/tensor/transform.cc +++ b/src/relay/op/dyn/tensor/transform.cc @@ -472,28 +472,6 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr return true; } -inline te::Tensor DynamicStridedSlice(const te::Tensor& input, const te::Tensor& begin, - const te::Tensor& end, const te::Tensor& strides, - std::string name = "T_strided_slice_dynamic", - std::string tag = topi::kInjective) { - int64_t src_tensor_dim = input->shape.size(); - Array out_shape; - for (int64_t i = 0; i < src_tensor_dim; ++i) { - out_shape.push_back(tvm::tir::Var("dim")); - } - // TODO(yongwww): move the compute into topi - return te::compute( - out_shape, - [&](const Array& indices) { - Array real_indices; - for (int32_t i = 0; i < src_tensor_dim; ++i) { - real_indices.push_back(indices[i] * strides(i) + begin(i)); - } - return input(real_indices); - }, - name, tag); -} - Array StridedSliceCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { te::Tensor data = inputs[0]; @@ -507,7 +485,7 @@ Array StridedSliceCompute(const Attrs& attrs, const Arrayshape[0].as()->value == data_rank) << "begin, end, and strides are required to have the same length" << " if they are dynamic variables."; - return Array{DynamicStridedSlice(data, begin, end, strides)}; + return Array{topi::dynamic_strided_slice(data, begin, end, strides)}; } Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides, String slice_mode) { diff --git a/src/topi/transform.cc b/src/topi/transform.cc index 2d7657eedcdd..e1e3988f6400 100644 --- a/src/topi/transform.cc +++ b/src/topi/transform.cc @@ -169,6 +169,10 @@ TVM_REGISTER_GLOBAL("topi.strided_slice").set_body([](TVMArgs args, TVMRetValue* *rv = strided_slice(args[0], args[1], args[2], args[3], args[4]); }); +TVM_REGISTER_GLOBAL("topi.dynamic_strided_slice").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = dynamic_strided_slice(args[0], args[1], args[2], args[3]); +}); + TVM_REGISTER_GLOBAL("topi.one_hot").set_body([](TVMArgs args, TVMRetValue* rv) { int depth = args[3]; int axis = args[4]; diff --git a/tests/python/relay/dyn/test_dynamic_op_level10.py b/tests/python/relay/dyn/test_dynamic_op_level10.py index 18e1dd5bb72e..a520f6c2c368 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level10.py +++ b/tests/python/relay/dyn/test_dynamic_op_level10.py @@ -27,8 +27,8 @@ import random import tvm.testing -# TODO(mbrookhart): Enable when the VM supports heterogenus execution -# @tvm.testing.uses_gpu + +@tvm.testing.uses_gpu def test_broadcast_to(): def verify_more_dynamic_broadcast_to(x_shape, out_shape): rank = len(out_shape) @@ -82,8 +82,33 @@ def verify_broadcast_to(x_shape, out_shape): verify_broadcast_to((4, 1), (1, 4, 3)) -# TODO(mbrookhart): Enable when the VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu +def test_dyn_broadcast_to(): + dtype = "uint8" + rank = 3 + shape_type = "int64" + dyn_shape = relay.Var("shape", relay.ty.TensorType((rank,), shape_type)) + x_shape = (1,) + x = relay.Var("x", relay.ty.TensorType(x_shape, dtype)) + z = relay.broadcast_to(x, dyn_shape) + zz = run_infer_type(z) + + assert zz.checked_type == relay.ty.TensorType((relay.Any(),) * rank, dtype) + + func = relay.Function([x, dyn_shape], z) + + x = np.random.uniform(size=x_shape).astype(dtype) + dyn_shape = (1,) * rank + ref_res = np.broadcast_to(x, dyn_shape) + for target, ctx in 
tvm.testing.enabled_targets(): + for kind in ["vm", "debug"]: + mod = tvm.ir.IRModule.from_expr(func) + intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(x, np.array(dyn_shape).astype(shape_type)) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + + +@tvm.testing.uses_gpu def test_dyn_one_hot(): def _get_oshape(indices_shape, depth, axis): oshape = [] diff --git a/tests/python/relay/dyn/test_dynamic_op_level2.py b/tests/python/relay/dyn/test_dynamic_op_level2.py index 37cc124d33f7..5ef975f97d2c 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level2.py +++ b/tests/python/relay/dyn/test_dynamic_op_level2.py @@ -27,8 +27,8 @@ import tvm.topi.testing from tvm.relay.testing import run_infer_type -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu + +@tvm.testing.uses_gpu def test_dyn_upsampling_run(): def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=False): @@ -72,8 +72,7 @@ def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=Fa # tests upsampling type inference with scale_h passed in as a constant and scale_w as a variable -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dyn_upsampling_infer_type_const(): n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") @@ -85,8 +84,7 @@ def test_dyn_upsampling_infer_type_const(): assert zz.checked_type == relay.TensorType((n, c, relay.Any(), relay.Any()), "int8") -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dyn_upsampling3d_run(): def verify_upsampling3d( dshape, scale_d, scale_h, scale_w, layout, method, coord_trans="half_pixel" @@ -167,8 +165,7 @@ def test_dyn_upsampling3d_infer_type_const(): ) -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dyn_pad(): def verify_pad(dshape, pad_width, pad_val, dtype): x = relay.var("x", relay.TensorType(dshape, dtype)) diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py index 301e72267bd1..e6e866342639 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level3.py +++ b/tests/python/relay/dyn/test_dynamic_op_level3.py @@ -37,8 +37,7 @@ def verify_func(func, data, ref_res): relay.backend.compile_engine.get().clear() -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dyn_reshape(): def verify_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -68,8 +67,7 @@ def verify_reshape(shape, newshape, oshape): verify_reshape((2, 3, 4), (0, -3), (2, 12)) -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dyn_shape_reshape(): def verify_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -87,8 +85,7 @@ def verify_reshape(shape, newshape, oshape): verify_reshape((4, 7), (2, 7, 2), (2, 7, 2)) -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dyn_tile(): def verify_tile(dshape, reps): x = relay.var("x", relay.TensorType(dshape, "float32")) @@ -106,8 +103,7 @@ def verify_tile(dshape, reps): verify_tile((2, 3), (3, 2, 1)) -# TODO(mbrookhart): Enable when VM 
supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dyn_zeros_ones(): def verify_zeros_ones(shape, dtype): for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]: @@ -125,8 +121,7 @@ def verify_zeros_ones(shape, dtype): verify_zeros_ones((8, 9, 1, 2), "float32") -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dyn_full(): def verify_full(fill_value, src_shape, dtype): x = relay.var("x", relay.scalar_type(dtype)) diff --git a/tests/python/relay/dyn/test_dynamic_op_level4.py b/tests/python/relay/dyn/test_dynamic_op_level4.py index b8b2486df376..3d7a99a28e33 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level4.py +++ b/tests/python/relay/dyn/test_dynamic_op_level4.py @@ -23,8 +23,7 @@ import tvm.topi.testing -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dynamic_strided_slice(): def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, dtype="int32"): x = relay.var("x", relay.TensorType(dshape, "float32")) @@ -64,7 +63,6 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, op_res = intrp.evaluate()(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) - verify((1, 3, 10, 10), [0, 0, 0, 0], [-1, 3, 10, 10], [1], (0, 3, 10, 10), dtype="int64") verify( (1, 224, 224, 3), [0, 20, 20, 0], diff --git a/tests/python/relay/dyn/test_dynamic_op_level5.py b/tests/python/relay/dyn/test_dynamic_op_level5.py index de199dd62deb..9273b019ec96 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level5.py +++ b/tests/python/relay/dyn/test_dynamic_op_level5.py @@ -36,8 +36,7 @@ def test_resize_infer_type(): assert zz.checked_type == relay.TensorType((n, c, relay.Any(), relay.Any()), "int8") -# TODO(mbrookhart): Enable when VM supports heterogenus execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_resize(): def verify_resize(dshape, scale, method, layout): if layout == "NHWC": diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py index bab8b9cf3078..aeed8db7c1b6 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level6.py +++ b/tests/python/relay/dyn/test_dynamic_op_level6.py @@ -22,7 +22,7 @@ from tvm import relay import tvm.testing -# TODO(mbrookhart): Enable when VM supports heterogenus execution +# TODO(mbrookhart): Enable when we can get it working # @tvm.testing.uses_gpu def test_dynamic_topk(): def verify_topk(k, axis, ret_type, is_ascend, dtype): diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index ef363430a2eb..114783e55f20 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -436,8 +436,7 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 1], (2, 2, 3), slice_mode="size", test_ref=True) -# TODO(mbrookhart): enable once vm supports heterogenous execution -# @tvm.testing.uses_gpu +@tvm.testing.uses_gpu def test_dyn_strided_slice(): def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, dtype="int32"): ndim = len(dshape) @@ -468,7 +467,6 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, op_res = intrp.evaluate()(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) - verify((1, 3, 10, 10), [0, 0, 0, 0], [-1, 3, 10, 10], [1], (0, 3, 10, 10), 
dtype="int64") verify( (1, 224, 224, 3), [0, 20, 20, 0], diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index cdf0b8319087..30434f6fd266 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ b/tests/python/topi/python/test_topi_transform.py @@ -423,6 +423,38 @@ def check_device(device): check_device(device) +def verify_dynamic_strided_slice(in_shape, begin, end, strides=None): + A = te.placeholder(shape=in_shape, name="A") + Begin = te.placeholder(shape=[len(in_shape)], name="begin", dtype="int64") + End = te.placeholder(shape=[len(in_shape)], name="end", dtype="int64") + Strides = te.placeholder(shape=[len(in_shape)], name="strides", dtype="int64") + strides = [1, 1, 1] if strides is None else strides + B = topi.strided_slice(A, Begin, End, Strides) + 1 + + def check_device(device): + ctx = tvm.context(device, 0) + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.Target(device): + s = tvm.topi.testing.get_injective_schedule(device)(B) + + foo = tvm.build(s, [A, Begin, End, Strides, B], device, name="stride_slice") + x_np = np.random.uniform(size=in_shape).astype(A.dtype) + out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides) + 1 + data_nd = tvm.nd.array(x_np, ctx) + out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype) + begin_nd = tvm.nd.array(np.array(begin).astype("int64"), ctx) + end_nd = tvm.nd.array(np.array(end).astype("int64"), ctx) + strides_nd = tvm.nd.array(np.array(strides).astype("int64"), ctx) + foo(data_nd, begin_nd, end_nd, strides_nd, out_nd) + tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]: + check_device(device) + + def verify_strided_set(in_shape, v_shape, begin, end, strides=None): A = te.placeholder(shape=in_shape, name="A") V = te.placeholder(shape=v_shape, name="V") @@ -787,6 +819,15 @@ def test_strided_slice(): verify_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3]) +@tvm.testing.uses_gpu +def test_dynamic_strided_slice(): + verify_dynamic_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2]) + verify_dynamic_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1]) + verify_dynamic_strided_slice((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2]) + verify_dynamic_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3]) + verify_dynamic_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3]) + + @tvm.testing.uses_gpu def test_strided_set(): verify_strided_set((3, 4, 3), (3, 2, 2), [0, 3, 0], [4, 1, 4], [1, -1, 2]) From 230dcd209e9a9e55fee331b41810123f3c5021d9 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 12 Nov 2020 04:59:03 -0800 Subject: [PATCH 165/258] [AutoScheduler] Add winograd support in tuning networks (#6877) * add winograd in auto-scheduler * trigger CI * address comments * fix tests * fix test --- python/tvm/relay/op/strategy/cuda.py | 163 +++++++++----- python/tvm/topi/cuda/conv2d_alter_op.py | 64 ++++-- python/tvm/topi/cuda/conv2d_winograd.py | 14 ++ python/tvm/topi/nn/conv2d.py | 211 ++++++++++++++++++ .../search_policy/sketch_policy.cc | 2 +- .../search_policy/sketch_policy_rules.cc | 37 ++- .../test_auto_scheduler_task_extraction.py | 23 ++ .../relay/test_auto_scheduler_tuning.py | 24 +- .../python/test_topi_conv2d_nhwc_winograd.py | 2 +- .../topi/python/test_topi_conv2d_winograd.py | 78 ++++++- .../test_auto_scheduler_sketch_generation.py | 11 +- 11 files changed, 520 
insertions(+), 109 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 1229a71569d0..f4ce61b8fa39 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -132,13 +132,9 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): ) _, _, kh, kw = get_const_tuple(kernel.shape) if ( - 2 < kh < 8 - and 2 < kw < 8 - and kh == kw - and stride_h == 1 - and stride_w == 1 - and dilation_h == 1 - and dilation_w == 1 + (2 < kh < 8 and 2 < kw < 8 and kh == kw) + and (stride_h == 1 and stride_w == 1) + and (dilation_h == 1 and dilation_w == 1) ): strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), @@ -165,14 +161,14 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): name="conv2d_nhwc.cuda", ) - strategy.add_auto_scheduler( - wrap_compute_conv2d(topi.nn.conv2d_nhwc), name="conv2d_nhwc" - ) - N, H, W, _ = get_const_tuple(data.shape) KH, KW, CI, CO = get_const_tuple(kernel.shape) # Winograd shape related judgment - judge_winograd_tensorcore, judge_winograd_shape = winograd_judge( + ( + judge_winograd_tensorcore, + judge_winograd_autotvm, + judge_winograd_auto_scheduler, + ) = judge_winograd( N, H, W, @@ -185,9 +181,11 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): stride_w, dilation_h, dilation_w, + data.dtype, + kernel.dtype, pre_flag=False, ) - if judge_winograd_shape: + if judge_winograd_autotvm: if ( target.kind.name == "cuda" and nvcc.have_tensorcore(tvm.gpu(0).compute_version) @@ -206,19 +204,32 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): name="conv2d_nhwc_winograd_direct.cuda", plevel=5, ) - if target.kind.name == "cuda": - if nvcc.have_tensorcore(tvm.gpu(0).compute_version): - if ( - (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) - or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) - or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0) - ): - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore), - wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore), - name="conv2d_nhwc_tensorcore.cuda", - plevel=20, - ) + if ( + target.kind.name == "cuda" + and nvcc.have_tensorcore(tvm.gpu(0).compute_version) + and ( + (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) + or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) + or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0) + ) + ): + strategy.add_implementation( + wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore), + wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore), + name="conv2d_nhwc_tensorcore.cuda", + plevel=20, + ) + + # register auto-scheduler implementations + if judge_winograd_auto_scheduler: + strategy.add_auto_scheduler( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), name="conv2d_nhwc.winograd" + ) + else: + strategy.add_auto_scheduler( + wrap_compute_conv2d(topi.nn.conv2d_nhwc), name="conv2d_nhwc" + ) + elif layout == "HWNC": assert kernel_layout in ["HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"] _, _, N, in_channels = get_const_tuple(data.shape) @@ -329,6 +340,63 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): return strategy +def judge_winograd( + N, + H, + W, + KH, + KW, + CI, + CO, + padding, + stride_h, + stride_w, + dilation_h, + dilation_w, + data_dtype, + kernel_dtype, + pre_flag, +): + """Winograd judgement about tensorcore and shape""" + if H % 8 == 0: + tile_size = 4 + else: + tile_size = 2 + if pre_flag: + alpha = KH + KH = KW = alpha + 1 - tile_size + pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, 
(KH, KW)) + OH = (H + pt + pb - KH) // stride_h + 1 + OW = (W + pl + pr - KW) // stride_w + 1 + nH, nW = (OH + tile_size - 1) // tile_size, (OW + tile_size - 1) // tile_size + P = N * nH * nW + + judge_winograd_tensorcore = ( + (P % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) + or (P % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) + or (P % 32 == 0 and CI % 16 == 0 and CO % 8 == 0) + ) + + judge_winograd_autotvm = ( + 2 < KH < 8 + and 2 < KW < 8 + and KH == KW + and stride_h == 1 + and stride_w == 1 + and dilation_h == 1 + and dilation_w == 1 + ) + + judge_winograd_auto_scheduler = ( + ("float" in data_dtype and "float" in kernel_dtype) + and (KH == 3 and KW == 3) + and (stride_h == 1 and stride_w == 1) + and (dilation_h == 1 and dilation_w == 1) + ) + + return judge_winograd_tensorcore, judge_winograd_autotvm, judge_winograd_auto_scheduler + + @conv2d_winograd_without_weight_transfrom_strategy.register(["cuda", "gpu"]) def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target): """conv2d_winograd_without_weight_transfrom cuda strategy""" @@ -351,7 +419,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty N, H, W, _ = get_const_tuple(data.shape) alpha, _, CI, CO = get_const_tuple(kernel.shape) dilation_h, dilation_w = dilation - judge_winograd_tensorcore, _ = winograd_judge( + judge_winograd_tensorcore, _, _ = judge_winograd( N, H, W, @@ -364,6 +432,8 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty stride_w, dilation_h, dilation_w, + data.dtype, + kernel.dtype, pre_flag=True, ) if ( @@ -388,6 +458,12 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty ), name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda", ) + + # register auto-scheduler implementations + strategy.add_auto_scheduler( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), + name="conv2d_nhwc_winograd_without_weight_transform", + ) else: raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) @@ -835,39 +911,6 @@ def proposal_strategy_cuda(attrs, inputs, out_type, target): return strategy -def winograd_judge( - N, H, W, KH, KW, CI, CO, padding, stride_h, stride_w, dilation_h, dilation_w, pre_flag -): - """Winograd judgement about tensorcore and shape""" - if H % 8 == 0: - tile_size = 4 - else: - tile_size = 2 - if pre_flag: - alpha = KH - KH = KW = alpha + 1 - tile_size - pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (KH, KW)) - OH = (H + pt + pb - KH) // stride_h + 1 - OW = (W + pl + pr - KW) // stride_w + 1 - nH, nW = (OH + tile_size - 1) // tile_size, (OW + tile_size - 1) // tile_size - P = N * nH * nW - judge_winograd_tensorcore = ( - (P % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) - or (P % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) - or (P % 32 == 0 and CI % 16 == 0 and CO % 8 == 0) - ) - judge_winograd_shape = ( - 2 < KH < 8 - and 2 < KW < 8 - and KH == KW - and stride_h == 1 - and stride_w == 1 - and dilation_h == 1 - and dilation_w == 1 - ) - return judge_winograd_tensorcore, judge_winograd_shape - - @correlation_strategy.register(["cuda", "gpu"]) def correlation_strategy_cuda(attrs, inputs, out_type, target): """correlation cuda strategy""" diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py index 609ead3e6398..3a58d40cb847 100644 --- a/python/tvm/topi/cuda/conv2d_alter_op.py +++ b/python/tvm/topi/cuda/conv2d_alter_op.py @@ -19,9 +19,7 @@ import logging import tvm 
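The tile arithmetic in judge_winograd above is what gates the tensor-core path. Worked through for the 1x7x7x64 winograd-test network this patch adds to the task-extraction tests: H = 7 is not a multiple of 8, so tile_size = 2; with a 3x3 kernel, stride 1, and padding 1, OH = OW = 7 and nH = nW = 4, giving P = 1 * 4 * 4 = 16, so the first tensor-core clause passes together with CI = CO = 64. A sketch of the same computation:

    def winograd_tile_count(N, H, W, KH, KW, stride, pad, tile_size):
        # Mirrors the OH/OW/nH/nW/P computation in judge_winograd.
        OH = (H + 2 * pad - KH) // stride + 1
        OW = (W + 2 * pad - KW) // stride + 1
        nH = (OH + tile_size - 1) // tile_size
        nW = (OW + tile_size - 1) // tile_size
        return N * nH * nW

    assert winograd_tile_count(1, 7, 7, 3, 3, 1, 1, 2) == 16
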
-from tvm import te -from tvm import relay -from tvm import autotvm +from tvm import te, relay, autotvm, auto_scheduler from .. import nn from ..utils import get_const_tuple @@ -36,31 +34,58 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implementation( + new_attrs = {k: attrs[k] for k in attrs.keys()} + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype + + impl, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) if workload is None: - # The best implementation is not an AutoTVM template, - # we then assume it's not necessary to alter this op. + # The best implementation is not an AutoTVM template. + # It may be from the auto-scheduler + + if impl.name == ( + "conv2d_nhwc.winograd" + auto_scheduler.relay_integration.auto_schedule_impl_suffix + ): + if dilation != (1, 1): + logger.warning("Does not support weight pre-transform for dilated convolution.") + return None + + assert data_layout == "NHWC" and kernel_layout == "HWIO" + N, H, W, CI = get_const_tuple(data.shape) + KH, KW, _, CO = get_const_tuple(kernel.shape) + + # Pre-compute weight transformation in winograd + tile_size = _infer_tile_size(tinfos[0], tinfos[1]) + + # HWIO -> OIHW + kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1]) + # alpha, alpha, CO, CI + weight = relay.nn.contrib_conv2d_winograd_weight_transform( + kernel_transform, tile_size=tile_size + ) + new_attrs["tile_size"] = tile_size + new_attrs["channels"] = CO + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) + return None + cfg = dispatch_ctx.query(target, workload) if cfg.is_fallback: # if is fallback, clear query cache and return None autotvm.task.clear_fallback_cache(target, workload) return None topi_tmpl = workload[0] - new_attrs = {k: attrs[k] for k in attrs.keys()} - - strides = attrs.get_int_tuple("strides") - padding = attrs.get_int_tuple("padding") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - data_layout = attrs["data_layout"] - kernel_layout = attrs["kernel_layout"] - data, kernel = tinfos - out_dtype = out_type.dtype - if topi_tmpl == "conv2d_NCHWc_int8.cuda": assert data_layout == "NCHW" and kernel_layout == "OIHW" N, CI, H, W = get_const_tuple(data.shape) @@ -136,10 +161,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): KH, KW, _, CO = get_const_tuple(kernel.shape) # Pre-compute weight transformation in winograd - if H % 8 == 0: - tile_size = 4 - else: - tile_size = 2 + tile_size = _infer_tile_size(data, kernel) kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1]) weight = relay.nn.contrib_conv2d_winograd_weight_transform( kernel_transform, tile_size=tile_size diff --git a/python/tvm/topi/cuda/conv2d_winograd.py b/python/tvm/topi/cuda/conv2d_winograd.py index 407f05e64912..2d8d6de33828 100644 --- a/python/tvm/topi/cuda/conv2d_winograd.py +++ b/python/tvm/topi/cuda/conv2d_winograd.py @@ -25,6 +25,7 @@ from .. 
import nn from ..utils import get_const_int, get_const_tuple, traverse_inline from ..nn.winograd_util import winograd_transform_matrices +from ..nn.conv2d import conv2d_winograd_nhwc, _conv2d_winograd_nhwc_impl logger = logging.getLogger("conv2d_winograd") @@ -354,3 +355,16 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s + + +@conv2d_winograd_nhwc.register(["cuda", "gpu"]) +def conv2d_winograd_nhwc_cuda( + data, weight, strides, padding, dilation, out_dtype, pre_computed=False +): + """Conv2D Winograd in NHWC layout. + This is a clean version to be used by the auto-scheduler for both CPU and GPU. + """ + tile_size = _infer_tile_size(data, weight) + return _conv2d_winograd_nhwc_impl( + data, weight, strides, padding, dilation, out_dtype, tile_size, pre_computed + ) diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index cd10c757e956..7c9cef613439 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -944,3 +944,214 @@ def unpack_NCHWc_to_nchw(packed_out, out_dtype): tag=tag.INJECTIVE + ",unpack_nchwc", ) return unpacked_out + + +def _conv2d_winograd_nhwc_impl( + data, + weight, + strides, + padding, + dilation, + out_dtype, + tile_size, + pre_computed=False, +): + """Conv2D Winograd implementation in NHWC layout. + This is a clean version to be used by the auto-scheduler for both CPU and GPU. + + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, in_height, in_width, in_channel] + weight : tvm.Tensor + 4-D with shape [filter_height, filter_width, in_channel, num_filter] + strides : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + dilation: int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + out_dtype : str, optional + Specifies the output data type. 
+ tile_size : int + The size of the tile to use for the Winograd filter + pre_computed: bool + Whether the kernel is precomputed + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_height, out_width, out_channel] + """ + N, H, W, CI = get_const_tuple(data.shape) + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" + if not pre_computed: + KH, KW, CI, CO = get_const_tuple(weight.shape) + else: + H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape) + KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 + + pad_t, pad_l, pad_b, pad_r = get_pad_tuple(padding, (KH, KW)) + HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides + assert HSTR == 1 and WSTR == 1 and KH == 3 and KW == 3 + + r = KW + m = tile_size + alpha = m + r - 1 + A, B, G = winograd_transform_matrices(m, r, out_dtype) + + H = (H + pad_t + pad_b - KH) // HSTR + 1 + W = (W + pad_l + pad_r - KW) // WSTR + 1 + nH, nW = (H + m - 1) // m, (W + m - 1) // m + P = N * nH * nW + + pad_extra = (nW - 1) * m + alpha - (H + pad_t + pad_b) + data_pad = pad( + data, (0, pad_t, pad_l, 0), (0, pad_b + pad_extra, pad_r + pad_extra, 0), name="data_pad" + ) + + if not pre_computed: + r_kh = te.reduce_axis((0, KH), name="r_kh") + r_kw = te.reduce_axis((0, KW), name="r_kw") + kernel_pack = te.compute( + (alpha, alpha, CO, CI), + lambda eps, nu, co, ci: te.sum( + weight[r_kh][r_kw][ci][co] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw] + ), + name="kernel_pack", + ) + else: + kernel_pack = weight + + # pack data tile + input_tile = te.compute( + (alpha, alpha, P, CI), + lambda eps, nu, p, ci: data_pad[p // (nH * nW)][((p // nW) % nH) * m + eps][ + (p % nW) * m + nu + ][ci], + name="input_tile", + ) + + # transform data + r_a = te.reduce_axis((0, alpha), "r_a") + r_b = te.reduce_axis((0, alpha), "r_b") + data_pack = te.compute( + (alpha, alpha, P, CI), + lambda eps, nu, p, ci: te.sum( + input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b] + ), + name="data_pack", + attrs={"auto_scheduler_simplify_const_tensor_indices": ["eps", "nu", "r_a", "r_b"]}, + # the attrs are necessary hints for the auto-scheduler + ) + + # do batch gemm + ci = te.reduce_axis((0, CI), name="ci") + bgemm = te.compute( + (alpha, alpha, P, CO), + lambda eps, nu, p, co: te.sum( + data_pack[eps][nu][p][ci] * kernel_pack[eps][nu][co][ci], axis=[ci] + ), + name="bgemm", + attrs={"layout_free_placeholders": [kernel_pack]}, + ) + + # inverse transform + r_a = te.reduce_axis((0, alpha), "r_a") + r_b = te.reduce_axis((0, alpha), "r_b") + inverse = te.compute( + (m, m, P, CO), + lambda vh, vw, p, co: te.sum( + bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b] + ), + name="inverse", + attrs={"auto_scheduler_simplify_const_tensor_indices": ["vh", "vw", "r_a", "r_b"]}, + # the attrs are necessary hints for the auto-scheduler + ) + + # output + output = te.compute( + (N, H, W, CO), + lambda n, h, w, co: inverse[h % m, w % m, n * nH * nW + (h // m) * nW + (w // m), co], + name="conv2d_winograd", + ) + + return output + + +@tvm.target.generic_func +def conv2d_winograd_nhwc(data, weight, strides, padding, dilation, out_dtype, pre_computed=False): + """Conv2D Winograd in NHWC layout. + This is a clean version to be used by the auto-scheduler for both CPU and GPU. 
+ + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, in_height, in_width, in_channel] + weight : tvm.Tensor + 4-D with shape [filter_height, filter_width, in_channel, num_filter] + strides : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + dilation: int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + out_dtype : str, optional + Specifies the output data type. + pre_computed: bool + Whether the kernel is precomputed + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_height, out_width, out_channel] + """ + tile_size = 4 + + return _conv2d_winograd_nhwc_impl( + data, + weight, + strides, + padding, + dilation, + out_dtype, + tile_size, + pre_computed, + ) + + +def conv2d_winograd_nhwc_without_weight_transform( + data, weight, strides, padding, dilation, out_dtype +): + """Conv2D Winograd without layout transform in NHWC layout. + This is a clean version to be used by the auto-scheduler for both CPU and GPU. + + Parameters + ---------- + data : tvm.Tensor + 4-D with shape [batch, in_height, in_width, in_channel] + weight : tvm.Tensor + 4-D with shape [filter_height, filter_width, in_channel, num_filter] + strides : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + dilation: int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + out_dtype : str, optional + Specifies the output data type. + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_height, out_width, out_channel] + """ + + return conv2d_winograd_nhwc( + data, weight, strides, padding, dilation, out_dtype, pre_computed=True + ) diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index 6360c72e3e9e..b64776ff342a 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -116,8 +116,8 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, } else if (IsGPUTask(node->search_task)) { // Sketch Generation Rules node->sketch_rules.push_back(&rule_add_cache_read_stage); - node->sketch_rules.push_back(&rule_always_inline); node->sketch_rules.push_back(&rule_special_compute_location_gpu); + node->sketch_rules.push_back(&rule_always_inline); node->sketch_rules.push_back(&rule_simplify_compute_with_const_tensor); node->sketch_rules.push_back(&rule_cross_thread_reduction); node->sketch_rules.push_back(&rule_add_cache_write_stage); diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index 1c69397833df..814e72a9478c 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -54,21 +54,32 @@ std::vector> RuleSkipStage::Apply(const SketchPolicyNode& } /********** RuleAlwaysInline **********/ +inline bool ShouldAlwaysBeInlined(const SketchPolicyNode& policy, const State& state, + int stage_id) { + const SearchTask& task = policy.search_task; + const Stage& stage = state->stages[stage_id]; + + // Check the inline limitation of TE + if (stage->op_type == StageKind::kPlaceholder || IsOutputOp(task, state, stage_id) || + HasReduceIter(stage)) { + return false; + } + + if (IsGPUTask(task)) { // Greedily 
inline all inlinable ops on gpu + return true; + } else { + // Only always-inline strict-inlinable ops on cpu. + // The computation location of other ops will be tuned by InitChangeComputeLocation + // and MutateComputeLocation. + return IsStrictlyInlineable(task, state, stage_id); + } +} SketchGenerationRule::ConditionKind RuleAlwaysInline::MeetCondition(const SketchPolicyNode& policy, const State& state, int stage_id) const { - const Stage& stage = state->stages[stage_id]; - // Check the inline limitation of TE first - if (stage->op_type == StageKind::kPlaceholder || - IsOutputOp(policy.search_task, state, stage_id) || HasReduceIter(stage)) { - return ConditionKind::kSkip; - } - - // Always do compute inline if it's strictly inlineable or is in GPU policy - return IsStrictlyInlineable(policy.search_task, state, stage_id) || IsGPUTask(policy.search_task) - ? ConditionKind::kApplyAndSkipRest - : ConditionKind::kSkip; + return ShouldAlwaysBeInlined(policy, state, stage_id) ? ConditionKind::kApplyAndSkipRest + : ConditionKind::kSkip; } std::vector<std::pair<State, int>> RuleAlwaysInline::Apply(const SketchPolicyNode& policy, @@ -417,6 +428,10 @@ SketchGenerationRule::ConditionKind RuleSpecialComputeLocationGPU::MeetCondition return ConditionKind::kSkip; } + if (!ShouldAlwaysBeInlined(policy, state, stage_id)) { + return ConditionKind::kSkip; + } + const std::set<int>& consumers = GetConsumers(policy.search_task, state, stage_id); if (consumers.size() == 1 && state->stages[*consumers.begin()]->op->attrs.count( SearchPolicyKey::simplify_const_tensor_indices)) { diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py b/tests/python/relay/test_auto_scheduler_task_extraction.py index 9f6ddb652469..4ca2ddb3cf10 100644 --- a/tests/python/relay/test_auto_scheduler_task_extraction.py +++ b/tests/python/relay/test_auto_scheduler_task_extraction.py @@ -43,6 +43,29 @@ def get_network(name, batch_size=1, layout="NHWC"): mod, params = relay.testing.resnet.get_workload( num_layers=50, batch_size=batch_size, layout=layout, image_shape=image_shape ) + elif name == "winograd-test": + input_shape = [1, 7, 7, 64] + output_shape = input_shape + + data = relay.var("data", shape=input_shape, dtype="float32") + net = relay.testing.layers.conv2d( + data=data, + channels=64, + kernel_size=3, + strides=1, + padding=1, + data_layout="NHWC", + kernel_layout="HWIO", + name="", + ) + bias = relay.var("conv1_bias") + net = relay.nn.bias_add(net, bias, 3) + net = relay.nn.relu(net) + mod, params = relay.testing.create_workload(net) + elif name == "resnet3d-18": + mod, params = relay.testing.resnet_3d.get_workload( + num_layers=18, batch_size=batch_size, layout=layout, image_shape=image_shape + ) elif name == "mobilenet": mod, params = relay.testing.mobilenet.get_workload( batch_size=batch_size, layout=layout, image_shape=image_shape diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py index b8f5145de4aa..ad882b3eaf24 100644 --- a/tests/python/relay/test_auto_scheduler_tuning.py +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -23,13 +23,12 @@ from test_auto_scheduler_task_extraction import get_network -@tvm.testing.requires_cuda -def test_tuning_cuda(): +def tune_network(network, target): auto_scheduler.enable_relay_integration() # Extract tasks - mod, params = get_network("mlp") - target = tvm.target.Target("cuda") + mod, params = get_network(network) + target = tvm.target.Target(target) tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
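The `objective` constructed just below is the weighted-sum objective that this patch series later makes the default in `TaskScheduler`: each task's latency is scaled by its weight, which typically counts how often the task's subgraph occurs in the network. A minimal, self-contained sketch of the arithmetic (all weights and latencies below are made up for illustration):

```python
# Hypothetical numbers, only to illustrate the weighted objective.
task_weights = [3, 1, 2]           # e.g. how many times each subgraph occurs
latencies = [0.004, 0.010, 0.002]  # per-task latencies in seconds

objective = lambda costs: sum(c * w for c, w in zip(costs, task_weights))

# A task that occurs three times contributes three times its latency:
# 0.004*3 + 0.010*1 + 0.002*2 = 0.026
print(objective(latencies))
```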
objective = lambda costs: sum(c * w for c, w in zip(costs, task_weights)) @@ -37,12 +36,13 @@ log_file = fp.name # Tuning - measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=100) + measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60) tuner = auto_scheduler.TaskScheduler(tasks, objective) tune_option = auto_scheduler.TuningOptions( - num_measure_trials=2, - num_measures_per_round=1, + num_measure_trials=4, + num_measures_per_round=2, runner=measure_ctx.runner, + builder=auto_scheduler.LocalBuilder(timeout=60), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) tuner.tune(tune_option, search_policy="sketch.random") @@ -53,10 +53,18 @@ with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) - # Todo(merrymercy): compile without any history to test the fallback mechanism + # Todo(merrymercy): when the cpu backend is upstreamed, do the following things: + # 1. compile without history to test the fallback mechanism + # 2. check the correctness of layout rewrite / winograd pre-transform auto_scheduler.enable_relay_integration(False) +@tvm.testing.requires_cuda +def test_tuning_cuda(): + tune_network("mlp", "cuda") + tune_network("winograd-test", "cuda") + + if __name__ == "__main__": test_tuning_cuda() diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py index ac29fd6d0cff..436270173316 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py @@ -74,7 +74,7 @@ def verify_conv2d_nhwc( bias_shape = get_const_tuple(bias.shape) dtype = A.dtype - @memoize("topi.tests.test_topi_conv2d_nhwc.verify_conv2d_nhwc") + @memoize("topi.tests.test_topi_conv2d_nhwc_winograd.verify_conv2d_nhwc") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) diff --git a/tests/python/topi/python/test_topi_conv2d_winograd.py b/tests/python/topi/python/test_topi_conv2d_winograd.py index 9f07dbd1be46..34febfd9460a 100644 --- a/tests/python/topi/python/test_topi_conv2d_winograd.py +++ b/tests/python/topi/python/test_topi_conv2d_winograd.py @@ -67,7 +67,7 @@ def verify_conv2d_nchw( bias_shape = get_const_tuple(bias.shape) dtype = A.dtype - @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") + @memoize("topi.tests.test_topi_conv2d_winograd.verify_conv2d_nhwc") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) @@ -140,7 +140,6 @@ def test_conv2d_nchw(): verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1) verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1) verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) - verify_conv2d_nchw(1, 48, 35, 64, 5, 1, 2, devices=["cuda"]) # batch size = 2 verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1) @@ -154,6 +153,7 @@ verify_conv2d_nchw(1, 1, 1, 1, 3, 1, 1) verify_conv2d_nchw(3, 3, 3, 3, 3, 1, 1) verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1) + verify_conv2d_nchw(1, 48, 35, 64, 5, 1, 2, devices=["cuda"]) # Asymmetric padding verify_conv2d_nchw(1, 48, 56, 48, 3, 1, (1, 1, 1, 1)) @@ -170,5 +170,79 @@ verify_conv2d_nchw(1, 48, 35, 48, 5, 1, "VALID", devices=["cuda"]) +def verify_conv2d_nhwc( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation=1, +): + # This version is intended to be used by the auto-scheduler, + # so we only test the correctness of compute declaration + # with the default naive schedule on CPU + + A = te.placeholder((batch, in_size, in_size, in_channel), name="A") + W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W") + bias = te.placeholder((1, 1, 1, num_filter), name="bias") + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_conv2d_winograd.verify_conv2d_nhwc") + def get_ref_data(): + a_np = np.random.uniform(size=a_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) + c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding) + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + target = "llvm" + ctx = tvm.context(target) + + C = topi.nn.conv2d_winograd_nhwc(A, W, stride, padding, dilation, dtype) + s = te.create_schedule([C.op]) + + a = tvm.nd.array(a_np, ctx=ctx) + w = tvm.nd.array(w_np, ctx=ctx) + b = tvm.nd.array(b_np, ctx=ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx=ctx) + func = tvm.build(s, [A, W, C], target=target) + func(a, w, c) + + rtol = 1e-3 + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol) + + +def test_conv2d_nhwc(): + # This version is intended to be used by the auto-scheduler, + # so we only test the correctness of compute declaration + # with the default naive schedule on CPU + + # resnet 18 workloads + verify_conv2d_nhwc(1, 64, 56, 64, 3, 1, 1) + verify_conv2d_nhwc(1, 128, 28, 128, 3, 1, 1) + verify_conv2d_nhwc(1, 256, 14, 256, 3, 1, 1) + verify_conv2d_nhwc(1, 512, 7, 512, 3, 1, 1) + + # more shapes + verify_conv2d_nhwc(2, 64, 56, 64, 3, 1, 1) + verify_conv2d_nhwc(1, 1, 1, 1, 3, 1, 1) + verify_conv2d_nhwc(3, 3, 3, 3, 3, 1, 1) + verify_conv2d_nhwc(2, 13, 71, 59, 3, 1, 1) + + # Asymmetric padding + verify_conv2d_nhwc(1, 3, 7, 3, 3, 1, "SAME") + verify_conv2d_nhwc(1, 48, 35, 48, 3, 1, "VALID") + + if __name__ == "__main__": test_conv2d_nchw() + test_conv2d_nhwc() diff --git a/tests/python/unittest/test_auto_scheduler_sketch_generation.py b/tests/python/unittest/test_auto_scheduler_sketch_generation.py index 1a47da00eda3..1fef31550e67 100644 --- a/tests/python/unittest/test_auto_scheduler_sketch_generation.py +++ b/tests/python/unittest/test_auto_scheduler_sketch_generation.py @@ -373,18 +373,19 @@ def test_cuda_conv2d_winograd_sketch(): """ 1 multi-level tiling sketch """ assert len(sketches) == 1 assert_compute_at_condition(sketches[0].stages[1], "inlined") - assert_compute_at_condition(sketches[0].stages[2], "inlined") + assert_compute_at_condition(sketches[0].stages[2], "iter") assert_compute_at_condition(sketches[0].stages[3], "inlined") assert_is_tiled(sketches[0].stages[4]) assert_has_cache_read(sketches[0], 4) assert_compute_at_condition(sketches[0].stages[5], "iter") assert_has_cache_read(sketches[0], 6) assert_compute_at_condition(sketches[0].stages[7], "iter") - assert_is_not_tiled(sketches[0].stages[8]) + assert_is_tiled(sketches[0].stages[8]) assert_compute_at_condition(sketches[0].stages[8], "iter") - assert_compute_at_condition(sketches[0].stages[9], "inlined") - assert_is_tiled(sketches[0].stages[10]) - assert_is_not_tiled(sketches[0].stages[11]) + assert_has_cache_write(sketches[0], 8) + assert_compute_at_condition(sketches[0].stages[9], "root") +
assert_is_tiled(sketches[0].stages[11]) + assert_is_not_tiled(sketches[0].stages[12]) if __name__ == "__main__": From 9674d495810a2803068d60e80423ffa4e41d67c3 Mon Sep 17 00:00:00 2001 From: Alex Gladkov Date: Thu, 12 Nov 2020 15:42:08 -0800 Subject: [PATCH 166/258] Bump up tophup cuda version (#6908) --- python/tvm/autotvm/tophub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index c17c611f5499..7e72fe08fe32 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -48,7 +48,7 @@ PACKAGE_VERSION = { "arm_cpu": "v0.08", "llvm": "v0.04", - "cuda": "v0.09", + "cuda": "v0.10", "rocm": "v0.05", "opencl": "v0.04", "mali": "v0.06", From e09edaa218fb6ac771e5a424374ab8957f385155 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Thu, 12 Nov 2020 15:58:40 -0800 Subject: [PATCH 167/258] [TFLite runtime] Allow to set number of threads to TFLite interpreter (#6901) * Support for setting thread count in TFLite runtime, Co-authored-by: FrozenGene * fix lint Co-authored-by: FrozenGene --- python/tvm/contrib/tflite_runtime.py | 10 ++++++++++ src/runtime/contrib/tflite/tflite_runtime.cc | 8 ++++++++ src/runtime/contrib/tflite/tflite_runtime.h | 5 +++++ 3 files changed, 23 insertions(+) diff --git a/python/tvm/contrib/tflite_runtime.py b/python/tvm/contrib/tflite_runtime.py index 92501f950c56..3b0e268e2a44 100644 --- a/python/tvm/contrib/tflite_runtime.py +++ b/python/tvm/contrib/tflite_runtime.py @@ -73,6 +73,7 @@ def __init__(self, module): self._set_input = module["set_input"] self._invoke = module["invoke"] self._get_output = module["get_output"] + self._set_num_threads = module["set_num_threads"] def set_input(self, index, value): """Set inputs to the module via kwargs @@ -109,3 +110,12 @@ def get_output(self, index): The output index """ return self._get_output(index) + + def set_num_threads(self, num_threads): + """Set the number of threads via kwargs + Parameters + ---------- + num_threads : int + The number of threads + """ + self._set_num_threads(num_threads) diff --git a/src/runtime/contrib/tflite/tflite_runtime.cc b/src/runtime/contrib/tflite/tflite_runtime.cc index f56e62ec1a40..9a434fde2955 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.cc +++ b/src/runtime/contrib/tflite/tflite_runtime.cc @@ -128,6 +128,8 @@ void TFLiteRuntime::SetInput(int index, DLTensor* data_in) { }); } +void TFLiteRuntime::SetNumThreads(int num_threads) { interpreter_->SetNumThreads(num_threads); } + NDArray TFLiteRuntime::GetOutput(int index) const { TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[index]); DataType dtype = TfLiteDType2TVMDType(output->type); @@ -163,6 +165,12 @@ PackedFunc TFLiteRuntime::GetFunction(const std::string& name, [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetOutput(args[0]); }); } else if (name == "invoke") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Invoke(); }); + } else if (name == "set_num_threads") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + int num_threads = args[0]; + CHECK_GE(num_threads, 1); + this->SetNumThreads(num_threads); + }); } else { return PackedFunc(); } diff --git a/src/runtime/contrib/tflite/tflite_runtime.h b/src/runtime/contrib/tflite/tflite_runtime.h index ff0e6ab0db56..3311f10975be 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.h +++ b/src/runtime/contrib/tflite/tflite_runtime.h @@ -93,6 +93,11 @@ class TFLiteRuntime : public ModuleNode { 
* \return NDArray corresponding to given output node index. */ NDArray GetOutput(int index) const; + /*! + * \brief Set the number of threads available to the interpreter. + * \param num_threads The number of threads to be set. + */ + void SetNumThreads(int num_threads); // Buffer backing the interpreter's model std::unique_ptr<char[]> flatBuffersBuffer_; From b11f17346a9db6ab8aa05ac2949b80733510ceb3 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 13 Nov 2020 00:36:24 -0800 Subject: [PATCH 168/258] [AutoScheduler] Tutorial on auto-scheduling a network for GPU (#6882) * add a tutorial on auto-scheduling a network for cuda * fix typo * fix training time printing * fix lint * fix * upload logs * fix * use weighted sum as the default objective function * update ci logs * fix the bug in kill_child_processes * fix test * address comments * add early stopping in task scheduler & fix a stuck issue in measurement * fix lint * trigger CI * fix early stopping --- include/tvm/auto_scheduler/measure.h | 3 + .../auto_scheduler/cost_model/xgb_model.py | 4 - python/tvm/auto_scheduler/dispatcher.py | 2 +- python/tvm/auto_scheduler/measure.py | 59 ++-- python/tvm/auto_scheduler/measure_record.py | 81 ++++- python/tvm/auto_scheduler/task_scheduler.py | 100 ++++-- python/tvm/auto_scheduler/utils.py | 42 ++- python/tvm/autotvm/record.py | 4 +- src/auto_scheduler/feature.cc | 10 - src/auto_scheduler/measure.cc | 19 +- .../search_policy/sketch_policy.cc | 30 +- src/auto_scheduler/search_policy/utils.h | 6 +- .../relay/test_auto_scheduler_tuning.py | 6 +- .../test_auto_scheduler_search_policy.py | 46 +-- .../test_auto_scheduler_task_scheduler.py | 9 +- tests/scripts/task_python_docs.sh | 2 +- .../ci_logs/resnet-18-NHWC-B1.json | 23 ++ .../auto_scheduler/tune_conv2d_layer_cuda.py | 10 +- tutorials/auto_scheduler/tune_network_cuda.py | 302 ++++++++++++++++++ tutorials/autotvm/tune_relay_cuda.py | 1 + 20 files changed, 631 insertions(+), 128 deletions(-) create mode 100644 tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json create mode 100644 tutorials/auto_scheduler/tune_network_cuda.py diff --git a/include/tvm/auto_scheduler/measure.h b/include/tvm/auto_scheduler/measure.h index 339f42896b66..e8c01e84f289 100755 --- a/include/tvm/auto_scheduler/measure.h +++ b/include/tvm/auto_scheduler/measure.h @@ -43,6 +43,7 @@ #include <string> #include <unordered_map> +#include <unordered_set> #include <utility> namespace tvm { @@ -436,6 +437,8 @@ class ProgramMeasurerNode : public Object { std::unordered_map<std::string, State> best_state; /*! \brief Workload key to best state's count index map. */ std::unordered_map<std::string, int> best_ct; + /*! \brief The set of workloads that have at least one valid schedule */ + std::unordered_set<std::string> has_valid; /*! \brief The ProgramBuilder to build each program. */ ProgramBuilder builder; /*! \brief The ProgramRunner to measure each program.
*/ diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py index b9afd98be21d..ef5472d6b77e 100644 --- a/python/tvm/auto_scheduler/cost_model/xgb_model.py +++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py @@ -20,7 +20,6 @@ import multiprocessing import logging from collections import defaultdict -import time import numpy as np @@ -138,7 +137,6 @@ def update(self, inputs, results): if len(inputs) <= 0: return assert len(inputs) == len(results) - tic = time.time() self.inputs.extend(inputs) self.results.extend(results) @@ -178,8 +176,6 @@ def update(self, inputs, results): ], ) - logger.info("XGBModel Training time: %.2f s", time.time() - tic) - def predict(self, task, states): """Predict the scores of states Parameters diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py index 7c0c6ef64322..8822f3963f7b 100644 --- a/python/tvm/auto_scheduler/dispatcher.py +++ b/python/tvm/auto_scheduler/dispatcher.py @@ -249,7 +249,7 @@ def query(self, target, workload_key): if not self.silent: msg = ( - "Cannot find tuned schedule for target=%s, workload_key=%s. " + "Cannot find tuned schedules for target=%s, workload_key=%s. " "A fallback schedule is used, " "which may bring great performance regression." % (target, workload_key) ) diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 642e8f85e86b..117cd4f8bc71 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -34,7 +34,6 @@ import os import time import shutil -import traceback import tempfile import multiprocessing @@ -48,10 +47,11 @@ from . import _ffi_api from .loop_state import StateObject from .utils import ( - get_const_tuple, call_func_with_timeout, - request_remote, check_remote, + get_const_tuple, + make_traceback_info, + request_remote, ) from .compute_dag import ComputeDAG from .search_task import SearchTask @@ -60,8 +60,6 @@ deserialize_workload_registry_entry, ) -# The maximum length of error message -MAX_ERROR_MSG_LEN = 512 # The time cost for measurements with errors # We use 1e10 instead of sys.float_info.max for better readability in log @@ -536,16 +534,6 @@ class MeasureErrorNo(object): UNKNOWN_ERROR = 8 # Unknown error -def make_error_msg(): - """ Get the error message from traceback. """ - error_msg = str(traceback.format_exc()) - if len(error_msg) > MAX_ERROR_MSG_LEN: - error_msg = ( - error_msg[: MAX_ERROR_MSG_LEN // 2] + "\n...\n" + error_msg[-MAX_ERROR_MSG_LEN // 2 :] - ) - return error_msg - - def _timed_func(inp_serialized, build_func, verbose): tic = time.time() inp = MeasureInput.deserialize(inp_serialized) @@ -560,14 +548,13 @@ def _timed_func(inp_serialized, build_func, verbose): # pylint: disable=broad-except except Exception: error_no = MeasureErrorNo.INSTANTIATION_ERROR - error_msg = make_error_msg() + error_msg = make_traceback_info() if error_no == 0: dirname = tempfile.mkdtemp() filename = os.path.join(dirname, "tmp_func." + build_func.output_format) try: - # TODO(merrymercy): Port the unroll pass. 
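The build and run error handling reworked in this file hinges on one pattern: the work happens in a child process, and both timeouts and uncaught exceptions come back to the caller as ordinary values to branch on with `isinstance`. A self-contained sketch of that pattern with hypothetical names (the patch's actual helpers are `_func_wrapper` and `call_func_with_timeout` in utils.py, changed later in this commit):

```python
import multiprocessing
import queue
import time
import traceback


def _worker(que, func, args):
    # Ship back either the result or the formatted traceback as an Exception.
    try:
        que.put(func(*args))
    except Exception:  # pylint: disable=broad-except
        que.put(Exception(traceback.format_exc()))


def call_with_timeout(timeout, func, args=()):
    que = multiprocessing.Queue(2)
    proc = multiprocessing.Process(target=_worker, args=(que, func, args))
    proc.start()
    try:
        res = que.get(timeout=timeout)  # block until a result arrives or we time out
    except queue.Empty:
        res = TimeoutError()
    proc.terminate()
    proc.join()
    return res


def _slow_identity(x):
    time.sleep(2)
    return x


if __name__ == "__main__":
    print(repr(call_with_timeout(1, _slow_identity, (42,))))  # TimeoutError()
    print(repr(call_with_timeout(5, _slow_identity, (42,))))  # 42
```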
with transform.PassContext(): func = build_module.build( sch, args, target=task.target, target_host=task.target_host @@ -576,7 +563,7 @@ def _timed_func(inp_serialized, build_func, verbose): # pylint: disable=broad-except except Exception: error_no = MeasureErrorNo.COMPILE_HOST - error_msg = make_error_msg() + error_msg = make_traceback_info() else: filename = "" @@ -585,6 +572,7 @@ def _timed_func(inp_serialized, build_func, verbose): print(".", end="") else: print(".E", end="") # Build error + return filename, args, error_no, error_msg, time.time() - tic @@ -615,6 +603,10 @@ def local_build_worker(args): if verbose >= 1: print(".T", end="") # Build timeout res = None, [], MeasureErrorNo.BUILD_TIMEOUT, None, timeout + elif isinstance(res, Exception): + if verbose >= 1: + print(".E", end="") # Build error + res = None, [], MeasureErrorNo.COMPILE_HOST, str(res), timeout return res @@ -703,7 +695,7 @@ def _timed_eval_func( except Exception: costs = (MAX_FLOAT,) error_no = MeasureErrorNo.COMPILE_DEVICE - error_msg = make_error_msg() + error_msg = make_traceback_info() if error_no == 0: try: @@ -718,7 +710,7 @@ def _timed_eval_func( except Exception: costs = (MAX_FLOAT,) error_no = MeasureErrorNo.RUNTIME_DEVICE - error_msg = make_error_msg() + error_msg = make_traceback_info() shutil.rmtree(os.path.dirname(build_res.filename)) toc = time.time() @@ -825,6 +817,17 @@ def local_run( build_res.time_cost + timeout, time.time(), ) + elif isinstance(res, Exception): + if verbose >= 1: + print("*E", end="") # Run error + res = ( + (MAX_FLOAT,), + MeasureErrorNo.RUNTIME_DEVICE, + str(res), + build_res.time_cost + timeout, + time.time(), + ) + measure_results.append(MeasureResult(*res)) if verbose >= 1: @@ -876,7 +879,7 @@ def _timed_rpc_run( except Exception: costs = (MAX_FLOAT,) error_no = MeasureErrorNo.COMPILE_DEVICE - error_msg = make_error_msg() + error_msg = make_traceback_info() if error_no == 0: try: @@ -900,7 +903,7 @@ def _timed_rpc_run( except Exception: costs = (MAX_FLOAT,) error_no = MeasureErrorNo.RUNTIME_DEVICE - error_msg = make_error_msg() + error_msg = make_traceback_info() shutil.rmtree(os.path.dirname(build_res.filename)) toc = time.time() @@ -939,7 +942,6 @@ def _rpc_run_worker(args): ) res = call_func_with_timeout(timeout, _timed_rpc_run, args=args) - if isinstance(res, TimeoutError): if verbose >= 1: print("*T", end="") # Run timeout @@ -950,6 +952,17 @@ def _rpc_run_worker(args): build_res.time_cost + timeout, time.time(), ) + elif isinstance(res, Exception): + if verbose >= 1: + print("*E", end="") # Run error + res = ( + (MAX_FLOAT,), + MeasureErrorNo.RUNTIME_DEVICE, + str(res), + build_res.time_cost + timeout, + time.time(), + ) + return res diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py index f0d930e3257e..2569f3984f3c 100644 --- a/python/tvm/auto_scheduler/measure_record.py +++ b/python/tvm/auto_scheduler/measure_record.py @@ -14,8 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name, pointless-string-statement """ Serialization and other I/O support for measurement records (tuning logs). """ +import argparse +import logging +import os +import itertools import numpy as np @@ -24,6 +29,8 @@ from .measure import MeasureErrorNo, MeasureCallback from . 
import _ffi_api +logger = logging.getLogger("auto_scheduler") + @tvm._ffi.register_object("auto_scheduler.RecordToFile") class RecordToFile(MeasureCallback): @@ -36,7 +43,7 @@ class RecordToFile(MeasureCallback): File name for this callback to write log to. """ - def __init__(self, filename="auto_scheduler_tuning.json"): + def __init__(self, filename): self.__init_handle_by_constructor__(_ffi_api.RecordToFile, filename) @@ -47,11 +54,11 @@ class RecordReader(Object): Parameters ---------- - filename : str = "auto_scheduler_tuning.json" + filename : str File name for this reader to load log from. """ - def __init__(self, filename="auto_scheduler_tuning.json"): + def __init__(self, filename): self.__init_handle_by_constructor__(_ffi_api.RecordReader, filename) def read_lines(self, max_lines=None, skip_lines=0): @@ -173,3 +180,71 @@ def load_best(filename, workload_key=None, target=None): best_res = res return best_inp, best_res + + +def distill_record_file(in_file, out_file): + """ + Pick the best entries from a record file and store them to another file. + This function distills the useful log entries from a large log file. + If out_file already exists, the best entries from both + in_file and out_file will be saved. + + Parameters + ---------- + in_file: str + The filename of input + out_file: str or file + The filename of output + """ + # pylint: disable=import-outside-toplevel + from .dispatcher import ApplyHistoryBest + + context = load_records(in_file) + if os.path.isfile(out_file): + out_context = load_records(out_file) + context = itertools.chain(context, out_context) + context, context_clone = itertools.tee(context) + best_context = ApplyHistoryBest(context) + best_set = set() + + def measure_input_str_key(inp): + return _ffi_api.SerializeMeasureInput(inp) + + for v in best_context.best_by_model.values(): + best_set.add(measure_input_str_key(v[0])) + + for v in best_context.best_by_targetkey.values(): + best_set.add(measure_input_str_key(v[0])) + + inputs = [] + results = [] + for inp, res in context_clone: + if measure_input_str_key(inp) in best_set: + inputs.append(inp) + results.append(res) + best_set.remove(measure_input_str_key(inp)) + + # create a new file and save the best records + open(out_file, "w") + save_records(out_file, inputs, results) + logger.info("Extract %d best records from %s to %s", len(inputs), in_file, out_file) + + +""" +Usage: +* Distill the best entries from a large log file +e.g. python -m tvm.auto_scheduler.measure_record --mode distill --i input.json +""" +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["distill"], required=True) + parser.add_argument("--i", type=str, help="input file") + parser.add_argument("--o", type=str, default=None, help="output file") + + args = parser.parse_args() + logging.basicConfig() + logger.setLevel(logging.INFO) + + if args.mode == "distill": + args.o = args.o or args.i + ".best.json" + distill_record_file(args.i, args.o) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index e45573be61c6..c81a4b680b95 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -31,9 +31,10 @@ from .search_policy import SearchPolicy, SketchPolicy from .cost_model import RandomModel, XGBModel -from .utils import array_mean, to_str_round +from .utils import array_mean from .measure import ProgramMeasurer from .measure_record import RecordReader +from . 
import _ffi_api logger = logging.getLogger("auto_scheduler") @@ -75,10 +76,10 @@ def make_search_policies( if model_type == "xgb": cost_model = XGBModel(num_warmup_sample=len(tasks) * num_measures_per_round) if load_model_file: - logger.info("Load pretrained model...") + logger.info("TaskScheduler: Load pretrained model...") cost_model.load(load_model_file) elif load_log_file: - cost_model.load_log_file(load_log_file) + cost_model.update_from_file(load_log_file) elif model_type == "random": cost_model = RandomModel() else: @@ -137,10 +138,18 @@ class TaskScheduler: ---------- tasks: List[SearchTask] All tasks to tune + task_weights: Optional[List[float]] + The weights of tasks. + If provided, the task scheduler will set the objective function to + sum(weight[t] * latency[t]), where weight[t] is the weight of a task + and latency[t] is the latency of the task. + If not provided, the task scheduler will assign equal weights to all + tasks (i.e., the objective function is sum(latency[t])). objective_func: Optional[Callable[List[float] -> float]] The objective function to be minimized. The objective function accepts the current latencies of all tasks and returns the - objective. If not presented, the objective is the sum of the latencies of all task. + objective. + If not provided, the objective is the weighted sum of the latencies of all tasks. strategy: str = "gradient" The scheduling strategy. "round-robin": Tune tasks in round robin order. @@ -164,20 +173,26 @@ class TaskScheduler: def __init__( self, tasks, + task_weights=None, objective_func=None, strategy="gradient", load_model_file: str = None, load_log_file: str = None, - verbose: int = 1, alpha: float = 0.2, beta: float = 2, gamma: float = 0.5, backward_window_size: int = 3, ): self.tasks = tasks - self.objective_func = objective_func or sum + if objective_func:  # use custom objective function + self.objective_func = objective_func + else:  # use weighted sum + if task_weights: + self.objective_func = lambda costs: sum(c * w for c, w in zip(costs, task_weights)) + else: + self.objective_func = sum + self.strategy = strategy - self.verbose = verbose self.load_log_file = load_log_file self.load_model_file = load_model_file self.alpha = alpha @@ -198,7 +213,8 @@ def __init__( self.best_costs = 1e10 * np.ones(len(self.tasks)) self.cur_score = self._compute_score(self.best_costs) - self.tune_option = self.measurer = self.search_policies = self.ct = self.tic = None + self.tune_option = self.measurer = self.search_policies = None + self.ct = self.best_ct = self.best_score = self.tic = None self.num_measures_per_round = None self.dead_tasks = set() @@ -234,14 +250,17 @@ def tune(self, tune_option, search_policy="default"): """ # init members self.tune_option = tune_option + early_stopping = 1e20 if tune_option.early_stopping < 0 else tune_option.early_stopping + self.measurer = ProgramMeasurer( tune_option.builder, tune_option.runner, tune_option.measure_callbacks, tune_option.verbose, ) - self.ct = 0 + self.ct = self.best_ct = 0 self.tic = time.time() + # reset num_measures_per_round to make sure every task is tuned at least once self.num_measures_per_round = min( tune_option.num_measures_per_round, tune_option.num_measure_trials // len(self.tasks) @@ -266,6 +285,8 @@ def tune(self, tune_option, search_policy="default"): # do a round robin first to warm up for i in range(len(self.tasks)): self._tune_task(i) + self.best_ct = self.ct + self.best_score = self.cur_score # use the specific strategy to choose workload to tune task_idx = -1 @@
-282,7 +303,7 @@ def tune(self, tune_option, search_policy="default"): continue # compute gradient from chain rule : (delta f / delta g_i) - delta = 1e-7 + delta = 1e-4 new_costs = list(self.best_costs) new_costs[i] -= delta chain_grad = ( @@ -337,10 +358,54 @@ def tune(self, tune_option, search_policy="default"): self._tune_task(task_idx) self._adjust_similarity_group(task_idx) + if self.cur_score < self.best_score: + self.best_score = self.cur_score + self.best_ct = self.ct + elif self.ct - self.best_ct >= early_stopping and all( + cost < 1e9 for cost in self.best_costs + ): + if self.tune_option.verbose >= 1: + print( + "Stop early since no performance improvement in the last " + + str(early_stopping) + + " measurement trials." + ) + break + + def _print_table_info(self, next_task_idx): + # table header + _ffi_api.PrintTitle("Task Scheduler") + print("| ID | Latency (ms) | Speed (GFLOPS) | Trials |") + print("-------------------------------------------------") + + # content + for i in range(len(self.tasks)): + id_str = "%d" % i + latency_str = "%.3f" % (1e3 * self.best_costs[i]) if self.best_costs[i] < 1e9 else "-" + speed_str = ( + "%.2f" % (self.tasks[i].compute_dag.flop_ct / self.best_costs[i] / 1e9) + if self.best_costs[i] < 1e9 + else "-" + ) + trials_str = "%d" % (self.task_cts[i] * self.num_measures_per_round) + print("| %4s | %12s | % 14s | %6s |" % (id_str, latency_str, speed_str, trials_str)) + print("-------------------------------------------------") + + # overall info + if all(cost < 1e9 for cost in self.best_costs): + total_latency_str = "%.3f" % (self.cur_score * 1e3) + else: + total_latency_str = "-" + print( + "Estimated total latency: %s ms\tTrials: %d\tUsed time : %.0f s\tNext ID: %d\t" + % (total_latency_str, self.ct, time.time() - self.tic, next_task_idx) + ) + def _tune_task(self, task_idx): """Tune the select task for one round""" - if self.verbose >= 1: - logger.info("TaskScheduler: task id:\t%d", task_idx) + if self.tune_option.verbose >= 1: + self._print_table_info(task_idx) + measure_inputs, measure_results = self.search_policies[task_idx].continue_search_one_round( self.num_measures_per_round, self.measurer ) @@ -359,17 +424,6 @@ def _tune_task(self, task_idx): self.ct += len(measure_inputs) self.cur_score = self._compute_score(self.best_costs) - if self.verbose >= 1: - logger.info( - "TaskScheduler\tct: %d\testimated cost (ms): %.3f\ttime elapsed: %.2f\t" - "best_costs (ms): %s\ttask_ct: %s", - self.ct, - self.cur_score * 1e3, - time.time() - self.tic, - to_str_round(self.best_costs * 1e3, decimal=3), - self.task_cts, - ) - def _compute_score(self, costs): """compute the objective function""" return self.objective_func(costs) diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py index 0780d39e9042..9a7c199e6745 100644 --- a/python/tvm/auto_scheduler/utils.py +++ b/python/tvm/auto_scheduler/utils.py @@ -23,6 +23,7 @@ import queue import signal import threading +import traceback import os import numpy as np @@ -138,32 +139,49 @@ def kill_child_processes(parent_pid, sig=signal.SIGTERM): parent = psutil.Process(parent_pid) except psutil.NoSuchProcess: return - children = parent.children(recursive=True) - for process in children: - try: + + try: + children = parent.children(recursive=True) + for process in children: process.send_signal(sig) - except psutil.NoSuchProcess: - return + except psutil.NoSuchProcess: + return + + +# The maximum length of traceback information +MAX_TRACEBACK_INFO_LEN = 512 + + +def make_traceback_info(): + 
""" Get the error message from traceback. """ + info = str(traceback.format_exc()) + if len(info) > MAX_TRACEBACK_INFO_LEN: + info = ( + info[: MAX_TRACEBACK_INFO_LEN // 2] + "\n...\n" + info[-MAX_TRACEBACK_INFO_LEN // 2 :] + ) + return info def _func_wrapper(que, func, args, kwargs): """Call function and return the result over the queue.""" - if kwargs: - que.put(func(*args, **kwargs)) - else: - que.put(func(*args)) + try: + if kwargs: + que.put(func(*args, **kwargs)) + else: + que.put(func(*args)) + # pylint: disable=broad-except + except Exception: + que.put(Exception(make_traceback_info())) def call_func_with_timeout(timeout, func, args=(), kwargs=None): """Call a function with timeout""" - que = multiprocessing.Queue(2) process = multiprocessing.Process(target=_func_wrapper, args=(que, func, args, kwargs)) process.start() - process.join(timeout) try: - res = que.get(block=False) + res = que.get(timeout=timeout) except queue.Empty: res = TimeoutError() diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index a1b89404b5a1..4f11aea2911f 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -268,8 +268,8 @@ def split_workload(in_file, clean=True): def pick_best(in_file, out_file): """ - Pick best entries from a file and store it to another file. - This distill the useful log entries from a large log file. + Pick the best entries from a file and store them to another file. + This function distills the useful log entries from a large log file. If out_file already exists, the best entries from both in_file and out_file will be saved. diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index 8d17c4bba10f..a60c87cc600d 100755 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -1345,11 +1345,6 @@ void GetPerStoreFeaturesFromStates(const Array& states, const SearchTask& GetPerStoreFeaturesWorkerFunc(task, states[i], max_n_bufs, &(*features)[i], &error_ct); }); - - if (error_ct > 0) { - std::cerr << "Encountered " << error_ct - << " errors during feature extraction, which are safely ignored." << std::endl; - } } void GetPerStoreFeaturesFromStates(const Array& states, const std::vector& tasks, @@ -1365,11 +1360,6 @@ void GetPerStoreFeaturesFromStates(const Array& states, const std::vector GetPerStoreFeaturesWorkerFunc(tasks[i], states[i], max_n_bufs, &(*features)[i], &error_ct); }); - - if (error_ct > 0) { - std::cerr << "Encountered " << error_ct - << " errors during feature extraction. which are safely ignored." << std::endl; - } } void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int max_n_bufs, diff --git a/src/auto_scheduler/measure.cc b/src/auto_scheduler/measure.cc index 6c5c10e5aaee..c77bafc84e6e 100755 --- a/src/auto_scheduler/measure.cc +++ b/src/auto_scheduler/measure.cc @@ -203,6 +203,7 @@ void ProgramMeasurerNode::Reset() { best_flops.clear(); best_ct.clear(); best_state.clear(); + has_valid.clear(); } Array ProgramMeasurerNode::Measure(const SearchTask& task, @@ -217,8 +218,7 @@ Array ProgramMeasurerNode::Measure(const SearchTask& task, batch_size = builder->n_parallel * 2; } - StdCout(verbose) << "Get " << inputs.size() << " programs for measure. (This may take a while)" - << std::endl; + StdCout(verbose) << "Get " << inputs.size() << " programs to measure." 
<< std::endl; for (size_t i = 0; i < inputs.size(); i += batch_size) { Array input_batch(inputs.begin() + i, @@ -230,16 +230,18 @@ Array ProgramMeasurerNode::Measure(const SearchTask& task, // update current best state according to the new measure result for (size_t j = 0; j < input_batch.size(); ++j) { + const String& workload_key = input_batch[j]->task->workload_key; double flops; + if (result_batch[j]->error_no == 0) { flops = task->compute_dag->flop_ct / FloatArrayMean(result_batch[j]->costs); error_ct = 0; + has_valid.insert(workload_key); } else { flops = 0.0; error_ct++; } - const String& workload_key = input_batch[j]->task->workload_key; if (flops > best_flops[workload_key]) { best_flops[workload_key] = flops; best_state[workload_key] = input_batch[j]->state; @@ -247,11 +249,12 @@ Array ProgramMeasurerNode::Measure(const SearchTask& task, } ct++; - StdCout(verbose) << std::fixed << std::setprecision(2) << Chars('=', 50) << "\n" - << "No: " << ct << "\tGFLOPS: " << flops / 1e9 << " / " - << best_flops[workload_key] / 1e9 << "\tresults: " << result_batch[j] << "\n" - << Chars('=', 50) << "\n" - << input_batch[j]->state << "\n"; + StdCout(verbose, 2) << std::fixed << std::setprecision(2) << Chars('=', 50) << "\n" + << "No: " << ct << "\tGFLOPS: " << flops / 1e9 << " / " + << best_flops[workload_key] / 1e9 << "\tresults: " << result_batch[j] + << "\n" + << Chars('=', 50) << "\n" + << input_batch[j]->state << "\n"; } // Call callback functions diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index b64776ff342a..4c3e8ac5593d 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -162,9 +162,17 @@ State SketchPolicyNode::Search(int n_trials, int early_stopping, int num_measure Array results; while (ct < n_trials) { if (!inputs.empty()) { - // Retrain cost models before the next search round + auto tic_begin = std::chrono::high_resolution_clock::now(); + + // Retrain the cost model before the next search round PrintTitle("Train cost model", verbose); program_cost_model->Update(inputs, results); + + double duration = std::chrono::duration_cast>( + std::chrono::high_resolution_clock::now() - tic_begin) + .count(); + StdCout(verbose) << "Time elapsed: " << std::fixed << std::setprecision(2) << duration + << " s" << std::endl; } // Search one round to get promising states @@ -200,9 +208,10 @@ State SketchPolicyNode::Search(int n_trials, int early_stopping, int num_measure ct += inputs.size(); // Check if reach the early stopping condition - if (ct - measurer->best_ct[search_task->workload_key] > early_stopping) { + if (ct - measurer->best_ct[search_task->workload_key] > early_stopping && + measurer->has_valid.count(search_task->workload_key)) { StdCout(verbose) << "Stop early since no performance improvement in the last " - << early_stopping << " measure steps.\n"; + << early_stopping << " measurements trials.\n"; break; } @@ -249,10 +258,18 @@ std::pair, Array> SketchPolicyNode::ContinueS measured_states_throughputs_.push_back(1.0 / FloatArrayMean(res->costs)); } + auto tic_begin = std::chrono::high_resolution_clock::now(); + // Update the cost model PrintTitle("Train cost model", verbose); program_cost_model->Update(inputs, results); + double duration = std::chrono::duration_cast>( + std::chrono::high_resolution_clock::now() - tic_begin) + .count(); + StdCout(verbose) << "Time elapsed: " << std::fixed << std::setprecision(2) << duration << " s" + << std::endl; 
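Isolating the early-stopping change above: the search now stops early only when there has been no improvement for `early_stopping` trials and at least one valid schedule has already been measured for the workload, so a task whose first trials all fail to build or run is not abandoned before it ever produces a result. A tiny Python sketch of the rule (names hypothetical):

```python
def should_stop_early(ct, best_ct, early_stopping, has_valid):
    # Stop only after `early_stopping` trials without improvement AND
    # after at least one valid (non-error) measurement has been seen.
    return ct - best_ct > early_stopping and has_valid


# An all-errors workload keeps searching; a stalled-but-valid one stops.
assert not should_stop_early(ct=100, best_ct=0, early_stopping=50, has_valid=False)
assert should_stop_early(ct=100, best_ct=0, early_stopping=50, has_valid=True)
```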
+ return std::make_pair(std::move(inputs), std::move(results)); } @@ -362,6 +379,8 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches // At least we should sample this number of valid programs int min_population = GetIntParam(params, SketchParamKey::SampleInitPopulation::min_population); + auto tic_begin = std::chrono::high_resolution_clock::now(); + int fail_ct = 0; Array<State> out_states; std::vector<std::mt19937> rand_gens; @@ -369,7 +388,6 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches for (int i = 0; i < population; i++) { rand_gens.push_back(std::mt19937(rand_gen())); } - auto tic_begin = std::chrono::high_resolution_clock::now(); std::unordered_set<std::string> explored_state_strs; size_t iter = 1; @@ -673,5 +691,9 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicyEvolutionarySearch") return states; }); +TVM_REGISTER_GLOBAL("auto_scheduler.PrintTitle").set_body_typed([](std::string title) { + PrintTitle(title, 1); +}); + } // namespace auto_scheduler } // namespace tvm diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h index ecc46af5a5de..d59a6ca220ca 100644 --- a/src/auto_scheduler/search_policy/utils.h +++ b/src/auto_scheduler/search_policy/utils.h @@ -657,9 +657,9 @@ inline int RandomChoose(const std::vector<double>& prefix_sum_probs, std::mt1993 /*! \brief Print a title */ inline void PrintTitle(const std::string& title, int verbose) { - StdCout(verbose) << Chars('-', 60) << "\n" - << Chars('-', 25) << " [ " << title << " ]\n" - << Chars('-', 60) << std::endl; + StdCout(verbose) << Chars('-', 70) << "\n" + << Chars('-', 30) << " [ " << title << " ]\n" + << Chars('-', 70) << std::endl; } /*! diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py index ad882b3eaf24..089f51cdf047 100644 --- a/tests/python/relay/test_auto_scheduler_tuning.py +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -30,17 +30,17 @@ def tune_network(network, target): mod, params = get_network(network) target = tvm.target.Target(target) tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - objective = lambda costs: sum(c * w for c, w in zip(costs, task_weights)) with tempfile.NamedTemporaryFile() as fp: log_file = fp.name # Tuning measure_ctx = auto_scheduler.LocalRPCMeasureContext(timeout=60) - tuner = auto_scheduler.TaskScheduler(tasks, objective) + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( - num_measure_trials=4, + num_measure_trials=100, num_measures_per_round=2, + early_stopping=1, runner=measure_ctx.runner, builder=auto_scheduler.LocalBuilder(timeout=60), measure_callbacks=[auto_scheduler.RecordToFile(log_file)], diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py index 6493c246d406..a4f3c4e06843 100644 --- a/tests/python/unittest/test_auto_scheduler_search_policy.py +++ b/tests/python/unittest/test_auto_scheduler_search_policy.py @@ -33,14 +33,14 @@ def search_common( workload=matmul_auto_scheduler_test, target="llvm", - search_policy="empty", - seed=random.randint(1, 1 << 30), + search_policy="sketch", + seed=0, runner="local", + num_measure_trials=100, cost_model=auto_scheduler.RandomModel(), - num_measure_trials=10, init_search_callbacks=None, ): - print("Test %s schedule search with the default search policy" % (target)) + print("Test search policy '%s' for '%s'" % (search_policy, target)) random.seed(seed) N = 128 @@
-59,17 +59,18 @@ def search_common( search_policy = auto_scheduler.SketchPolicy( task, program_cost_model=cost_model, init_search_callbacks=init_search_callbacks ) + else: + raise ValueError("Invalid policy: " + search_policy) tuning_options = auto_scheduler.TuningOptions( num_measure_trials=num_measure_trials, + num_measures_per_round=2, + early_stopping=1, runner=runner, - verbose=1, + verbose=2, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options) - print("*" * 80) - print(target) - print("*" * 80) inp, res = auto_scheduler.load_best(log_file, task.workload_key, target) print("==== Python Code ====") @@ -97,17 +98,30 @@ def search_common( def test_workload_registry_search_basic(): # wrap the search in a new thread to avoid the conflict # between python's multiprocessing and tvm's thread pool - t = PropagatingThread(target=search_common, kwargs={"seed": 944563397}) + t = PropagatingThread( + target=search_common, kwargs={"search_policy": "empty", "num_measure_trials": 2} + ) t.start() t.join() + t = PropagatingThread( - target=search_common, kwargs={"seed": 944563397, "workload": "matmul_auto_scheduler_test"} + target=search_common, + kwargs={ + "workload": "matmul_auto_scheduler_test", + "num_measure_trials": 2, + "search_policy": "empty", + }, ) t.start() t.join() + t = PropagatingThread( target=search_common, - kwargs={"seed": 944563397, "workload": "matmul_auto_scheduler_test_rename_1"}, + kwargs={ + "workload": "matmul_auto_scheduler_test_rename_1", + "num_measure_trials": 2, + "search_policy": "empty", + }, ) t.start() t.join() @@ -117,9 +131,7 @@ def test_workload_registry_search_basic(): def test_sketch_search_policy_basic(): # wrap the search in a new thread to avoid the conflict # between python's multiprocessing and tvm's thread pool - t = PropagatingThread( - target=search_common, kwargs={"seed": 944563397, "search_policy": "sketch"} - ) + t = PropagatingThread(target=search_common) t.start() t.join() @@ -144,8 +156,6 @@ def test_sketch_search_policy_xgbmodel(): t = PropagatingThread( target=search_common, kwargs={ - "seed": 944563397, - "search_policy": "sketch", "cost_model": auto_scheduler.XGBModel(), }, ) @@ -161,8 +171,6 @@ def test_sketch_search_policy_cuda_rpc_runner(): t = PropagatingThread( target=search_common, kwargs={ - "seed": 944563397, - "search_policy": "sketch", "target": "cuda", "runner": measure_ctx.runner, }, @@ -179,8 +187,6 @@ def test_sketch_search_policy_cuda_xgbmodel_rpc_runner(): t = PropagatingThread( target=search_common, kwargs={ - "seed": 944563397, - "search_policy": "sketch", "target": "cuda", "runner": measure_ctx.runner, "cost_model": auto_scheduler.XGBModel(), diff --git a/tests/python/unittest/test_auto_scheduler_task_scheduler.py b/tests/python/unittest/test_auto_scheduler_task_scheduler.py index 2debc14fc356..b0fb37a830f7 100644 --- a/tests/python/unittest/test_auto_scheduler_task_scheduler.py +++ b/tests/python/unittest/test_auto_scheduler_task_scheduler.py @@ -34,9 +34,6 @@ def test_task_scheduler_round_robin(): for n in [2, 4, 8]: tasks.append(auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n), "llvm")) - def objective_func(costs): - return sum(costs) - with tempfile.NamedTemporaryFile() as fp: log_file = fp.name num_trials_per_task = 2 @@ -49,7 +46,7 @@ def objective_func(costs): num_measures_per_round=1, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) - task_scheduler = auto_scheduler.TaskScheduler(tasks, objective_func, 
strategy="round-robin") + task_scheduler = auto_scheduler.TaskScheduler(tasks, strategy="round-robin") task_scheduler.tune(tune_option, search_policy="sketch.random") # Check the result of round robin @@ -65,7 +62,7 @@ # test continuous tuning (restoring the status) task_scheduler = auto_scheduler.TaskScheduler( - tasks, objective_func, strategy="round-robin", load_log_file=log_file + tasks, strategy="round-robin", load_log_file=log_file ) tune_option = auto_scheduler.TuningOptions( num_measure_trials=len(tasks), @@ -111,7 +108,7 @@ def objective_func(costs): num_measures_per_round=1, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) - task_scheduler = auto_scheduler.TaskScheduler(tasks, objective_func) + task_scheduler = auto_scheduler.TaskScheduler(tasks, objective_func=objective_func) # Forcibly rewrite the initial values. # This can make this test more stable on the slow CI machines diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index 3d229651cb4f..459b680daeb1 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -40,7 +40,7 @@ rm -rf docs/doxygen # prepare auto scheduler tutorials rm -rf tutorials/auto_scheduler/*.json -cp -f tutorials/auto_scheduler/ci_logs/{matmul,conv2d}.json tutorials/auto_scheduler +cp -f tutorials/auto_scheduler/ci_logs/*.json tutorials/auto_scheduler # remove stale tutorials and always build from scratch. rm -rf docs/tutorials diff --git a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json new file mode 100644 index 000000000000..37a129844390 --- /dev/null +++ b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json @@ -0,0 +1,23 @@ +# Provide valid schedules for resnet-18. +# This is used to run the tutorial on the documentation web server.
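Each line below is one JSON measurement record: "i" holds the serialized task and schedule transformation steps, "r" holds (roughly) the measured run costs, an error code, the total measurement time, and a timestamp, and "v" is the record format version. For orientation, a sketch of how such a log is consumed, using the `load_best` signature shown in the measure_record.py diff earlier in this commit (the file name is the one this patch adds; treat the snippet as illustrative):

```python
# Sketch of reading these records; requires a TVM build and this log file.
from tvm import auto_scheduler

inp, res = auto_scheduler.load_best("resnet-18-NHWC-B1.json")
costs = [v.value for v in res.costs]  # per-run times in seconds
print(inp.task.workload_key, sum(costs) / len(costs))
```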
+{"i": [["[\"d09dc1a6bb90d59c91b68989ad3492ff\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [2, 5, 2, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 2, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 1, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"]]]], "r": [[7.2561e-05], 0, 1.93892, 1605186325], "v": "v0.3"} +{"i": [["[\"8d5a93959138dc7b2ee1f1b3219dfa14\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 4, 1], 1], ["SP", 6, 10, 16, [4, 2, 1, 1], 1], ["SP", 6, 15, 512, [1, 16, 1, 1], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [8], 1], ["SP", 4, 4, 512, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000195701], 0, 2.67988, 1605186412], "v": "v0.3"} +{"i": [["[\"ac6920940de3797cc3f9f9c260675e5d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 2, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 16, [1, 16, 1, 1], 1], ["SP", 6, 15, 512, [2, 1, 4, 1], 1], ["SP", 6, 20, 512, [32, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], 
["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000162045], 0, 2.32406, 1605186499], "v": "v0.3"} +{"i": [["[\"7e83a2ee5cd5d50282ed19310700046a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [1, 1, 8, 1], 1], ["SP", 6, 15, 512, [2, 64, 1, 1], 1], ["SP", 6, 20, 512, [16, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [4], 1], ["SP", 4, 4, 512, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [2], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000102843], 0, 2.42044, 1605186574], "v": "v0.3"} +{"i": [["[\"424ba83160af31badc0b098136e1a3b0\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [49], 1], ["SP", 8, 4, 256, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 
1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 2, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 49, [1, 7, 1, 7], 1], ["SP", 6, 15, 256, [1, 8, 1, 2], 1], ["SP", 6, 20, 256, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [2], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[9.61516e-05], 0, 2.69389, 1605186690], "v": "v0.3"} +{"i": [["[\"a169cd0053d3a7ca82998fcb62e42c58\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 1, 1], 1], ["SP", 6, 15, 256, [1, 4, 8, 1], 1], ["SP", 6, 20, 256, [1, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [49], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 2, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000156995], 
0, 2.11666, 1605186772], "v": "v0.3"} +{"i": [["[\"0141ffc4fbabc10cc5a94c954419055b\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [49], 1], ["SP", 8, 4, 256, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 4, 1], 1], ["SP", 6, 10, 49, [1, 49, 1, 1], 1], ["SP", 6, 15, 256, [4, 2, 1, 1], 1], ["SP", 6, 20, 256, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [4], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000131082], 0, 2.24166, 1605186844], "v": "v0.3"} +{"i": [["[\"c7a6b56bdc04b94c829fb2ef9874019e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 128, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 1, 1, 1], 1], ["SP", 6, 5, 4, [2, 2, 1, 1], 1], ["SP", 6, 10, 196, [2, 7, 2, 1], 1], ["SP", 6, 15, 128, [1, 32, 1, 4], 1], ["SP", 6, 20, 128, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [16], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [16], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 
3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000166673], 0, 2.43832, 1605186977], "v": "v0.3"} +{"i": [["[\"c035cc8b0568a8e054d06bd7f4950550\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 2, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 1], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [1, 1, 4, 8], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [49], 1], ["SP", 4, 4, 128, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 1024, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000108367], 0, 3.89975, 1605187058], "v": "v0.3"} +{"i": [["[\"c5ee3e05edd9754492d0763aa41fd025\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 2, 2], 1], ["SP", 6, 10, 196, [1, 4, 7, 1], 1], ["SP", 6, 15, 128, [2, 16, 2, 1], 1], ["SP", 6, 20, 128, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 
3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.0137e-05], 0, 2.28468, 1605187134], "v": "v0.3"} +{"i": [["[\"022ebb6b7c55c5ed030421380ec83a04\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 2, 2, 1], 1], ["SP", 3, 10, 28, [1, 14, 1, 1], 1], ["SP", 3, 15, 128, [1, 2, 16, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 64, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 384, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 24, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[9.74847e-05], 0, 1.97907, 1605187182], "v": "v0.3"} +{"i": [["[\"de0df0893e01892cfe69f7bc2c24111f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 2, 1], 1], ["SP", 6, 10, 196, [1, 7, 14, 1], 1], ["SP", 6, 15, 64, [2, 4, 2, 1], 1], ["SP", 6, 20, 64, [1, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [16], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [8], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], 
["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.09982e-05], 0, 3.52776, 1605187295], "v": "v0.3"} +{"i": [["[\"f2e3c09a00e7d0a9897f70497e089f1e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [7], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 1], 1], ["SP", 6, 10, 196, [1, 14, 1, 2], 1], ["SP", 6, 15, 64, [1, 2, 8, 2], 1], ["SP", 6, 20, 64, [4, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [4], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 512, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[7.15745e-05], 0, 3.73944, 1605187404], "v": "v0.3"} +{"i": [["[\"fa26946d7ac51126bfa859cb183f9ca1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [7], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 3], 1], ["SP", 6, 5, 6, [1, 2, 3, 1], 1], ["SP", 6, 10, 196, [1, 4, 1, 7], 1], ["SP", 6, 15, 64, [1, 8, 2, 1], 1], ["SP", 6, 20, 64, [2, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [4], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 
2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 144, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 252, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[6.79478e-05], 0, 5.10446, 1605187506], "v": "v0.3"} +{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [2, 14, 1, 1], 1], ["SP", 3, 10, 112, [1, 8, 2, 1], 1], ["SP", 3, 15, 64, [2, 2, 2, 2], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [7, 1], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 1176, [21], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 189, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[5.53397e-05], 0, 2.2607, 1605187548], "v": "v0.3"} +{"i": [["[\"bf78a7bf0209980f72953637dfd14a6f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [2, 28, 1, 1], 1], ["SP", 3, 10, 56, [1, 2, 2, 1], 1], ["SP", 3, 15, 64, [2, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [1, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 16, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[8.11163e-06], 0, 1.93343, 1605187596], "v": "v0.3"} +{"i": [["[\"6630936c26852f2b89dbfa2ff37fbb9c\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [2, 2, 1, 1], 1], ["SP", 3, 10, 28, 
[1, 2, 1, 1], 1], ["SP", 3, 15, 128, [2, 8, 4, 2], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [4, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 256, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 96, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[1.40126e-05], 0, 1.82931, 1605187624], "v": "v0.3"} +{"i": [["[\"ba5f918733ccbbd4a1d7fd3724665a2f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 7, 1, 2], 1], ["SP", 3, 10, 14, [1, 1, 1, 2], 1], ["SP", 3, 15, 256, [4, 64, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 32, [16], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 324, [6], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[2.35384e-05], 0, 1.78652, 1605187663], "v": "v0.3"} +{"i": [["[\"21ad409d72953de188314010134e3acd\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 32, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [1, 64], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [4], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 4, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[3.09105e-05], 0, 1.85659, 1605187687], "v": 
"v0.3"} +{"i": [["[\"1f6cd3637ec856bf5cf5010a623eed05\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 7, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 8, 2, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 256, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 96, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 48, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000154153], 0, 2.18601, 1605187723], "v": "v0.3"} +{"i": [["[\"81aae4b8e2c076a4014d403e8a2c70a1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 2], 1], ["SP", 3, 10, 14, [1, 14, 1, 1], 1], ["SP", 3, 15, 256, [1, 32, 1, 2], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 128, [2, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 72, [24], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[5.97747e-05], 0, 2.13918, 1605187759], "v": "v0.3"} diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index d1b3c22d2084..a8bb8dd08f59 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -76,11 +76,11 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): ###################################################################### # Next, we set parameters for the auto-scheduler. These parameters -# mainly specify how we do the measurement during the search and auto-tuning. +# mainly specify how we do the measurement during the search. # -# * :code:`measure_ctx` launches a different process for measurement. This -# provides an isolation. It can protect the master process from GPU crashes -# happended during measurement and avoid other runtime conflicts. +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. 
It can protect the master process from GPU crashes +# during measurement and avoid other runtime conflicts. # * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. # This can warm up the GPU, which is necessary to get accurate measurement results. # Typically, we recommend a value > 300 ms.
@@ -96,7 +96,7 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): log_file = "conv2d.json" measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) tune_option = auto_scheduler.TuningOptions( - num_measure_trials=10, + num_measure_trials=10, # change this to 1000 to achieve the best performance runner=measure_ctx.runner, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], )
diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py new file mode 100644 index 000000000000..9eb5d5cdff0c --- /dev/null +++ b/tutorials/auto_scheduler/tune_network_cuda.py
@@ -0,0 +1,302 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-tuning a Neural Network for NVIDIA GPU +=========================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block.
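+
+For example, the wrapping could look like the following sketch, where :code:`main`
+is a hypothetical name for a function holding the tutorial body::
+
+    def main():
+        ...  # the code of this tutorial goes here
+
+    if __name__ == "__main__":
+        main()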
+""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_runtime + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# Note that although auto-scheduler can work with any layouts, +# we found that the best performance is typically archived with NHWC layout +# for convolutional neural networks, so we use NHWC layout in this tutorial. +# + + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target +network = "resnet-18" +batch_size = 1 +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" +log_file = "%s-%s-B%d.json" % (network, layout, batch_size) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. 
+ +# Enable auto-scheduler in relay +auto_scheduler.enable_relay_integration() + +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + +################################################################# +# Begin Tuning +# ------------ +# Now, we set some options for tuning and launch the search tasks. +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the master process from GPU crashes +# during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warm up the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value > 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`1000 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 21 tasks in resnet-18, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into the log file. +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * See :any:`auto_scheduler.TuningOptions` and +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. +# + + +def run_tuning(): + print("Begin tuning...") + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400, timeout=10) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=200, # change this to 20000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +# run_tuning() + + +###################################################################### +# .. note:: Explain the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# It is used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.014 | 72.07 | 64 | +# | 1 | 0.185 | 1250.68 | 128 | +# | 2 | 0.142 | 1626.36 | 192 | +# | 3 | 0.137 | 1689.42 | 128 | +# | 4 | 0.097 | 1189.75 | 128 | +# | 5 | 0.092 | 2505.25 | 128 | +# | 6 | 0.080 | 2893.08 | 128 | +# | 7 | 0.119 | 1947.84 | 128 | +# | 8 | 0.090 | 1292.62 | 64 | +# | 9 | 0.107 | 2172.30 | 64 | +# | 10 | 0.095 | 2439.36 | 64 | +# | 11 | 0.077 | 3003.22 | 64 | +# | 12 | 0.068 | 1695.13 | 64 | +# | 13 | 0.058 | 3979.29 | 64 | +# | 14 | 0.048 | 4859.95 | 128 | +# | 15 | 0.073 | 3151.76 | 64 | +# | 16 | 0.056 | 4265.94 | 64 | +# | 17 | 0.009 | 2754.90 | 64 | +# | 18 | 0.011 | 1156.08 | 64 | +# | 19 | 0.013 | 955.80 | 64 | +# | 20 | 0.029 | 437.71 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.649 ms Trials: 1920 Used time : 3598 s Next ID: 9 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "dmlc::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the master process. +# + +###################################################################### +# .. note:: Terminate the tuning early +# +# You can terminate the tuning early by forcibly killing this process. +# As long as you get at least one valid schedule for each task in the log file, +# you should be able to do the compilation (the section below). +# + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +print("Compile...") +with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + +# Create graph runtime +ctx = tvm.context(str(target), 0) +module = graph_runtime.GraphModule(lib["default"](ctx)) +data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) +module.set_input("data", data_tvm) + +# Evaluate +print("Evaluate inference time cost...") +ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) +prof_res = np.array(ftimer().results) * 1e3 # convert to milliseconds +print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) + + +################################################################# +# Other Tips +# -------------------- +# 1. During the tuning, the auto-scheduler needs to compile many programs and +# extract features from them.
This part is CPU-intensive, +# so a high-performance CPU with many cores is recommended for faster search. +# 2. If you have multiple GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section <tutorials-autotvm-rpc-tracker>` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. +#
diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 3dccefef4de9..91407133d695 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py
@@ -315,6 +315,7 @@ def tune_and_evaluate(tuning_opt): ################################################################# # Scale up measurement by using multiple devices # ---------------------------------------------- +# .. _tutorials-autotvm-rpc-tracker: # # If you have multiple devices, you can use all of them for measurement. # TVM uses the RPC Tracker to manage distributed devices.
From fcf9e78976b725bcd525128ed452187e933a7f05 Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Fri, 13 Nov 2020 06:42:14 -0700 Subject: [PATCH 169/258] Fix edge cases in const_int_bound and fold_scale_axis (#6911)
--- src/arith/const_int_bound.cc | 4 +- src/relay/transforms/fold_scale_axis.cc | 13 ++++++- .../python/relay/test_pass_fold_scale_axis.py | 38 +++++++++++++++++++ .../unittest/test_arith_const_int_bound.py | 14 +++++++ 4 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc index 2c01b9143155..75c09ac05073 100644 --- a/src/arith/const_int_bound.cc +++ b/src/arith/const_int_bound.cc
@@ -519,8 +519,8 @@ class ConstIntBoundAnalyzer::Impl */ static Entry MakeBound(int64_t min_value, int64_t max_value) { Entry e; - e.min_value = min_value; - e.max_value = max_value; + e.min_value = (min_value == kPosInf) ? min_value - 1 : min_value; + e.max_value = (max_value == kNegInf) ? max_value + 1 : max_value; return e; } /*!
diff --git a/src/relay/transforms/fold_scale_axis.cc b/src/relay/transforms/fold_scale_axis.cc index 23be70c1e442..a93532895b5a 100644 --- a/src/relay/transforms/fold_scale_axis.cc +++ b/src/relay/transforms/fold_scale_axis.cc
@@ -243,7 +243,18 @@ class ForwardPrep : private ExprVisitor { } } // Visitor pattern override. - void VisitExpr_(const LetNode* call) { LOG(FATAL) << "FoldScaleAxis only accept dataflow-form"; } + void VisitExpr_(const LetNode* op) { + ExprVisitor::VisitExpr_(op); + // do pass through condition + // by assigning NullValue + // it means fuse signal cannot pass + // through into these subexpressions.
+ auto flazy = [this, op]() { + this->Update(op->value, NullValue<Message>()); + this->Update(op->body, NullValue<Message>()); + }; + flist_.push_back(flazy); + } void VisitExpr_(const FunctionNode* op) { ExprVisitor::VisitExpr_(op);
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py index 421c6c5e8ef2..3c2dc82cb07b 100644 --- a/tests/python/relay/test_pass_fold_scale_axis.py +++ b/tests/python/relay/test_pass_fold_scale_axis.py
@@ -311,6 +311,44 @@ def check(shape, channels, blocking, in_scale): check((2, 11, 10, 2, 2), 4, (2, 2), in_scale) +def test_fold_fwd_let_fail(): + """testcase where we cannot fold""" + + def before(x, conv_weight, in_bias, in_scale, channels): + args = [x, conv_weight, in_bias] + x = relay.multiply(x, in_scale) + x = relay.nn.relu(x) + x = relay.add(x, in_bias) + x_var = relay.Var("x_var") + y1 = relay.nn.conv2d( + x_var, + conv_weight, + channels=channels, + kernel_size=(3, 3), + data_layout="NHWC", + kernel_layout="HWIO", + padding=(1, 1), + ) + z = relay.add(y1, x) + let = relay.Let(x_var, x, z) + return relay.Function(args, let) + + def check(shape, channels): + x = relay.var("x", shape=shape) + in_channels = shape[-1] + in_bias = relay.var("in_bias", shape=(in_channels,)) + in_scale = relay.const(_get_positive_scale(size=(in_channels,))) + # test depthwise + assert in_channels == channels + weight = relay.var("weight") + y1 = before(x, weight, in_bias, in_scale, channels) + y1 = run_opt_pass(y1, transform.InferType()) + y1_folded = run_opt_pass(y1, transform.ForwardFoldScaleAxis()) + assert tvm.ir.structural_equal(y1, y1_folded) + + check((2, 11, 10, 4), 4) + + def test_fold_fwd_negative_scale(): """Testcase of folding negative scale"""
diff --git a/tests/python/unittest/test_arith_const_int_bound.py b/tests/python/unittest/test_arith_const_int_bound.py index 84fc7fd64614..57e488f4f302 100644 --- a/tests/python/unittest/test_arith_const_int_bound.py +++ b/tests/python/unittest/test_arith_const_int_bound.py
@@ -76,6 +76,20 @@ def test_add_sub_bound(): assert bd.min_value == bd.NEG_INF assert bd.max_value == 1 + ## constants with negative or positive max(int64) occasionally show up + ## in models; this is to ensure we can handle those cases + analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, bd.NEG_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(bd.NEG_INF, bd.POS_INF), override=True) + bd = analyzer.const_int_bound(x + y) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + + analyzer.update(x, tvm.arith.ConstIntBound(bd.POS_INF, bd.POS_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(bd.NEG_INF, bd.POS_INF), override=True) + bd = analyzer.const_int_bound(x + y) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + def test_mul_bound(): analyzer = tvm.arith.Analyzer()
From c5aebfaa592584d96106daa43d5545151c32b45b Mon Sep 17 00:00:00 2001 From: Rohan Mukherjee Date: Fri, 13 Nov 2020 17:31:19 -0600 Subject: [PATCH 170/258] [TRT][BYOC] handling dynamism in TensorRT to support OD models (#6905)
* handling dynamism in TensorRT to support OD models refactoring test tensorrt code added comments to dynamic check wrapper log.warn changed to logger.info TRT codegen taking slice_mode into account TRT codegen to handle both stride_mode refactoring TRT codegen adding a test for dynamic offload [TRT] bug in codegen for slice_mode=end ctx determined from target in test + io test was missing * Addressed the formatting/refactoring comments * Addressed
comment in TRT codegen Lint formatting * Lint error * using slice_mode during strided slice registration in tensorrt.py * removed a few blank lines * addressing cli comment on elif-return * Added decorator for tensorrt functions with dynamism check skip_codegen added for test_tensorrt::test_dynamic_offload * addressed comments in PR + black linting * resolved import error in test_tensorrt * import mxnet location changed to pass CI * test_integration removed as components were run by pytest anyway --- python/tvm/relay/op/contrib/tensorrt.py | 222 ++++++--- src/relay/backend/contrib/tensorrt/codegen.cc | 17 +- src/runtime/contrib/tensorrt/tensorrt_ops.cc | 2 +- tests/python/contrib/test_tensorrt.py | 441 +++++++++++------- 4 files changed, 438 insertions(+), 244 deletions(-) diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index 24c468fee0fe..739d49c412e8 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -22,7 +22,7 @@ from tvm import relay from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name -from tvm.relay.expr import Call, Constant, Tuple, GlobalVar +from tvm.relay.expr import Call, Constant, Tuple, GlobalVar, Var, TupleGetItem from tvm.relay.expr_functor import ExprMutator logger = logging.getLogger("TensorRT") @@ -155,10 +155,46 @@ def partition_for_tensorrt( return mod, config +def check_dynamism(args, op_name): + """ + Check for dynamism inside any of the args in the op. + + Parameters + ---------- + args : tvm.ir.container.Array + Arguments of the op. Each of the argument shape is checked for presence of dynamic + components. + op_name: str + Name of the op for debugging purposes only. + Returns + ---------- + ret : bool + True if dynamism is present, False otherwise + """ + for arg in args: + if isinstance(arg, (Call, Var, Constant, TupleGetItem)): + for dim_shape in arg.checked_type.shape: + if isinstance(dim_shape, tvm.tir.expr.Any): + return True + elif isinstance(arg, Tuple): + return check_dynamism(arg.fields, op_name) + else: + logger.info( + "Arg not supported in TensorRT for %s with type %s", + op_name, + type(arg), + ) + return True + return False + + def _register_external_op_helper_with_checker(op_name, checker): @tvm.ir.register_op_attr(op_name, "target.tensorrt") def _func_wrapper(expr): attrs, args = expr.attrs, expr.args + # ops with dynamic shapes are offloaded to VM + if check_dynamism(args, op_name): + return False if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False @@ -173,6 +209,23 @@ def _register_external_op_helper(op_name, supported=True): ) +def _register_external_dynamic_check_func(op_name): + """Wrapper to check dynamic shapes inside any of the args in the op.""" + + def _decorator_helper(checker): + @tvm.ir.register_op_attr(op_name, "target.tensorrt") + def _func_wrapper(expr): + args = expr.args + # ops with dynamic shapes are offloaded to VM + if check_dynamism(args, op_name): + return False + return checker(expr) + + return _func_wrapper + + return _decorator_helper + + # Ops which are always supported _register_external_op_helper("nn.relu") _register_external_op_helper("sigmoid") @@ -192,7 +245,49 @@ def _register_external_op_helper(op_name, supported=True): _register_external_op_helper("clip") -@tvm.ir.register_op_attr("add", "target.tensorrt") +def reduce_annotate_fn(attrs, args, op_name): + """Helper for reduce operations.""" + if not 
attrs.axis or len(attrs.axis) == 0: + logger.info("%s: cannot reduce to scalar.", op_name) + return False + if attrs.exclude: + logger.info("%s: exclude not supported.", op_name) + return False + if get_tensorrt_use_implicit_batch_mode() and any([x == 0 for x in map(int, attrs.axis)]): + logger.info("%s: can't modify batch dimension.", op_name) + return False + return True + + +_register_external_op_helper_with_checker("sum", reduce_annotate_fn) +_register_external_op_helper_with_checker("prod", reduce_annotate_fn) +_register_external_op_helper_with_checker("max", reduce_annotate_fn) +_register_external_op_helper_with_checker("min", reduce_annotate_fn) +_register_external_op_helper_with_checker("mean", reduce_annotate_fn) + + +def trt_version_annotate_fn(version): + """Helper for ops which require a minimum TRT version""" + + def _func_wrapper(attrs, args, op_name): + if get_tensorrt_version() < version: + logger.info( + "%s: requires TensorRT version %s or higher.", op_name, ".".join(map(str, version)) + ) + return False + return True + + return _func_wrapper + + +_register_external_op_helper_with_checker("nn.leaky_relu", trt_version_annotate_fn((5, 1, 5))) +_register_external_op_helper_with_checker("sin", trt_version_annotate_fn((5, 1, 5))) +_register_external_op_helper_with_checker("cos", trt_version_annotate_fn((5, 1, 5))) +_register_external_op_helper_with_checker("atan", trt_version_annotate_fn((5, 1, 5))) +_register_external_op_helper_with_checker("ceil", trt_version_annotate_fn((5, 1, 5))) + + +@_register_external_dynamic_check_func("add") def add_annotate_fn(expr): # pylint: disable=unused-variable """Check if add is supported by TensorRT.""" @@ -212,7 +307,7 @@ def add_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.batch_norm", "target.tensorrt") +@_register_external_dynamic_check_func("nn.batch_norm") def batch_norm_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.batch_norm is supported by TensorRT.""" @@ -226,7 +321,7 @@ def batch_norm_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.softmax", "target.tensorrt") +@_register_external_dynamic_check_func("nn.softmax") def softmax_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.softmax is supported by TensorRT.""" @@ -240,7 +335,7 @@ def softmax_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.conv2d", "target.tensorrt") +@_register_external_dynamic_check_func("nn.conv2d") def conv2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv2d is supported by TensorRT.""" @@ -260,7 +355,7 @@ def conv2d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.dense", "target.tensorrt") +@_register_external_dynamic_check_func("nn.dense") def dense_annotate_fn(expr): # pylint: disable=unused-variable """Check if dense is supported by TensorRT.""" @@ -279,7 +374,7 @@ def dense_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.bias_add", "target.tensorrt") +@_register_external_dynamic_check_func("nn.bias_add") def bias_add_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.bias_add is supported by TensorRT.""" @@ -294,7 +389,7 @@ def bias_add_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.max_pool2d", "target.tensorrt") +@_register_external_dynamic_check_func("nn.max_pool2d") def 
max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.max_pool2d is supported by TensorRT.""" @@ -311,7 +406,7 @@ def max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.avg_pool2d", "target.tensorrt") +@_register_external_dynamic_check_func("nn.avg_pool2d") def avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.avg_pool2d is supported by TensorRT.""" @@ -341,7 +436,7 @@ def avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.tensorrt") +@_register_external_dynamic_check_func("nn.global_max_pool2d") def global_max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.global_max_pool2d is supported by TensorRT.""" @@ -355,7 +450,7 @@ def global_max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.tensorrt") +@_register_external_dynamic_check_func("nn.global_avg_pool2d") def global_avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.global_avg_pool2d is supported by TensorRT.""" @@ -369,7 +464,7 @@ def global_avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("expand_dims", "target.tensorrt") +@_register_external_dynamic_check_func("expand_dims") def expand_dims_annotate_fn(expr): # pylint: disable=unused-variable """Check if expand_dims is supported by TensorRT.""" @@ -383,7 +478,7 @@ def expand_dims_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("squeeze", "target.tensorrt") +@_register_external_dynamic_check_func("squeeze") def squeeze_annotate_fn(expr): # pylint: disable=unused-variable """Check if squeeze is supported by TensorRT.""" @@ -400,7 +495,7 @@ def squeeze_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("concatenate", "target.tensorrt") +@_register_external_dynamic_check_func("concatenate") def concatenate_annotate_fn(expr): # pylint: disable=unused-variable """Check if concatenate is supported by TensorRT.""" @@ -421,7 +516,7 @@ def concatenate_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.conv2d_transpose", "target.tensorrt") +@_register_external_dynamic_check_func("nn.conv2d_transpose") def conv2d_transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv2d_transpose is supported by TensorRT.""" @@ -446,7 +541,7 @@ def conv2d_transpose_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("transpose", "target.tensorrt") +@_register_external_dynamic_check_func("transpose") def transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if transpose is supported by TensorRT.""" @@ -460,7 +555,7 @@ def transpose_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("layout_transform", "target.tensorrt") +@_register_external_dynamic_check_func("layout_transform") def layout_transform_annotate_fn(expr): # pylint: disable=unused-variable """Check if layout_transform is supported by TensorRT.""" @@ -481,7 +576,7 @@ def layout_transform_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("reshape", "target.tensorrt") +@_register_external_dynamic_check_func("reshape") def reshape_annotate_fn(expr): # pylint: disable=unused-variable 
"""Check if reshape is supported by TensorRT.""" @@ -514,7 +609,7 @@ def reshape_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.pad", "target.tensorrt") +@_register_external_dynamic_check_func("nn.pad") def pad_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.pad is supported by TensorRT.""" @@ -536,49 +631,7 @@ def pad_annotate_fn(expr): # pylint: disable=unused-variable return True -def reduce_annotate_fn(attrs, args, op_name): - """Helper for reduce operations.""" - if not attrs.axis or len(attrs.axis) == 0: - logger.info("%s: cannot reduce to scalar.", op_name) - return False - if attrs.exclude: - logger.info("%s: exclude not supported.", op_name) - return False - if get_tensorrt_use_implicit_batch_mode() and any([x == 0 for x in map(int, attrs.axis)]): - logger.info("%s: can't modify batch dimension.", op_name) - return False - return True - - -_register_external_op_helper_with_checker("sum", reduce_annotate_fn) -_register_external_op_helper_with_checker("prod", reduce_annotate_fn) -_register_external_op_helper_with_checker("max", reduce_annotate_fn) -_register_external_op_helper_with_checker("min", reduce_annotate_fn) -_register_external_op_helper_with_checker("mean", reduce_annotate_fn) - - -def trt_version_annotate_fn(version): - """Helper for ops which require a minimum TRT version""" - - def _func_wrapper(attrs, args, op_name): - if get_tensorrt_version() < version: - logger.info( - "%s: requires TensorRT version %s or higher.", op_name, ".".join(map(str, version)) - ) - return False - return True - - return _func_wrapper - - -_register_external_op_helper_with_checker("nn.leaky_relu", trt_version_annotate_fn((5, 1, 5))) -_register_external_op_helper_with_checker("sin", trt_version_annotate_fn((5, 1, 5))) -_register_external_op_helper_with_checker("cos", trt_version_annotate_fn((5, 1, 5))) -_register_external_op_helper_with_checker("atan", trt_version_annotate_fn((5, 1, 5))) -_register_external_op_helper_with_checker("ceil", trt_version_annotate_fn((5, 1, 5))) - - -@tvm.ir.register_op_attr("strided_slice", "target.tensorrt") +@_register_external_dynamic_check_func("strided_slice") def strided_slice_annotate_fn(expr): # pylint: disable=unused-variable """Check if strided_slice is supported by TensorRT.""" @@ -601,11 +654,33 @@ def strided_slice_annotate_fn(expr): # pylint: disable=unused-variable if any([x is not None and x <= 0 for x in attrs.strides]): logger.info("strided_slice: stride must be positive") return False + for i in range(0, len(args[0].checked_type.shape)): + begin = int(attrs.begin[i]) + if attrs.slice_mode == "end": + end = ( + int(attrs.end[i]) + if attrs.end[i] is not None and int(attrs.end[i]) != -1 + else args[0].checked_type.shape[i] + ) + size = int(end) - int(begin) + elif attrs.slice_mode == "size": + size = ( + int(attrs.end[i]) + if attrs.end[i] is not None and int(attrs.end[i]) != -1 + else args[0].checked_type.shape[i] - begin + ) + else: + logger.warning("strided_slice: unknown slice mode encountered") + + if int(size) < 1: + logger.info("strided_slice: size of slice must be at least 1") + return False + return True -@tvm.ir.register_op_attr("nn.adaptive_max_pool2d", "target.tensorrt") -def adapative_max_pool2d_annotate_fn(expr): # pylint: disable=unused-variable +@_register_external_dynamic_check_func("nn.adaptive_max_pool2d") +def adaptive_max_pool2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.adaptive_max_pool2d is supported by TensorRT.""" attrs, args = 
expr.attrs, expr.args @@ -618,8 +693,8 @@ def adapative_max_pool2d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.adaptive_avg_pool2d", "target.tensorrt") -def adapative_avg_pool2d_annotate_fn(expr): # pylint: disable=unused-variable +@_register_external_dynamic_check_func("nn.adaptive_avg_pool2d") +def adaptive_avg_pool2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.adaptive_avg_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args @@ -632,7 +707,7 @@ def adapative_avg_pool2d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.conv3d", "target.tensorrt") +@_register_external_dynamic_check_func("nn.conv3d") def conv3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv3d is supported by TensorRT.""" @@ -654,7 +729,7 @@ def conv3d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.max_pool3d", "target.tensorrt") +@_register_external_dynamic_check_func("nn.max_pool3d") def max_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.max_pool3d is supported by TensorRT.""" @@ -670,7 +745,7 @@ def max_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.avg_pool3d", "target.tensorrt") +@_register_external_dynamic_check_func("nn.avg_pool3d") def avg_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.avg_pool3d is supported by TensorRT.""" @@ -686,7 +761,7 @@ def avg_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable return True -@tvm.ir.register_op_attr("nn.conv3d_transpose", "target.tensorrt") +@_register_external_dynamic_check_func("nn.conv3d_transpose") def conv3d_transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv3d_transpose is supported by TensorRT.""" @@ -774,13 +849,10 @@ def visit_call(self, call): new_body = relay.bind(func.body, var_map) return new_body if name != "main": - # Copy the GlobalVar (subgraph function) to the new module and call. args = [] for arg in call.args: args.append(super().visit(arg)) - subgraph_gv = relay.GlobalVar(name) - self.new_mod[subgraph_gv] = self.mod[name] - return subgraph_gv(*args) + return call.op(*args) return super().visit_call(call) subgraphs_to_remove = [] @@ -792,7 +864,7 @@ def visit_call(self, call): if not is_valid_subgraph(mod[name].params, mod[name].body): subgraphs_to_remove.append(name) # Create new pruned module - new_mod = tvm.IRModule() + new_mod = tvm.IRModule(mod.functions, mod.type_definitions) new_mod["main"] = SubgraphRemover(subgraphs_to_remove, mod, new_mod).visit(mod["main"]) return new_mod diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index 26f674dcd7b5..cb648333df8d 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -140,18 +140,25 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { std::vector start, size, strides; for (size_t i = 0; i < attrs->begin.value().size(); ++i) { const int begin_value = process_slice_index(attrs->begin.value()[i], 0, ishape[i]); - const int end_value = process_slice_index(attrs->end.value()[i], ishape[i], ishape[i]); + ICHECK_GE(begin_value, 0); + start.push_back(std::to_string(begin_value)); const int stride_value = (default_strides || i >= attrs->strides.value().size() || !attrs->strides.value()[i].defined()) ? 
1 : attrs->strides.value()[i].as()->value; ICHECK_GT(stride_value, 0); - const int size_value = (end_value - begin_value + stride_value - 1) / stride_value; - ICHECK_GE(begin_value, 0); + strides.push_back(std::to_string(stride_value)); + int size_value; + if (attrs->slice_mode == "end") { + const int end_value = process_slice_index(attrs->end.value()[i], ishape[i], ishape[i]); + size_value = (end_value - begin_value + stride_value - 1) / stride_value; + } else if (attrs->slice_mode == "size") { + // with slice_mode = "size", the values in attrs->end give the size of the slice + int end_value = attrs->end.value()[i].as()->value; + size_value = (end_value == -1) ? ishape[i] - begin_value : end_value; + } ICHECK_GT(size_value, 0); - start.push_back(std::to_string(begin_value)); size.push_back(std::to_string(size_value)); - strides.push_back(std::to_string(stride_value)); } std::vector start_attr, size_attr, strides_attr; start_attr.emplace_back(start); diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc index a86f107941bc..057743c3b588 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc @@ -944,7 +944,7 @@ class ReduceOpConverter : public TensorRTOpConverter { #if TRT_VERSION_GE(5, 1, 5) class StridedSliceOpConverter : public TensorRTOpConverter { public: - StridedSliceOpConverter() : TensorRTOpConverter({kTensor, kWeight, kWeight, kWeight}) {} + StridedSliceOpConverter() : TensorRTOpConverter({kTensor}) {} void Convert(TensorRTOpConverterParams* params) const { auto input = params->inputs.at(0).tensor; diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 8e8e54e8650a..8b61323a71ad 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -17,12 +17,15 @@ import numpy as np import time import pytest +import itertools import tvm import tvm.relay.testing from tvm import relay from tvm.relay.op.contrib import tensorrt -from tvm.contrib import graph_runtime +from tvm.contrib import graph_runtime, utils +from tvm.runtime.vm import VirtualMachine +from tvm.relay import Any, GlobalVar, transform def skip_codegen_test(): @@ -46,6 +49,23 @@ def skip_runtime_test(): return False +def vmobj_to_list(o): + if isinstance(o, tvm.nd.NDArray): + return [o.asnumpy()] + elif isinstance(o, tvm.runtime.container.ADT) or isinstance(o, list): + return [vmobj_to_list(f) for f in o] + else: + raise RuntimeError("Unknown object type: %s" % type(o)) + + +def assert_result_dict_holds(result_dict): + for k1, k2 in itertools.combinations(result_dict, 2): + res1 = vmobj_to_list(result_dict[k1]) + res2 = vmobj_to_list(result_dict[k2]) + for r1, r2 in zip(res1, res2): + tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3) + + def run_and_verify_func(config, target="cuda"): """Test a Relay func by compiling, running, and comparing TVM and TRT outputs.
@@ -64,100 +84,76 @@ def run_and_verify_func(config, target="cuda"): for k, v in input_shapes.items() if k not in is_param } - - # Run TRT - mod = tvm.IRModule() - mod["main"] = f - mod, config = tensorrt.partition_for_tensorrt(mod, params) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): - graph, lib, graph_params = relay.build(mod, target, params=params) - if skip_runtime_test(): - return ctx = tvm.context(target) - mod = graph_runtime.create(graph, lib, ctx=ctx) - mod.set_input(**graph_params) - mod.run(**input_dict) - results = [mod.get_output(i) for i in range(mod.get_num_outputs())] - - # Run reference - mod = tvm.IRModule() - mod["main"] = f - with tvm.transform.PassContext(opt_level=3): - graph, lib, graph_params = relay.build(mod, target, params=params) - mod = graph_runtime.create(graph, lib, ctx=ctx) - mod.set_input(**graph_params) - mod.run(**input_dict) - ref_results = [mod.get_output(i) for i in range(mod.get_num_outputs())] - assert len(results) == len(ref_results) - for i in range(len(results)): - res = results[i].asnumpy() - ref_res = ref_results[i].asnumpy() - assert res.shape == ref_res.shape - tvm.testing.assert_allclose(res, ref_res, rtol=1e-3, atol=1e-3) + result_dict = dict() + for mode in ["graph", "vm"]: + for use_trt in [False, True]: + mod = tvm.IRModule() + mod["main"] = f + result_key = mode + ("_trt" if use_trt else "") + if use_trt: + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext( + opt_level=3, config={"relay.ext.tensorrt.options": config} + ): + exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target) + else: + with tvm.transform.PassContext(opt_level=3): + exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target) + if not skip_runtime_test(): + result_dict[result_key] = exec.evaluate()(**input_dict, **params) + + if not skip_runtime_test(): + assert_result_dict_holds(result_dict) def run_and_verify_model(model): if skip_codegen_test(): return - def compile_and_run(i_data, input_shape, dtype, use_trt=True, num_iteration=1): - import mxnet as mx - from mxnet.gluon.model_zoo.vision import get_model - - def check_trt_used(graph): - import json + import mxnet as mx + from mxnet.gluon.model_zoo.vision import get_model - graph = json.loads(graph) - num_trt_subgraphs = sum( - [ - 1 - for n in graph["nodes"] - if n.get("attrs", {}).get("func_name", "").startswith("tensorrt_") - ] - ) - assert num_trt_subgraphs >= 1 + def check_trt_used(mod): + num_trt_subgraphs = sum( + [1 if gv.name_hint == "tensorrt_0" else 0 for gv in mod.get_global_vars()] + ) + assert num_trt_subgraphs == 1 - block = get_model(model, pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + def compile_and_run(mod, params, i_data, mode="vm", use_trt=True): + assert mode in ["graph", "vm"] if use_trt: mod, config = tensorrt.partition_for_tensorrt(mod, params) + check_trt_used(mod) with tvm.transform.PassContext( opt_level=3, config={"relay.ext.tensorrt.options": config} ): - graph, lib, params = relay.build(mod, "cuda", params=params) - check_trt_used(graph) + exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") else: with tvm.transform.PassContext(opt_level=3): - graph, lib, params = relay.build(mod, "cuda", params=params) - - if skip_runtime_test(): - return - mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) - mod.set_input(**params) - # Warmup - for i in range(10): - mod.run(data=i_data) - # Time - 
times = [] - for i in range(num_iteration): - start_time = time.time() - mod.run(data=i_data) - res = mod.get_output(0) - times.append(time.time() - start_time) - latency = 1000.0 * np.mean(times) - print(model, latency) + exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + + res = exec.evaluate()(i_data, **params) if not skip_runtime_test() else None return res dtype = "float32" input_shape = (1, 3, 224, 224) i_data = np.random.uniform(-1, 1, input_shape).astype(dtype) - res = compile_and_run(i_data, input_shape, dtype, use_trt=True) - if skip_runtime_test(): - return - ref_res = compile_and_run(i_data, input_shape, dtype, use_trt=False) - tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-3) + block = get_model(model, pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + + result_dict = dict() + for mode in ["vm", "graph"]: + for use_trt in [True, False]: + result_key = mode + ("_trt" if use_trt else "") + result_dict[result_key] = compile_and_run( + mod, params, i_data, mode=mode, use_trt=use_trt + ) + + if not skip_runtime_test(): + assert_result_dict_holds(result_dict) def test_tensorrt_simple(): @@ -174,19 +170,30 @@ def test_tensorrt_simple(): out = relay.nn.relu(w) f = relay.Function([x, y, z], out) - mod = tvm.IRModule() - mod["main"] = f - mod, config = tensorrt.partition_for_tensorrt(mod) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): - graph, lib, params = relay.build(mod, "cuda") - if skip_runtime_test(): - return - mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) x_data = np.random.uniform(-1, 1, xshape).astype(dtype) y_data = np.random.uniform(-1, 1, yshape).astype(dtype) z_data = np.random.uniform(-1, 1, zshape).astype(dtype) - mod.run(x=x_data, y=y_data, z=z_data) - results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] + + result_dict = dict() + for mode in ["vm", "graph"]: + for use_trt in [True, False]: + mod = tvm.IRModule() + mod["main"] = f + result_key = mode + ("_trt" if use_trt else "") + if use_trt: + mod, config = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext( + opt_level=3, config={"relay.ext.tensorrt.options": config} + ): + relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + else: + with tvm.transform.PassContext(opt_level=3): + relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + if not skip_runtime_test(): + result_dict[result_key] = relay_exec.evaluate()(x_data, y_data, z_data) + + if not skip_runtime_test(): + assert_result_dict_holds(result_dict) def test_tensorrt_simple_cpu_io(): @@ -211,6 +218,8 @@ def test_tensorrt_not_compatible(): return dtype = "float32" xshape = (1, 32, 14, 14) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + x = relay.var("x", shape=(xshape), dtype=dtype) y = relay.add(x, x) z = relay.erf(y) @@ -219,40 +228,116 @@ def test_tensorrt_not_compatible(): mod = tvm.IRModule() mod["main"] = f mod, config = tensorrt.partition_for_tensorrt(mod) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): - graph, lib, params = relay.build(mod, "cuda") - if skip_runtime_test(): - return - mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) - x_data = np.random.uniform(-1, 1, xshape).astype(dtype) - mod.run(x=x_data) - results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] + for mode in ["graph", "vm"]: + 
with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + if not skip_runtime_test(): + results = exec.evaluate()(x_data) -def test_tensorrt_serialize(): +def test_tensorrt_serialize_graph_runtime(): if skip_codegen_test(): return - import mxnet + import mxnet as mx from mxnet.gluon.model_zoo.vision import get_model + data_shape = (1, 3, 224, 224) + data_type = "float32" + i_data = np.random.uniform(0, 1, data_shape).astype(data_type) block = get_model("resnet18_v1", pretrained=True) - mod, params = relay.frontend.from_mxnet( - block, shape={"data": (1, 3, 224, 224)}, dtype="float32" - ) - # Compile - mod, config = tensorrt.partition_for_tensorrt(mod, params) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): - lib = relay.build(mod, "cuda", params=params) - # Serialize - lib.export_library("compiled.so") - # Deserialize - loaded_lib = tvm.runtime.load_module("compiled.so") - # Run - if skip_runtime_test(): + mod, params = relay.frontend.from_mxnet(block, shape={"data": data_shape}, dtype=data_type) + mod, config = tensorrt.partition_for_tensorrt(mod) + tmpdir = utils.tempdir() + + def compile_graph(mod, params): + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, params = relay.build(mod, params=params, target="cuda") + params = relay.save_param_dict(params) + return graph, lib, params + + def run_graph(graph, lib, params): + mod_ = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod_.load_params(params) + mod_.run(data=i_data) + res = mod_.get_output(0) + return res + + def save_graph(graph, lib, params): + # Serialize + with open(tmpdir.relpath("compiled.json"), "w") as f_graph_json: + f_graph_json.write(graph) + with open(tmpdir.relpath("compiled.params"), "wb") as f_params: + f_params.write(params) + lib.export_library(tmpdir.relpath("compiled.so")) + + def load_graph(): + # Deserialize + with open(tmpdir.relpath("compiled.json"), "r") as f_graph_json: + graph = f_graph_json.read() + with open(tmpdir.relpath("compiled.params"), "rb") as f_params: + params = bytearray(f_params.read()) + lib = tvm.runtime.load_module(tmpdir.relpath("compiled.so")) + return graph, lib, params + + # Test serialization with graph runtime + graph, lib, graph_params = compile_graph(mod, params) + save_graph(graph, lib, graph_params) + loaded_graph, loaded_lib, loaded_params = load_graph() + + if not skip_runtime_test(): + result_dict = dict() + result_dict["graph"] = run_graph(graph, lib, graph_params) + result_dict["graph_ref"] = run_graph(loaded_graph, loaded_lib, loaded_params) + assert_result_dict_holds(result_dict) + + +def test_tensorrt_serialize_vm(): + if skip_codegen_test(): return - gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib["default"](tvm.gpu(0))) - i_data = np.random.uniform(0, 1, (1, 3, 224, 224)).astype("float32") - gen_module.run(data=i_data) + import mxnet as mx + from mxnet.gluon.model_zoo.vision import get_model + + data_shape = (1, 3, 224, 224) + data_type = "float32" + i_data = np.random.uniform(0, 1, data_shape).astype(data_type) + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": data_shape}, dtype=data_type) + mod, config = tensorrt.partition_for_tensorrt(mod) + tmpdir = utils.tempdir() + + def compile_vm(mod, params): + with tvm.transform.PassContext(opt_level=3, 
config={"relay.ext.tensorrt.options": config}): + vm_exec = relay.vm.compile(mod, target="cuda", params=params) + code, lib = vm_exec.save() + return code, lib + + def run_vm(code, lib): + vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib) + vm = VirtualMachine(vm_exec, tvm.gpu(0)) + result = vm.invoke("main", data=i_data) + return result + + def save_vm(code, lib): + # save and load the code and lib file. + lib.export_library(tmpdir.relpath("path_lib.so")) + with open(tmpdir.relpath("path_code.ro"), "wb") as fo: + fo.write(code) + + def load_vm(): + lib = tvm.runtime.load_module(tmpdir.relpath("path_lib.so")) + code = bytearray(open(tmpdir.relpath("path_code.ro"), "rb").read()) + return lib, code + + # Test serialization with VM + code_vm, lib_vm = compile_vm(mod, params) + save_vm(code_vm, lib_vm) + loaded_lib_vm, loaded_code_vm = load_vm() + + if not skip_runtime_test(): + result_dict = dict() + result_dict["vm"] = run_vm(code_vm, lib_vm) + result_dict["vm_ref"] = run_vm(loaded_code_vm, loaded_lib_vm) + assert_result_dict_holds(result_dict) def test_conv2d(): @@ -701,27 +786,40 @@ def get_graph(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False): def test_strided_slice(): - def get_graph(x_shape, begin, end, strides=None): + def get_graph(x_shape, begin, end, strides=None, slice_mode="size"): x = relay.var("x", shape=(x_shape), dtype="float32") if strides: out = relay.strided_slice( x, - relay.expr.const(begin, dtype="int32"), - relay.expr.const(end, dtype="int32"), - relay.expr.const(strides, dtype="int32"), + begin, + end, + strides, + slice_mode=slice_mode, ) else: out = relay.strided_slice( x, - relay.expr.const(begin, dtype="int32"), - relay.expr.const(end, dtype="int32"), + begin, + end, + slice_mode=slice_mode, ) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph((1, 3, 6, 7), [0, 0, 0, 0], [1, 1, 6, 7])) - run_and_verify_func(get_graph((1, 3, 6, 7), [0, 1, 0, 0], [1, 2, 6, 6])) - run_and_verify_func(get_graph((1, 10), [0, 0], [1, 10], [1, 2])) + for slice_mode in ["size", "end"]: + run_and_verify_func( + get_graph((1, 3, 6, 7), (0, 0, 0, 0), (1, 1, 6, 7), slice_mode=slice_mode) + ) + run_and_verify_func( + get_graph((1, 3, 6, 7), [0, 1, 0, 0], [1, 2, 6, 6], slice_mode=slice_mode) + ) + run_and_verify_func( + get_graph((2, 3, 6, 7), [0, 0, 0, 0], [-1, -1, -1, -1], slice_mode=slice_mode) + ) + run_and_verify_func( + get_graph((2, 3, 6, 7), [0, 1, 0, 0], [-1, -1, -1, -1], slice_mode=slice_mode) + ) + run_and_verify_func(get_graph((1, 6), [0, 1], [1, 3], slice_mode=slice_mode)) def test_adaptive_pool2d(): @@ -874,50 +972,67 @@ def test_densenet121(): run_and_verify_model("densenet121") +def test_dynamic_offload(): + """ + This test checks for proper dynamic offloading of relay graphs. An addition between + the outputs of two conv2d's is performed, one of them having all static args whereas + the other has a arg with dynamic shape. It is expected for the TRT partitioner to + offload the conv2d with dynamic arg to TVM while running the other in TRT. 
+ """ + + if skip_codegen_test(): + return + + data_shape = (1, 32, 8, 8) + k_shape = (1, 32, 3, 3) + + x = relay.var("x", shape=(data_shape[0], data_shape[1], Any(), Any()), dtype="float32") + y = relay.var("y", shape=(data_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + + def get_expected(): + def set_func_attr(func, compile_name, symbol_name): + func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1)) + func = func.with_attr("Compiler", compile_name) + func = func.with_attr("global_symbol", symbol_name) + return func + + # Create a nested TRT function that matches the expected output + mod = tvm.IRModule() + var1 = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32") + kernel_trt = relay.var("tensorrt_0_i1", shape=(k_shape), dtype="float32") + out1 = relay.nn.conv2d(var1, kernel_trt, channels=k_shape[0], kernel_size=k_shape[2:4]) + f1 = GlobalVar("tensorrt_0") + func = relay.Function([var1, kernel_trt], out1) + func = set_func_attr(func, "tensorrt", "tensorrt_0") + mod[f1] = func + mod = relay.transform.InferType()(mod) + + # Create the main function + out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]) + out = relay.add(out1, f1(y, kernel)) + f = relay.Function([x, y, kernel], out) + mod["main"] = f + mod = relay.transform.InferType()(mod) + return mod + + # Create relay function that will be offloaded to TRT + out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]) + out2 = relay.nn.conv2d(y, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]) + out = relay.add(out1, out2) + f = relay.Function([x, y, kernel], out) + + # Pass the function to TRT compilation + mod = tvm.IRModule() + mod["main"] = f + mod = relay.transform.InferType()(mod) + mod_trt, config = tensorrt.partition_for_tensorrt(mod, params={}) + + # Get the expected relay graph and compare + mod_exp = get_expected() + tvm.ir.assert_structural_equal(mod_trt, mod_exp, map_free_vars=True) + + if __name__ == "__main__": - test_tensorrt_not_compatible() - test_tensorrt_simple() - test_tensorrt_simple_cpu_io() - test_tensorrt_serialize() - - # Op tests - test_conv2d() - test_conv2d_nhwc() - test_conv2d_weights_const() - test_conv2d_weights_transposed() - test_dense() - test_bias_add() - test_pool2d() - test_global_pool2d() - test_batch_flatten() - test_expand_dims() - test_squeeze() - test_concatenate() - test_conv2d_transpose() - test_reshape() - test_transpose() - test_float_const() - test_pad() - test_softmax() - test_batch_norm() - test_unary() - test_clip() - test_leaky_relu() - test_binary() - test_reduce() - test_strided_slice() - test_adaptive_pool2d() - test_multiple_outputs() - test_conv3d() - test_pool3d() - test_conv3d_transpose() - - # Integration tests - test_alexnet() - test_resnet18_v1() - test_resnet18_v2() - test_squeezenet() - test_mobilenet() - test_mobilenet_v2() - test_vgg11() - test_densenet121() + pytest.main([__file__]) From 058463431cda33190168991f043cb3d977c61835 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 13 Nov 2020 16:18:40 -0800 Subject: [PATCH 171/258] Consolidate RPC Context helper functions (#6915) --- include/tvm/runtime/device_api.h | 52 ++++++++++++++++++++++++++++--- src/runtime/rpc/rpc_device_api.cc | 35 ++++++++------------- src/runtime/rpc/rpc_endpoint.cc | 2 +- src/runtime/rpc/rpc_module.cc | 19 +++++------ 4 files changed, 69 insertions(+), 39 deletions(-) diff --git 
a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index c6a2ce3d28d0..a6f5624de084 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -240,13 +240,55 @@ inline const char* DeviceName(int type) { } } +/*! + * \brief Return true if a TVMContext is owned by an RPC session. + */ +inline bool IsRPCSessionContext(TVMContext ctx) { return (ctx.device_type / kRPCSessMask) > 0; } + +/*! + * \brief Return the RPCSessTable index of the RPC Session that owns this context. + * \return the table index. + */ +inline int GetRPCSessionIndex(TVMContext ctx) { + ICHECK(IsRPCSessionContext(ctx)) << "GetRPCSessionIndex: ctx has no RPC session"; + return ctx.device_type / kRPCSessMask - 1; +} + +/*! + * \brief Remove the RPC session mask from a TVMContext. + * RPC clients typically do this when encoding a TVMContext for transmission to an RPC remote. + * On the wire, RPC contexts are expected to be valid on the server without interpretation. + * \param ctx A TVMContext with non-zero RPC Session mask, valid on the RPC client. + * \return A TVMContext without any RPC Session mask, valid on the RPC server. + */ +inline TVMContext RemoveRPCSessionMask(TVMContext ctx) { + ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); + return ctx; +} + +inline std::ostream& operator<<(std::ostream& os, DLContext ctx); + +/*! + * \brief Add an RPC session mask to a TVMContext. + * RPC clients typically do this when decoding a TVMContext received from an RPC remote. + * \param ctx A TVMContext without any RPC Session mask, valid on the RPC server. + * \param session_table_index Numeric index of the RPC session in the session table. + * \return A TVMContext with RPC session mask added, valid on the RPC client. + */ +inline TVMContext AddRPCSessionMask(TVMContext ctx, int session_table_index) { + CHECK(!IsRPCSessionContext(ctx)) + << "AddRPCSessionMask: ctx already non-zero RPCSessionIndex: " << ctx; + ctx.device_type = + static_cast(ctx.device_type | (kRPCSessMask * (session_table_index + 1))); + return ctx; +} + inline std::ostream& operator<<(std::ostream& os, DLContext ctx) { // NOLINT(*) - int device_type = static_cast(ctx.device_type); - if (device_type > kRPCSessMask) { - os << "remote[" << (device_type / kRPCSessMask) << "]-"; - device_type = device_type % kRPCSessMask; + if (IsRPCSessionContext(ctx)) { + os << "remote[" << GetRPCSessionIndex(ctx) << "]-"; + ctx = RemoveRPCSessionMask(ctx); } - os << runtime::DeviceName(device_type) << "(" << ctx.device_id << ")"; + os << runtime::DeviceName(static_cast(ctx.device_type)) << "(" << ctx.device_id << ")"; return os; } } // namespace runtime diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index 943990fd9585..a1e96e92b4e0 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -34,19 +34,19 @@ namespace runtime { class RPCDeviceAPI final : public DeviceAPI { public: void SetDevice(TVMContext ctx) final { - auto remote_ctx = RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); GetSess(ctx)->GetDeviceAPI(remote_ctx)->SetDevice(remote_ctx); } void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { - auto remote_ctx = RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); GetSess(ctx)->GetDeviceAPI(remote_ctx)->GetAttr(remote_ctx, kind, rv); } void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final { auto sess = GetSess(ctx); auto remote_ctx =
RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); void* data = sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, nbytes, alignment, type_hint); @@ -57,7 +57,7 @@ class RPCDeviceAPI final : public DeviceAPI { } void FreeDataSpace(TVMContext ctx, void* ptr) final { RemoteSpace* space = static_cast(ptr); - auto remote_ctx = RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); try { GetSess(ctx)->GetDeviceAPI(remote_ctx)->FreeDataSpace(remote_ctx, space->data); } catch (const dmlc::Error& e) { @@ -68,13 +68,11 @@ class RPCDeviceAPI final : public DeviceAPI { void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { - int from_dev_type = ctx_from.device_type; - int to_dev_type = ctx_to.device_type; - if (from_dev_type > kRPCSessMask && to_dev_type > kRPCSessMask) { + if (IsRPCSessionContext(ctx_from) && IsRPCSessionContext(ctx_to)) { ICHECK(ctx_from.device_type == ctx_to.device_type) << "Cannot copy across two different remote session"; - auto remote_ctx_from = RemoveSessMask(ctx_from); - auto remote_ctx_to = RemoveSessMask(ctx_to); + auto remote_ctx_from = RemoveRPCSessionMask(ctx_from); + auto remote_ctx_to = RemoveRPCSessionMask(ctx_to); auto remote_ctx = remote_ctx_from; if (remote_ctx.device_type == kDLCPU) remote_ctx = remote_ctx_to; GetSess(ctx_from) @@ -82,12 +80,12 @@ class RPCDeviceAPI final : public DeviceAPI { ->CopyDataFromTo(static_cast(from)->data, from_offset, static_cast(to)->data, to_offset, size, remote_ctx_from, remote_ctx_to, type_hint, stream); - } else if (from_dev_type > kRPCSessMask && to_dev_type == kDLCPU) { - auto remote_ctx_from = RemoveSessMask(ctx_from); + } else if (IsRPCSessionContext(ctx_from) && ctx_to.device_type == kDLCPU) { + auto remote_ctx_from = RemoveRPCSessionMask(ctx_from); GetSess(ctx_from)->CopyFromRemote(static_cast(from)->data, from_offset, to, to_offset, size, remote_ctx_from, type_hint); - } else if (from_dev_type == kDLCPU && to_dev_type > kRPCSessMask) { - auto remote_ctx_to = RemoveSessMask(ctx_to); + } else if (ctx_from.device_type == kDLCPU && IsRPCSessionContext(ctx_to)) { + auto remote_ctx_to = RemoveRPCSessionMask(ctx_to); GetSess(ctx_to)->CopyToRemote(const_cast(from), from_offset, static_cast(to)->data, to_offset, size, remote_ctx_to, type_hint); @@ -97,22 +95,15 @@ class RPCDeviceAPI final : public DeviceAPI { } void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - auto remote_ctx = RemoveSessMask(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); GetSess(ctx)->GetDeviceAPI(remote_ctx)->StreamSync(remote_ctx, stream); } private: std::shared_ptr GetSess(TVMContext ctx) { - int dev_type = ctx.device_type; - ICHECK_GE(dev_type, kRPCSessMask); - int tbl_index = dev_type / kRPCSessMask - 1; + int tbl_index = GetRPCSessionIndex(ctx); return RPCSession::Get(tbl_index); } - - static TVMContext RemoveSessMask(TVMContext ctx) { - ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); - return ctx; - } }; TVM_REGISTER_GLOBAL("device_api.rpc").set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc index b8c2a3bb0b97..fbdd93fb4f62 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -178,7 +178,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { << args[i].AsObjectRef()->GetTypeKey() << " is not supported by RPC"; } else if (tcode == 
kTVMContext) { DLContext ctx = args[i]; - ICHECK_LT(static_cast(ctx.device_type), kRPCSessMask) + ICHECK(!IsRPCSessionContext(ctx)) << "InternalError: cannot pass RPC context in the channel"; } } diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index a3d888e927ed..165c0fe73b36 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -108,11 +108,10 @@ class RPCWrappedFunc : public Object { // remove a remote session mask TVMContext RemoveSessMask(TVMContext ctx) const { - int dev_type = ctx.device_type; - ICHECK_EQ(dev_type / kRPCSessMask, sess_->table_index() + 1) - << "Can not pass in local context or context with a different remote session"; - ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); - return ctx; + ICHECK(IsRPCSessionContext(ctx)) << "Can not pass in local context"; + ICHECK_EQ(GetRPCSessionIndex(ctx), sess_->table_index()) + << "Can not pass in context with a different remote session"; + return RemoveRPCSessionMask(ctx); } // deleter of RPC remote array @@ -141,13 +140,12 @@ class RPCWrappedFunc : public Object { // setup dtype data->dl_tensor.dtype = tensor->dtype; // setup ctx, encode as remote session - data->dl_tensor.ctx.device_id = tensor->ctx.device_id; - data->dl_tensor.ctx.device_type = static_cast( - static_cast(tensor->ctx.device_type) + kRPCSessMask * (sess_->table_index() + 1)); + data->dl_tensor.ctx = AddRPCSessionMask(tensor->ctx, sess_->table_index()); // check strides. ICHECK(tensor->strides == nullptr); // setup byteoffset data->dl_tensor.byte_offset = tensor->byte_offset; + return ret; } }; @@ -189,10 +187,9 @@ class RPCModuleNode final : public ModuleNode { int min_repeat_ms, const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass ctx by parts. - int dev_type = ctx.device_type; - ICHECK_EQ(dev_type / kRPCSessMask, sess_->table_index() + 1) + ICHECK_EQ(GetRPCSessionIndex(ctx), sess_->table_index()) << "ValueError: Need to pass the matched remote context to RPCModule.GetTimeEvaluator"; - ctx.device_type = static_cast(ctx.device_type % kRPCSessMask); + ctx = RemoveRPCSessionMask(ctx); if (module_handle_ != nullptr) { return remote_get_time_evaluator_(GetRef(this), name, From 1552a761b66fb9ad0042d7b28ae0f0c94918f8ea Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sat, 14 Nov 2020 05:39:48 -0800 Subject: [PATCH 172/258] Make TVMLogf platform-independent (#6916) * Make TVMLogf platform-independent. * Some platforms need to use an alternate printf() to support basic things like %zu. Since %zu is platform-specific, we prefer to use a printf() that supports it or allow the platform to fix it up as needed. * git-clang-format --- include/tvm/runtime/crt/platform.h | 17 +++++++++++++++++ src/runtime/crt/host/main.cc | 5 +++++ src/runtime/crt/utvm_rpc_server/rpc_server.cc | 2 +- tests/micro/qemu/zephyr-runtime/src/main.c | 5 +++++ 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/include/tvm/runtime/crt/platform.h b/include/tvm/runtime/crt/platform.h index 782060dfd000..0f8c6ba7baf2 100644 --- a/include/tvm/runtime/crt/platform.h +++ b/include/tvm/runtime/crt/platform.h @@ -25,6 +25,8 @@ #ifndef TVM_RUNTIME_CRT_PLATFORM_H_ #define TVM_RUNTIME_CRT_PLATFORM_H_ +#include +#include #include #ifdef __cplusplus @@ -39,6 +41,21 @@ extern "C" { */ void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t code); +/*! \brief Called by the microTVM RPC server to implement TVMLogf. 
+ * + * Not required to be implemented when the RPC server is not linked into the binary. This + * function's signature matches that of vsnprintf, so trivial implementations can just call + * vsnprintf. + * + * \param out_buf A char buffer where the formatted string should be written. + * \param out_buf_size_bytes Number of bytes available for writing in out_buf. + * \param fmt The printf-style format string. + * \param args Extra arguments to be formatted. + * \return number of bytes written. + */ +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, + va_list args); + #ifdef __cplusplus } // extern "C" #endif diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index 5623b2515585..664dae7ab857 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -43,6 +43,11 @@ ssize_t UTvmWriteFunc(void* context, const uint8_t* data, size_t num_bytes) { return to_return; } +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, + va_list args) { + return vsnprintf(out_buf, out_buf_size_bytes, fmt, args); +} + void TVMPlatformAbort(tvm_crt_error_t error_code) { std::cerr << "TVMPlatformAbort: " << error_code << std::endl; throw "Aborted"; diff --git a/src/runtime/crt/utvm_rpc_server/rpc_server.cc b/src/runtime/crt/utvm_rpc_server/rpc_server.cc index 34eff6a3270d..6674d5993cc6 100644 --- a/src/runtime/crt/utvm_rpc_server/rpc_server.cc +++ b/src/runtime/crt/utvm_rpc_server/rpc_server.cc @@ -219,7 +219,7 @@ void TVMLogf(const char* format, ...) { va_list args; char log_buffer[256]; va_start(args, format); - size_t num_bytes_logged = vsnprintf(log_buffer, sizeof(log_buffer), format, args); + size_t num_bytes_logged = TVMPlatformFormatMessage(log_buffer, sizeof(log_buffer), format, args); va_end(args); // Most header-based logging frameworks tend to insert '\n' at the end of the log message.
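The host runtime above forwards TVMPlatformFormatMessage straight to vsnprintf, and the Zephyr port below forwards it to vsnprintk. To illustrate the other option named in the commit message ("allow the platform to fix it up as needed"), here is a minimal hypothetical sketch for a platform whose libc cannot format %zu; the 256-byte scratch buffer and the assumption that size_t and unsigned long share a width are illustrative choices, not part of this patch:

#include <stdarg.h>
#include <stdio.h>

// Hypothetical platform hook: rewrite "%zu" to "%lu" before delegating to a
// vsnprintf that lacks C99 length modifiers. Assumes
// sizeof(size_t) == sizeof(unsigned long) on this platform.
size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt,
                                va_list args) {
  char fixed_fmt[256];
  size_t j = 0;
  for (size_t i = 0; fmt[i] != '\0' && j + 3 < sizeof(fixed_fmt); ++i) {
    if (fmt[i] == '%' && fmt[i + 1] == 'z' && fmt[i + 2] == 'u') {
      fixed_fmt[j++] = '%';
      fixed_fmt[j++] = 'l';
      fixed_fmt[j++] = 'u';
      i += 2;  // skip past "zu"
    } else {
      fixed_fmt[j++] = fmt[i];
    }
  }
  fixed_fmt[j] = '\0';
  return vsnprintf(out_buf, out_buf_size_bytes, fixed_fmt, args);
}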
diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/tests/micro/qemu/zephyr-runtime/src/main.c index 19e72e1c076d..1fa32e384c0b 100644 --- a/tests/micro/qemu/zephyr-runtime/src/main.c +++ b/tests/micro/qemu/zephyr-runtime/src/main.c @@ -57,6 +57,11 @@ ssize_t write_serial(void* unused_context, const uint8_t* data, size_t size) { return size; } +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, + va_list args) { + return vsnprintk(out_buf, out_buf_size_bytes, fmt, args); +} + void TVMPlatformAbort(tvm_crt_error_t error) { sys_reboot(SYS_REBOOT_COLD); for (;;) From a9017dcf2f5c2b8f8229c4dec463d320d22bd39f Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Sat, 14 Nov 2020 10:18:59 -0800 Subject: [PATCH 173/258] [TF parser] Handle int64 dtype in range (#6918) --- python/tvm/relay/frontend/tensorflow.py | 10 +++++----- tests/python/frontend/tensorflow/test_forward.py | 9 +++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 4a7a7da307fc..ca24e1b35374 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1454,9 +1454,9 @@ def _impl(inputs, attr, params, mod): break if is_symbolic_shape: - ret = _op.shape_of(inputs[0], dtype="int32") + ret = _op.shape_of(inputs[0], dtype=attr["out_type"].name) else: - ret = np.array(input_shape, dtype="int32") + ret = np.array(input_shape, dtype=attr["out_type"].name) return ret return _impl @@ -1862,11 +1862,11 @@ def _impl(inputs, attr, params, mod): dtype = attr["Tidx"].name if "Tidx" in attr else str(start.dtype) if isinstance(start, (np.int32, np.int64, int, np.float32, np.float64, float)): - start = _expr.const(start) + start = _expr.const(start, dtype=dtype) if isinstance(limit, (np.int32, np.int64, int, np.float32, np.float64, float)): - limit = _expr.const(limit) + limit = _expr.const(limit, dtype=dtype) if isinstance(delta, (np.int32, np.int64, int, np.float32, np.float64, float)): - delta = _expr.const(delta) + delta = _expr.const(delta, dtype=dtype) return AttrCvt( op_name="arange", diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 93bfd0cbaf83..23a4b7abe5ab 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -2783,10 +2783,11 @@ def test_forward_unpack(): def test_forward_range(): """test operator Range""" - tf.reset_default_graph() - with tf.Graph().as_default(): - tf.range(1, 18, 3, name="range") - compare_tf_with_tvm([], [], "range:0") + for dtype in [tf.int32, tf.int64]: + tf.reset_default_graph() + with tf.Graph().as_default(): + tf.range(1, 18, 3, name="range", dtype=dtype) + compare_tf_with_tvm([], [], "range:0") """test type assignment for operator Range""" tf.reset_default_graph() From 7483aa2e90fbe0ac69bef6ed2d7ad46038ed922b Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Sat, 14 Nov 2020 10:19:23 -0800 Subject: [PATCH 174/258] [ShapeFunc] Handle weights in shape func (#6912) * [ShapeFunc] Handle weights in shape func * Comments --- src/relay/backend/compile_engine.cc | 22 +++++++++++++++++++++- tests/python/relay/test_vm.py | 25 +++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 767cb6f644de..c8327de94232 100644 --- a/src/relay/backend/compile_engine.cc +++ 
b/src/relay/backend/compile_engine.cc @@ -420,8 +420,28 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> Array VisitExpr_(const ConstantNode* op) final { using tir::make_const; ICHECK(data_dependants_.size()); - ICHECK(op->is_scalar()); bool data_dependant = data_dependants_.back(); + if (!op->is_scalar()) { + // This is a constant weight, extract the shape of the weight tensor. + // This can not be data dependent. + CHECK(!data_dependant); + auto ttype = op->checked_type().as(); + int ndim = static_cast(ttype->shape.size()); + Array out_shape{ndim}; + te::Tensor value = tvm::te::compute( + out_shape, + [&](const Array& indices) { + auto idx = indices[0]; + PrimExpr ret = make_const(DataType::Int(64), 0); + for (int i = 0; i < ndim; i++) { + ret = tvm::if_then_else(idx == i, ttype->shape[i], ret); + } + return ret; + }, + "shape_const", topi::kBroadcast); + scalars_.push_back(value); + return {value}; + } if (data_dependant) { void* data = op->data->data; DataType dtype = DataType(op->data->dtype); diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 92d6e8e55db4..6958010176e3 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -770,5 +770,30 @@ def test_vm_reshape_tuple(x_shape=(1, 4, 2), y_shape=(1, 2, 10)): tvm.testing.assert_allclose(res.asnumpy(), np.reshape(x_data, (1, -1))) +def test_constant_shape_with_external_codegen(): + mod = tvm.IRModule() + shape = (relay.Any(), 25) + dtype = "float32" + + # external function + x = relay.var("x", shape=shape, dtype=dtype) + weight = relay.const(np.random.rand(5, 25).astype("float32"), dtype="float32") + out = relay.nn.dense(x, weight) + f1 = relay.Function([x], out) + f1 = f1.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + f1 = f1.with_attr("Inline", tvm.tir.IntImm("int32", 1)) + f1 = f1.with_attr("Compiler", "a") + glb_f1 = relay.GlobalVar("f1") + mod[glb_f1] = f1 + mod = relay.transform.InferType()(mod) + + # Main function + x = relay.var("x", shape=shape, dtype=dtype) + mod["main"] = relay.Function([x], glb_f1(x)) + comp = relay.vm.VMCompiler() + opt_mod, _ = comp.optimize(mod, target="llvm") + assert "shape_func" in opt_mod.astext(False) + + if __name__ == "__main__": pytest.main([__file__]) From ca61eb8b6a118651b0f12f6f3bb994d0c36f694d Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 15 Nov 2020 05:19:49 -0800 Subject: [PATCH 175/258] [Doc] Minor improvements for auto-tuning tutorials (#6919) --- docs/index.rst | 2 +- python/tvm/auto_scheduler/compute_dag.py | 15 +++++++++++++++ python/tvm/auto_scheduler/relay_integration.py | 2 ++ .../auto_scheduler/tune_conv2d_layer_cuda.py | 5 ++++- tutorials/auto_scheduler/tune_matmul_x86.py | 8 ++++++-- tutorials/auto_scheduler/tune_network_cuda.py | 10 +++++++--- tutorials/autotvm/tune_relay_arm.py | 2 +- tutorials/autotvm/tune_relay_cuda.py | 2 +- tutorials/autotvm/tune_relay_mobile_gpu.py | 2 +- tutorials/autotvm/tune_relay_x86.py | 2 +- tutorials/autotvm/tune_simple_template.py | 4 ++-- 11 files changed, 41 insertions(+), 13 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 18b2da7fc387..f407fa2d4f29 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,7 +25,7 @@ Get Started ----------- - Follow the :doc:`instructions ` to install TVM. -- Checkout the :doc:`Tutorials `. +- Checkout the :doc:`tutorials `. 
For Developers -------------- diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py index 9390a9c4589a..93467e27d0e7 100755 --- a/python/tvm/auto_scheduler/compute_dag.py +++ b/python/tvm/auto_scheduler/compute_dag.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name """ The auto-scheduler's computational graph and related program analyses. """ @@ -188,6 +189,20 @@ def hash_key(self): str_key = str_key.encode(encoding="utf-8") return hashlib.md5(str_key).hexdigest() + def __str__(self): + # pretty print + MAX_LINE_WIDTH = 256 + + raw_lines = super().__str__().split("\n") + lines = [] + for line in raw_lines: + if len(line) > MAX_LINE_WIDTH: + line = ( + line[: MAX_LINE_WIDTH // 2] + " ..(OMITTED).. " + line[-MAX_LINE_WIDTH // 2 :] + ) + lines.append(line) + return "\n".join(lines) + def __getstate__(self): return {"compute": SaveJSON(self.compute), "sche": SaveJSON(self.sche)} diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index c8a4ed5ac9d2..0b0157c421b5 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -214,6 +214,8 @@ def auto_schedule_topi(outs): if env is None: # in the final build mode state = DispatchContext.current.query(tvm.target.Target.current(), key) if state is None: + if "gpu" in tvm.target.Target.current().keys: + raise RuntimeError("Cannot compile for GPU targets if no valid schedule is found.") return te.create_schedule([x.op for x in outs]) dag = ComputeDAG(io_tensors) diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index a8bb8dd08f59..a28e98b8792a 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -17,11 +17,13 @@ """ .. _auto-scheduler-conv-gpu: -Auto-scheduling a convolution layer for GPU +Auto-scheduling a Convolution Layer for GPU =========================================== **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_ +This is a tutorial on how to use the auto-scheduler for GPUs. + Different from the template-based :ref:`autotvm ` which relies on manual templates to define the search space, the auto-scheduler does not require any templates. Users only need to write the computation declaration without any schedule commands or templates. @@ -99,6 +101,7 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): num_measure_trials=10, # change this to 1000 to achieve the best performance runner=measure_ctx.runner, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, ) ###################################################################### diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 2bd47ded11c8..6d756299c5d8 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -15,11 +15,13 @@ # specific language governing permissions and limitations # under the License. """ -Auto-scheduling matrix multiplication for CPU +Auto-scheduling Matrix Multiplication for CPU ============================================= **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_ +This is a tutorial on how to use the auto-scheduler for CPUs. 
+ Different from the template-based :ref:`autotvm ` which relies on manual templates to define the search space, the auto-scheduler does not require any templates. Users only need to write the computation declaration without any schedule commands or templates. @@ -88,7 +90,9 @@ def matmul_add(N, L, M, dtype): log_file = "matmul.json" tune_option = auto_scheduler.TuningOptions( - num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile(log_file)] + num_measure_trials=10, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, ) ###################################################################### diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 9eb5d5cdff0c..4756ea390b5c 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. """ -Auto-tuning a Neural Network for NVIDIA GPU -=========================================== +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== **Author**: `Lianmin Zheng `_ Auto-tuning for specific devices and workloads is critical for getting the @@ -156,6 +156,10 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) +for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + ################################################################# # Begin Tuning # ------------ @@ -250,7 +254,7 @@ def run_tuning(): # There will also be some "dmlc::Error"s and CUDA errors, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these -# errors are isolated from the master process. +# errors are isolated from the main process. # ###################################################################### diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index c69c7d9eaf8a..1e1e98ae5ab9 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -17,7 +17,7 @@ """ .. _tune_relay_arm: -Auto-tuning a convolutional network for ARM CPU +Auto-tuning a Convolutional Network for ARM CPU =============================================== **Author**: `Lianmin Zheng `_, `Zhao Wu `_, `Eddie Yan `_ diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 91407133d695..33b62bbf8f19 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """ -Auto-tuning a convolutional network for NVIDIA GPU +Auto-tuning a Convolutional Network for NVIDIA GPU ================================================== **Author**: `Lianmin Zheng `_, `Eddie Yan `_ diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 3611696996b9..10e201fd9fb5 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
""" -Auto-tuning a convolutional network for Mobile GPU +Auto-tuning a Convolutional Network for Mobile GPU ================================================== **Author**: `Lianmin Zheng `_, `Eddie Yan `_ diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 5b3d0320f580..30e62efe0d9d 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -17,7 +17,7 @@ """ .. _tune_relay_x86: -Auto-tuning a convolutional network for x86 CPU +Auto-tuning a Convolutional Network for x86 CPU =============================================== **Author**: `Yao Wang `_, `Eddie Yan `_ diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index 4c5c7dae63f8..db199fc717fa 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. """ -Writing tunable template and Using auto-tuner -============================================= +Writing Tunable Templates and Using the Auto-tuner +================================================== **Author**: `Lianmin Zheng `_ This is an introduction tutorial to the auto-tuning module in TVM. From 32129b6be6dedb1da9b34925c4af3c4cbb33dfb6 Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Tue, 17 Nov 2020 00:14:10 +0800 Subject: [PATCH 176/258] [Relay] Add dynamic SparseToDense (#6892) * [Relay] Add dynamic SparseToDense * Fix comments --- include/tvm/topi/transform.h | 2 +- python/tvm/relay/frontend/tensorflow.py | 2 +- python/tvm/relay/op/dyn/_transform.py | 14 ++++ python/tvm/relay/op/transform.py | 2 + src/relay/op/dyn/tensor/transform.cc | 71 ++++++++++++++++++ src/relay/op/make_op.h | 2 + src/relay/op/tensor/transform.cc | 27 ++++--- src/relay/transforms/dynamic_to_static.cc | 10 +++ .../relay/dyn/test_dynamic_op_level3.py | 68 +++++++++++++++-- .../relay/test_pass_dynamic_to_static.py | 75 +++++++++++++++---- 10 files changed, 242 insertions(+), 31 deletions(-) diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index 9fe3fb10822b..c866dfb7f86b 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1511,7 +1511,7 @@ inline Tensor one_hot(const Tensor& indices, const PrimExpr on_value, const Prim * \param tag output tensor tag. * \return Tensor of output_shape. 
*/ -inline Tensor sparse_to_dense(const Tensor& sparse_indices, const Array& output_shape, +inline Tensor sparse_to_dense(const Tensor& sparse_indices, const Array& output_shape, const Tensor& sparse_values, const PrimExpr& default_value, const std::string name = "T_sparse_to_dense", const std::string tag = kInjective) { diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index ca24e1b35374..38431933f7cf 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1329,9 +1329,9 @@ def _impl(inputs, attr, params, mod): def _sparse_to_dense(): def _impl(inputs, attr, params, mod): sparse_indices = inputs[0] + output_shape = inputs[1] sparse_values = inputs[2] default_value = inputs[3] - output_shape = attr["_output_shapes"][0] return _op.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value) diff --git a/python/tvm/relay/op/dyn/_transform.py b/python/tvm/relay/op/dyn/_transform.py index 559d63acaefd..b61d4f9655f6 100644 --- a/python/tvm/relay/op/dyn/_transform.py +++ b/python/tvm/relay/op/dyn/_transform.py @@ -28,6 +28,7 @@ _reg.register_injective_schedule("dyn.one_hot") _reg.register_injective_schedule("dyn.full") _reg.register_injective_schedule("dyn.strided_slice") +_reg.register_injective_schedule("dyn.sparse_to_dense") @script @@ -198,3 +199,16 @@ def strided_slice_shape_func(attrs, inputs, _): """ slice_mode = convert(0 if attrs.slice_mode == "end" else 1) return [_strided_slice_shape_func_input_data(*inputs, slice_mode)] + + +@script +def _sparse_to_dense_shape_func(output_shape, ndim): + out = output_tensor((ndim,), "int64") + for i in const_range(ndim): + out[i] = int64(output_shape[i]) + return out + + +@_reg.register_shape_func("dyn.sparse_to_dense", True) +def sparse_to_dense_shape_func(attrs, inputs, out_ndims): + return [_sparse_to_dense_shape_func(inputs[3], out_ndims[0])] diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index b7df6001e59e..92ecd34bc359 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -1242,6 +1242,8 @@ def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0 if default_value == 0: default_value = const(0) + if isinstance(output_shape, Expr): + return _dyn_make.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value) return _make.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value) diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc index a609e701c49f..815f24b6bda9 100644 --- a/src/relay/op/dyn/tensor/transform.cc +++ b/src/relay/op/dyn/tensor/transform.cc @@ -534,6 +534,77 @@ Examples:: .set_attr("TOpPattern", kInjective) .set_attr("AnyCodegenStrategy", kVariableDimensions); +bool SparseToDenseRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + ICHECK_EQ(num_inputs, 4); + auto sparse_indices = types[0].as(); + auto sparse_values = types[1].as(); + auto default_value = types[2].as(); + auto output_shape = types[3].as(); + + if (sparse_indices == nullptr || sparse_values == nullptr || default_value == nullptr || + output_shape == nullptr) { + return false; + } + + CHECK(sparse_indices->dtype.is_int()) << "sparse_indices must be tensor of integers"; + + CHECK_LE(sparse_indices->shape.size(), 3) + << "sparse_indices must be a tensor of either 0D, 1D or 2D"; + + CHECK_LE(sparse_values->shape.size(), 2) << "sparse_values must be a tensor of 
either 0D or 1D";
+
+  CHECK_EQ(default_value->shape.size(), 0) << "default_value should be a scalar";
+
+  Array oshape;
+  for (int i = 0; i < output_shape->shape[0].as()->value; i++) {
+    oshape.push_back(Any());
+  }
+  reporter->Assign(types[4], TensorType(oshape, sparse_values->dtype));
+  return true;
+}
+
+Array SparseToDenseCompute(const Attrs& attrs, const Array& inputs,
+                           const Type& out_type) {
+  ICHECK_EQ(inputs.size(), 4);
+  const auto* out_ttype = out_type.as();
+  ICHECK(out_ttype);
+  return {topi::sparse_to_dense(inputs[0], out_ttype->shape, inputs[1], inputs[2]())};
+}
+
+TVM_REGISTER_GLOBAL("relay.op.dyn._make.sparse_to_dense")
+    .set_body_typed([](Expr indices, Expr output_shape, Expr values, Expr default_value) {
+      static const Op& op = Op::Get("dyn.sparse_to_dense");
+      return Call(op, {indices, values, default_value, output_shape});
+    });
+
+RELAY_REGISTER_OP("dyn.sparse_to_dense")
+    .describe(R"code(A dense tensor from a sparse representation.
+
+    - **sparse_indices**: A 0-D, 1-D, or 2-D tensor of integers containing location of sparse values
+
+    - **output_shape**: A list of integers. Shape of the dense output tensor.
+
+    - **sparse_values**: A 0-D or 1-D tensor containing the sparse values for the sparse indices.
+
+    - **default_value**: A 0-D tensor containing the default value for the remaining locations. Defaults to 0.
+
+    Example::
+      -  sparse_to_dense([[0, 0], [1, 2]], [3, 4], [1, 2], 0) = [[1, 0, 0, 0], [0, 0, 2, 0], [0, 0, 0, 0]]
+
+    )code" TVM_ADD_FILELINE)
+    .set_num_inputs(4)
+    .set_support_level(3)
+    .add_argument("sparse_indices", "Tensor", "Contains sparse indices.")
+    .add_argument("sparse_values", "Tensor", "Contains values for sparse indices.")
+    .add_argument("default_value", "Tensor", "Value to set for non-sparse indices. Defaults to 0.")
+    .add_argument("output_shape", "Tensor", "Shape of the dense output tensor")
+    .add_type_rel("DynSparseToDense", SparseToDenseRel)
+    .set_attr("TOpIsStateful", false)
+    .set_attr("TOpPattern", kOpaque)
+    .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout)
+    .set_attr("FTVMCompute", SparseToDenseCompute);
+
 }  // namespace dyn
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h
index 0e1f5c560081..34bff0f5b858 100644
--- a/src/relay/op/make_op.h
+++ b/src/relay/op/make_op.h
@@ -94,6 +94,8 @@ Expr MakeOneHot(Expr indices, Expr on_value, Expr off_value, int depth, int axis
 Expr MakeResize(Expr data, Array size, String layout, String method,
                 String coordinate_transformation_mode, DataType out_dtype);
 
+Expr MakeSparseToDense(Expr indices, Array output_shape, Expr values, Expr default_value);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_OP_MAKE_OP_H_
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 3ca816a6caae..c6b3260886c1 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -2889,9 +2889,9 @@ RELAY_REGISTER_OP("gather")
 E.g. for a 3D tensor, output is computed as:
 
-	out[i][j][k] = data[indices[i][j][k]][j][k]  # if axis == 0
-	out[i][j][k] = data[i][indices[i][j][k]][k]  # if axis == 1
-	out[i][j][k] = data[i][j][indices[i][j][k]]  # if axis == 2
+   out[i][j][k] = data[indices[i][j][k]][j][k]  # if axis == 0
+   out[i][j][k] = data[i][indices[i][j][k]][k]  # if axis == 1
+   out[i][j][k] = data[i][j][indices[i][j][k]]  # if axis == 2
 
 ``indices`` must have same shape as ``data``, except at dimension ``axis``
 which must just be not null. Output will have same shape as ``indices``.
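For the ``gather`` semantics restated in the hunk above, a worked 2-D example (axis == 1, so out[i][j] = data[i][indices[i][j]]); the values are arbitrary:

    import numpy as np
    import tvm
    from tvm import relay

    data = relay.const(np.array([[1, 2], [3, 4]], dtype="float32"))
    indices = relay.const(np.array([[1, 0], [0, 1]], dtype="int32"))
    func = relay.Function([], relay.gather(data, 1, indices))
    mod = tvm.IRModule.from_expr(func)
    # out[0][0] = data[0][1] = 2, out[0][1] = data[0][0] = 1, and so on.
    out = relay.create_executor("graph", mod=mod).evaluate()()
    print(out)  # [[2. 1.], [3. 4.]]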
@@ -3231,16 +3231,21 @@ Array SparseToDenseCompute(const Attrs& attrs, const Array(); ICHECK(param != nullptr); - return {topi::sparse_to_dense(inputs[0], param->output_shape, inputs[1], inputs[2]())}; + Array output_shape; + for (auto val : param->output_shape) { + output_shape.push_back(val); + } + return {topi::sparse_to_dense(inputs[0], output_shape, inputs[1], inputs[2]())}; } -TVM_REGISTER_GLOBAL("relay.op._make.sparse_to_dense") - .set_body_typed([](Expr indices, Array output_shape, Expr values, Expr default_value) { - auto attrs = make_object(); - attrs->output_shape = std::move(output_shape); - static const Op& op = Op::Get("sparse_to_dense"); - return Call(op, {indices, values, default_value}, Attrs(attrs)); - }); +Expr MakeSparseToDense(Expr indices, Array output_shape, Expr values, Expr default_value) { + auto attrs = make_object(); + attrs->output_shape = std::move(output_shape); + static const Op& op = Op::Get("sparse_to_dense"); + return Call(op, {indices, values, default_value}, Attrs(attrs)); +} + +TVM_REGISTER_GLOBAL("relay.op._make.sparse_to_dense").set_body_typed(MakeSparseToDense); RELAY_REGISTER_OP("sparse_to_dense") .describe(R"code(A dense tensor from a sparse representation. diff --git a/src/relay/transforms/dynamic_to_static.cc b/src/relay/transforms/dynamic_to_static.cc index d16d6328301a..f78d05bd9d2c 100644 --- a/src/relay/transforms/dynamic_to_static.cc +++ b/src/relay/transforms/dynamic_to_static.cc @@ -189,6 +189,16 @@ class DynamicToStaticMutator : public MixedModeMutator { } return Expr(nullptr); }}, + {Op::Get("dyn.sparse_to_dense"), + [](const CallNode* call_node) { + const ConstantNode* output_shape = call_node->args[3].as(); + if (output_shape) { + ICHECK_EQ(output_shape->data->ndim, 1); + return MakeSparseToDense(call_node->args[0], ToVector(output_shape->data), + call_node->args[1], call_node->args[2]); + } + return Expr(nullptr); + }}, }; } diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py index e6e866342639..dd73b9a96a52 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level3.py +++ b/tests/python/relay/dyn/test_dynamic_op_level3.py @@ -140,9 +140,67 @@ def verify_full(fill_value, src_shape, dtype): verify_full(4.0, (2, 50), "float32") +@tvm.testing.uses_gpu +def test_dyn_sparse_to_dense(): + def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected): + sparse_indices_data = np.array(sparse_indices) + sparse_values_data = np.array(sparse_values) + default_value_data = np.array(default_value) + output_shape_data = np.array(output_shape) + + a = relay.var( + "a", relay.TensorType(sparse_indices_data.shape, str(sparse_indices_data.dtype)) + ) + b = relay.var( + "b", relay.TensorType(sparse_values_data.shape, str(sparse_values_data.dtype)) + ) + output_shape_var = relay.var( + "output_shape", relay.TensorType(output_shape_data.shape, str(output_shape_data.dtype)) + ) + if default_value is None: + args = [a, b, output_shape_var] + d = relay.sparse_to_dense(a, output_shape_var, b) + else: + c = relay.var( + "c", relay.TensorType(default_value_data.shape, str(default_value_data.dtype)) + ) + args = [a, b, c, output_shape_var] + d = relay.sparse_to_dense(a, output_shape_var, b, c) + + zz = run_infer_type(d) + assert len(zz.checked_type.shape) == len(output_shape) + + func = relay.Function(args, d) + + if default_value is None: + arguments = [sparse_indices_data, sparse_values_data, output_shape_data] + else: + arguments = [ + sparse_indices_data, + 
sparse_values_data, + default_value_data, + output_shape_data, + ] + + verify_func(func, arguments, xpected) + + verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0]) # scalar + verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3]) # vector + verify_sparse_to_dense( + [[0, 0], [1, 2]], [1, 2], 0, [3, 4], [[1, 0, 0, 0], [0, 0, 2, 0], [0, 0, 0, 0]] + ) # nXd + verify_sparse_to_dense( + [[0, 0, 0], [1, 2, 3]], + [1, 2], + 4, + [2, 3, 4], + [[[1, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 4]], [[4, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 2]]], + ) # nXd + verify_sparse_to_dense( + [0, 1, 4], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1] + ) # floats + verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0]) # default value not specified + + if __name__ == "__main__": - test_dyn_reshape() - test_dyn_shape_reshape() - test_dyn_tile() - test_dyn_zeros_ones() - test_dyn_full() + pytest.main([__file__]) diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py index ba3d2795047f..141023d77019 100644 --- a/tests/python/relay/test_pass_dynamic_to_static.py +++ b/tests/python/relay/test_pass_dynamic_to_static.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import numpy as np +import pytest import tvm from tvm import te from tvm import relay @@ -457,17 +458,65 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 1], (2, 2, 3), slice_mode="size", test_ref=True) +@tvm.testing.uses_gpu +def test_dyn_to_static_sparse_to_dense(): + def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected): + sparse_indices_data = np.array(sparse_indices) + sparse_values_data = np.array(sparse_values) + default_value_data = np.array(default_value) + output_shape_data = np.array(output_shape) + + a = relay.var( + "a", relay.TensorType(sparse_indices_data.shape, str(sparse_indices_data.dtype)) + ) + b = relay.var( + "b", relay.TensorType(sparse_values_data.shape, str(sparse_values_data.dtype)) + ) + output_shape_const = relay.const(output_shape_data) + + if default_value is None: + args = [a, b] + d = relay.sparse_to_dense(a, output_shape_const, b) + else: + c = relay.var( + "c", relay.TensorType(default_value_data.shape, str(default_value_data.dtype)) + ) + args = [a, b, c] + d = relay.sparse_to_dense(a, output_shape_const, b, c) + + zz = run_infer_type(d) + assert len(zz.checked_type.shape) == len(output_shape) + + func = relay.Function(args, d) + + func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + assert isinstance(func2.body, relay.Call) + assert func2.body.op == relay.op.get("sparse_to_dense") + + if default_value is None: + arguments = [sparse_indices_data, sparse_values_data] + else: + arguments = [sparse_indices_data, sparse_values_data, default_value_data] + + verify_func(func2, arguments, xpected) + + verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0]) # scalar + verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3]) # vector + verify_sparse_to_dense( + [[0, 0], [1, 2]], [1, 2], 0, [3, 4], [[1, 0, 0, 0], [0, 0, 2, 0], [0, 0, 0, 0]] + ) # nXd + verify_sparse_to_dense( + [[0, 0, 0], [1, 2, 3]], + [1, 2], + 4, + [2, 3, 4], + [[[1, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 4]], [[4, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 2]]], + ) # nXd + verify_sparse_to_dense( + [0, 1, 4], [3.1, 3.1, 3.1], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1] + ) # floats + 
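Outside the test harness, the same round trip can be sketched directly: an Expr output shape selects dyn.sparse_to_dense, and DynamicToStatic folds it back to the static op once the shape is a constant. A minimal sketch; shapes and values are arbitrary:

    import numpy as np
    import tvm
    from tvm import relay

    indices = relay.var("indices", relay.TensorType((3,), "int32"))
    values = relay.var("values", relay.TensorType((3,), "int32"))
    # An Expr output shape dispatches to dyn.sparse_to_dense in the wrapper above.
    shape = relay.const(np.array([5], dtype="int64"))
    out = relay.sparse_to_dense(indices, shape, values, relay.const(0))
    mod = tvm.IRModule.from_expr(relay.Function([indices, values], out))

    # With a constant shape, DynamicToStatic rewrites the call to the static op.
    mod = relay.transform.InferType()(mod)
    mod = relay.transform.DynamicToStatic()(mod)
    print(mod)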
verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0])  # default value not specified
+
+
 if __name__ == "__main__":
-    test_dynamic_to_static_reshape()
-    test_dynamic_to_static_double_reshape()
-    test_dynamic_to_static_quad_reshape()
-    test_dynamic_to_static_tile()
-    test_dynamic_to_static_topk()
-    test_dynamic_to_static_broadcast_to()
-    test_dynamic_to_static_zeros_ones()
-    test_dynamic_to_static_resize()
-    test_dynamic_to_static_one_hot()
-    test_dynamic_to_static_full()
-    test_dynamic_to_static_upsampling()
-    test_dynamic_to_static_pad()
-    test_dynamic_to_static_strided_slice()
+    pytest.main([__file__])

From a6618960758ca36d76ba57f2ed4bd55e5fa568be Mon Sep 17 00:00:00 2001
From: Tianqi Chen 
Date: Mon, 16 Nov 2020 18:00:00 -0500
Subject: [PATCH 177/258] [CI] Update actions miniconda (#6926)

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index be50a81e527d..dd85ef2a5d17 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -53,7 +53,7 @@ jobs:
         with:
           path: ~/conda_pkgs_dir
           key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('conda/build-environment.yaml') }}
-      - uses: conda-incubator/setup-miniconda@v1
+      - uses: conda-incubator/setup-miniconda@v2
         with:
          activate-environment: tvm-build
          channel-priority: strict

From 9167d723c69715733938fcf9046ce3642a779b7d Mon Sep 17 00:00:00 2001
From: Cody Yu 
Date: Mon, 16 Nov 2020 20:10:20 -0800
Subject: [PATCH 178/258] [AutoSchedule] Extract tasks via compile engine
 (#6903)

* make use TOPI schedule optional
* extract auto_schedule task
* format
* add extract mode
* silent autotvm
* fallback to TOPI
* use PassContext
* lint
* suppress fallback warnings
* nit
* fix test
* address comments
* address comments
* doc
* address comments
* lint
* skip unsupported tasks
* retrigger CI
---
 python/tvm/auto_scheduler/__init__.py         |   2 -
 python/tvm/auto_scheduler/dispatcher.py       |  30 +++--
 python/tvm/auto_scheduler/env.py              |  56 ---------
 .../tvm/auto_scheduler/relay_integration.py   |  59 +++++----
 .../tvm/auto_scheduler/workload_registry.py   |  14 ++-
 python/tvm/relay/backend/compile_engine.py    |  24 ++--
 python/tvm/relay/build_module.py              |  15 ++-
 python/tvm/relay/op/op.py                     |  25 ----
 python/tvm/relay/op/strategy/cuda.py          |  58 +++------
 python/tvm/topi/cuda/conv2d_alter_op.py       |   6 +-
 src/relay/backend/compile_engine.cc           |  34 ++++-
 .../test_auto_scheduler_task_extraction.py    | 116 ++++++++++++++++--
 .../relay/test_auto_scheduler_tuning.py       |   8 +-
 .../ci_logs/resnet-18-NHWC-B1.json            |  45 +++----
 tutorials/auto_scheduler/tune_network_cuda.py |  54 ++++----
 15 files changed, 298 insertions(+), 248 deletions(-)
 delete mode 100644 python/tvm/auto_scheduler/env.py

diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py
index 46d606c628d9..f0d076e75f02 100644
--- a/python/tvm/auto_scheduler/__init__.py
+++ b/python/tvm/auto_scheduler/__init__.py
@@ -19,7 +19,6 @@
 from . import compute_dag
 from . import dispatcher
-from . import env
 from . import feature
 from . import loop_state
 from . 
import measure @@ -36,7 +35,6 @@ from .compute_dag import ComputeDAG from .cost_model import RandomModel, XGBModel from .dispatcher import DispatchContext, ApplyHistoryBest -from .env import enable_relay_integration, is_relay_integration_enabled from .measure import ( MeasureInput, MeasureResult, diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py index 8822f3963f7b..19bae8622355 100644 --- a/python/tvm/auto_scheduler/dispatcher.py +++ b/python/tvm/auto_scheduler/dispatcher.py @@ -44,7 +44,7 @@ class DispatchContext(object): def __init__(self): self._old_ctx = DispatchContext.current - def query(self, target, workload_key): + def query(self, target, workload_key, has_complex_op, dag): """ Query the context to get the specific config for a workload. If cannot find the result inside this context, this function will query it @@ -56,6 +56,10 @@ def query(self, target, workload_key): The current target workload_key : str The workload key + has_complex_op: bool + Whether this workload has at least one complex op. + dag: ComputeDAG + The ComputeDAG of the workload. Returns ------- @@ -64,7 +68,7 @@ def query(self, target, workload_key): """ ret = self._query_inside(target, workload_key) if ret is None: - ret = self._old_ctx.query(target, workload_key) + ret = self._old_ctx.query(target, workload_key, has_complex_op, dag) return ret def update(self, target, workload_key, state): @@ -220,11 +224,11 @@ def _query_inside(self, target, workload_key): def update(self, target, workload_key, state): model = target.model - key = (model, workload) + key = (model, workload_key) self._best_user_defined[key] = state for k in target.keys: - key = (k, workload) + key = (k, workload_key) self._best_user_defined[key] = state @@ -237,21 +241,27 @@ class FallbackContext(DispatchContext): def __init__(self): super(FallbackContext, self).__init__() self.memory = {} - self.silent = False + + # Verbose level: + # 0: Completely silent. + # 1: Warning the missing configs for querying complex tasks. + # 2: Warning the missing configs for querying all tasks. + self.verbose = 1 # a set to prevent print duplicated message self.messages = set() - def query(self, target, workload_key): + def query(self, target, workload_key, has_complex_op, dag): key = (str(target), workload_key) if key in self.memory: return self.memory[key] - if not self.silent: + if self.verbose == 2 or (has_complex_op and self.verbose == 1): msg = ( - "Cannot find tuned schedules for target=%s, workload_key=%s. " - "A fallback schedule is used, " - "which may bring great performance regression." % (target, workload_key) + "Cannot find tuned schedules for target=%s, workload_key=%s, compute:\n%s" + "A fallback TOPI schedule is used, " + "which may bring great performance regression or even compilation failure." + % (target, workload_key, dag) ) if msg not in self.messages: self.messages.add(msg) diff --git a/python/tvm/auto_scheduler/env.py b/python/tvm/auto_scheduler/env.py deleted file mode 100644 index 95c7ccf971a2..000000000000 --- a/python/tvm/auto_scheduler/env.py +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""The scope to store global environmental variables of the auto-scheduler""" - - -class AutoSchedulerGlobalScope(object): - """The global scope to store environmental variables of the auot-scheduler""" - - def __init__(self): - self.enable_relay_integration = False - - -GLOBAL_SCOPE = AutoSchedulerGlobalScope() - - -def is_relay_integration_enabled(): - """Return whether the relay integration is enabled - - Returns - ------- - enabled: bool - Whether the relay integration is enabled - """ - return GLOBAL_SCOPE.enable_relay_integration - - -def enable_relay_integration(new_value=True): - """Set the relay integration - - Parameters - --------- - new_value: bool = True - The new setting of relay integration - - Returns - ------- - old_value: bool - The old setting. - """ - old_value = GLOBAL_SCOPE.enable_relay_integration - GLOBAL_SCOPE.enable_relay_integration = new_value - return old_value diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 0b0157c421b5..283d8bf7db45 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -25,7 +25,7 @@ import threading import tvm -from tvm import te, transform +from tvm import autotvm, te, transform from tvm.te.tensor import ComputeOp, PlaceholderOp from .compute_dag import ComputeDAG from .dispatcher import DispatchContext @@ -34,18 +34,26 @@ def call_all_topi_funcs(mod, params, target): - """Call all TOPI compute + schedule to extract tasks in a relay program""" + """Call all TOPI compute to extract auto_scheduler tasks in a Relay program""" # pylint: disable=import-outside-toplevel from tvm import relay from tvm.relay.backend import graph_runtime_codegen - with transform.PassContext(opt_level=3): + # Turn off AutoTVM config not found warnings + old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent + autotvm.GLOBAL_SCOPE.silent = True + + with transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): opt_mod, _ = relay.optimize(mod, target, params) grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) grc.codegen(opt_mod["main"]) + autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent + -def extract_tasks(mod, params, target, target_host=None, hardware_params=None): +def extract_tasks( + mod, params, target, target_host=None, hardware_params=None, include_simple_tasks=False +): """Extract tuning tasks from a relay program. Parameters @@ -60,6 +68,8 @@ def extract_tasks(mod, params, target, target_host=None, hardware_params=None): The host compilation target hardware_params : Optional[HardwareParams] Hardware parameters used for the search tasks + include_simple_tasks: bool + Whether to extract simple tasks that do not include complicated ops. Returns ------- @@ -77,7 +87,9 @@ def extract_tasks(mod, params, target, target_host=None, hardware_params=None): target_host = tvm.target.Target(target_host) # Run the compiler to collect all TOPI calls during compilation. 
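With env.py gone, the process-wide enable_relay_integration() switch deleted above has no global replacement; the integration is instead scoped to a single build through a PassContext option. A minimal sketch of the new convention (a small MLP from relay.testing stands in for a real model):

    import tvm
    import tvm.relay.testing
    from tvm import relay

    mod, params = relay.testing.mlp.get_workload(batch_size=1)

    # Replaces the removed auto_scheduler.enable_relay_integration() global.
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.backend.use_auto_scheduler": True}
    ):
        lib = relay.build(mod, target="llvm", params=params)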
-    env = TracingEnvironment(TracingMode.EXTRACT_TASK)
+    env = TracingEnvironment(
+        TracingMode.EXTRACT_TASK if include_simple_tasks else TracingMode.EXTRACT_COMPLEX_TASK_ONLY
+    )
     with env:
         # Wrap build call in a new thread to avoid the conflict
         # between python's multiprocessing and tvm's thread pool
@@ -109,7 +121,8 @@ class TracingMode:
     """Two modes for tracing"""
 
     EXTRACT_TASK = 0  # trace all topi calls to extract tasks
-    PREPARE_LAYOUT_REWRITE = 1  # trace topi calls to prepare layout rewrite
+    EXTRACT_COMPLEX_TASK_ONLY = 1  # same as EXTRACT_TASK but ignore tasks without complex ops
+    PREPARE_LAYOUT_REWRITE = 2  # trace topi calls to prepare layout rewrite
 
 
 class TracingEnvironment:
@@ -181,11 +194,8 @@ def traverse(t):
     return inputs + list(outs), has_layout_free
 
 
-# The suffix of implementations that use the auto-scheduler in the OpStrategy.
-auto_schedule_impl_suffix = ".auto_scheduler"
-
-
-def auto_schedule_topi(outs):
+@tvm._ffi.register_func("auto_scheduler.relay_integration.auto_schedule_topi_compute")
+def auto_schedule_topi(outs, has_complex_op):
     """Use auto-scheduler to schedule any topi compute function.
 
     Note: This is used internally for relay integration. Do
@@ -195,35 +205,40 @@ def auto_schedule_topi(outs):
     ----------
     outs: List[Tensor]
         The output tensors of topi compute functions
+    has_complex_op: bool
+        Whether the topi compute function includes at least one complex op.
 
     Returns
     -------
-    sch: te.Schedule
-        A topi schedule function
+    sch: Optional[te.Schedule]
+        A tuned schedule or none (if not tuned) in the final build mode;
+        An initial schedule in the tracing mode.
     """
     # pylint: disable=import-outside-toplevel
     from tvm import relay
 
     io_tensors, has_layout_free = traverse_to_get_io_tensors(outs)
     key = register_workload_tensors(io_tensors)
+    if key is None:  # skip this compute if failed to register the workload
+        return None
 
     # only enable layout rewrite for cpu backend
     enable_layout_rewrite = "cpu" in tvm.target.Target.current().keys
 
     env = TracingEnvironment.current
     if env is None:  # in the final build mode
-        state = DispatchContext.current.query(tvm.target.Target.current(), key)
+        dag = ComputeDAG(io_tensors)
+        state = DispatchContext.current.query(tvm.target.Target.current(), key, has_complex_op, dag)
         if state is None:
-            if "gpu" in tvm.target.Target.current().keys:
-                raise RuntimeError("Cannot compile for GPU targets if no valid schedule is found.")
-            return te.create_schedule([x.op for x in outs])
+            return None
 
-        dag = ComputeDAG(io_tensors)
         schedule, _ = dag.apply_steps_from_state(state)
-    elif env.tracing_mode == TracingMode.EXTRACT_TASK:  # in the task extraction mode
-        engine = relay.backend.compile_engine.get()
-        ccache_key = engine.get_current_ccache_key()
-        env.add_workload_key(key, ccache_key)
+    elif env.tracing_mode in [TracingMode.EXTRACT_TASK, TracingMode.EXTRACT_COMPLEX_TASK_ONLY]:
+        # in the task extraction mode
+        if has_complex_op or env.tracing_mode == TracingMode.EXTRACT_TASK:
+            engine = relay.backend.compile_engine.get()
+            ccache_key = engine.get_current_ccache_key()
+            env.add_workload_key(key, ccache_key)
         schedule = te.create_schedule([x.op for x in outs])
     elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE:
         # todo(merrymercy, minminsun): port layout rewrite
diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py
index 8a42c5f9b83a..6a4809b1796c 100644
--- a/python/tvm/auto_scheduler/workload_registry.py
+++ b/python/tvm/auto_scheduler/workload_registry.py
@@ -14,6 +14,7 @@
 # KIND, either 
express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name """ Workload registration and serialization. @@ -29,12 +30,14 @@ When we need the dag, we decode the string and call the function, which will return the dag. """ +import logging import pickle import json import tvm._ffi from .utils import serialize_args, deserialize_args, get_func_name +logger = logging.getLogger("auto_scheduler") # Global workload function and hash key registry # It stores two types of workload: @@ -105,13 +108,18 @@ def register_workload_tensors(tensors): Returns ------- - key: str - The workload key + key: Optional[str] + The workload key, or None if failed to create a compute DAG. """ # pylint: disable=import-outside-toplevel from .compute_dag import ComputeDAG - key = ComputeDAG(tensors).hash_key() + try: + key = ComputeDAG(tensors).hash_key() + except tvm.error.TVMError as err: + logger.info("Failed to create a ComputeDAG for auto_scheduler: %s", str(err)) + return None + WORKLOAD_FUNC_REGISTRY[key] = tensors return json.dumps((key,)) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index d874732d6fa0..28f2ac6d489b 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -21,7 +21,7 @@ import logging import numpy as np import tvm -from tvm import te, autotvm, auto_scheduler +from tvm import te, autotvm from tvm.runtime import Object from tvm.support import libinfo from tvm.target import Target @@ -196,25 +196,13 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) outs = best_plevel_impl.compute(attrs, inputs, out_type) return best_plevel_impl, outs - # If auto-scheduler is enabled for Relay, always prefer auto-scheduler - if auto_scheduler.is_relay_integration_enabled(): - auto_scheduler_impls = [] - for impl in all_impls: - if impl.name.endswith(auto_scheduler.relay_integration.auto_schedule_impl_suffix): - auto_scheduler_impls.append(impl) - - if auto_scheduler_impls: - assert len(auto_scheduler_impls) == 1 - impl = auto_scheduler_impls[0] - outs = impl.compute(attrs, inputs, out_type) - return impl, outs - # Otherwise, try autotvm templates outputs = {} workloads = {} best_autotvm_impl = None best_cfg = None dispatch_ctx = autotvm.task.DispatchContext.current + old_silent = autotvm.GLOBAL_SCOPE.silent autotvm.GLOBAL_SCOPE.silent = True for impl in all_impls: outs = impl.compute(attrs, inputs, out_type) @@ -232,7 +220,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) if best_cfg is None or best_cfg.cost > cfg.cost: best_autotvm_impl = impl best_cfg = cfg - autotvm.GLOBAL_SCOPE.silent = False + autotvm.GLOBAL_SCOPE.silent = old_silent if best_autotvm_impl: # The best autotvm implementation definitely doesn't use fallback config @@ -251,7 +239,10 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) "is used, which may bring great performance regression." 
% (target, workloads[best_plevel_impl]) ) - if msg not in autotvm.task.DispatchContext.warning_messages: + if ( + not autotvm.env.GLOBAL_SCOPE.silent + and msg not in autotvm.task.DispatchContext.warning_messages + ): autotvm.task.DispatchContext.warning_messages.add(msg) autotvm_logger.warning(msg) logger.info( @@ -300,7 +291,6 @@ def lower_call(call, inputs, target): best_impl, outputs = select_implementation(op, call.attrs, inputs, ret_type, target) else: # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes. - # Currently, we just use the implementation with highest plevel best_impl, outputs = select_implementation( op, call.attrs, inputs, ret_type, target, use_autotvm=False ) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 35bd8e6d3d4d..cba97c43b25a 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -23,6 +23,7 @@ from tvm.ir import IRModule +from tvm.ir.transform import PassContext from tvm.tir import expr as tvm_expr from .. import nd as _nd, autotvm from ..target import Target @@ -123,8 +124,20 @@ def build(self, mod, target=None, target_host=None, params=None): # Setup the params. if params: self._set_params(params) - # Build the IR module + + # Build the IR module. If auto_scheduler is not enabled, + # then use the TOPI-defined schedule. + use_auto_scheduler = PassContext.current().config.get( + "relay.backend.use_auto_scheduler", False + ) + + # Turn off AutoTVM config not found warnings if auto_scheduler is enabled. + old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent + autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler + self._build(mod, target, target_host) + autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent + # Get artifacts graph_json = self.get_json() mod = self.get_module() diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index fa420c4e71a3..d4d20b3ebc4a 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -18,7 +18,6 @@ """The base node types for the Relay language.""" import tvm._ffi import tvm.ir -from tvm.auto_scheduler.relay_integration import auto_schedule_topi, auto_schedule_impl_suffix from tvm.driver import lower, build from tvm.target import get_native_generic_func, GenericFunc from tvm.runtime import Object @@ -144,30 +143,6 @@ def add_implementation(self, compute, schedule, name="default", plevel=10): """ _OpStrategyAddImplementation(self, compute, schedule, name, plevel) - def add_auto_scheduler(self, compute, name, plevel=10): - """Add an implementation using the auto-scheduler. - - Parameters - ---------- - compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type) - -> List[Tensor] - The compute function. - - name : str - The name of implementation. - - plevel : int - The priority level of implementation. 
- """ - - def wrap_schedule(attrs, outs, target): - with target: - return auto_schedule_topi(outs) - - self.add_implementation( - compute, wrap_schedule, name=name + auto_schedule_impl_suffix, plevel=plevel - ) - def _wrap_default_fstrategy(compute, schedule, name): def _fstrategy(attrs, inputs, out_type, target): diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index f4ce61b8fa39..105f50116c3e 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -18,6 +18,7 @@ # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import from tvm import topi import tvm +from tvm.ir.transform import PassContext from tvm.te import SpecializedCondition from tvm.contrib import nvcc from tvm._ffi import get_global_func @@ -142,10 +143,6 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): name="conv2d_nchw_winograd.cuda", plevel=5, ) - - strategy.add_auto_scheduler( - wrap_compute_conv2d(topi.nn.conv2d_nchw), name="conv2d_nchw" - ) elif layout == "HWCN": assert kernel_layout == "HWIO" strategy.add_implementation( @@ -221,13 +218,15 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): ) # register auto-scheduler implementations - if judge_winograd_auto_scheduler: - strategy.add_auto_scheduler( - wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), name="conv2d_nhwc.winograd" - ) - else: - strategy.add_auto_scheduler( - wrap_compute_conv2d(topi.nn.conv2d_nhwc), name="conv2d_nhwc" + use_auto_scheduler = PassContext.current().config.get( + "relay.backend.use_auto_scheduler", False + ) + if use_auto_scheduler and judge_winograd_auto_scheduler: + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), + wrap_topi_schedule(tvm.te.create_schedule), + name="conv2d_nhwc.winograd", + plevel=15, ) elif layout == "HWNC": @@ -286,11 +285,6 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.cuda", ) - - strategy.add_auto_scheduler( - wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), - name="depthwise_conv2d_nchw.cuda", - ) elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implementation( @@ -298,11 +292,6 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.cuda", ) - - strategy.add_auto_scheduler( - wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - name="depthwise_conv2d_nhwc.cuda", - ) else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d @@ -459,11 +448,13 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda", ) - # register auto-scheduler implementations - strategy.add_auto_scheduler( - wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), - name="conv2d_nhwc_winograd_without_weight_transform", - ) + if PassContext.current().config.get("relay.backend.use_auto_scheduler", False): + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), + wrap_topi_schedule(tvm.te.create_schedule), + name="conv2d_nhwc_winograd_without_weight_transform", + plevel=15, + ) else: raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) @@ -553,11 +544,6 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): 
name="conv3d_ncdhw_winograd.cuda", plevel=5, ) - - strategy.add_auto_scheduler( - wrap_compute_conv3d(topi.nn.conv3d_ncdhw), - name="conv3d_ncdhw.cuda", - ) else: # layout == "NDHWC": strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ndhwc), @@ -581,11 +567,6 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): plevel=20, ) - strategy.add_auto_scheduler( - wrap_compute_conv3d(topi.nn.conv3d_ndhwc), - name="conv3d_ndhwc.cuda", - ) - if target.kind.name == "cuda" and "cudnn" in target.libs: strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True), @@ -681,11 +662,6 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): name="dense_small_batch.cuda", ) - strategy.add_auto_scheduler( - wrap_compute_dense(topi.nn.dense), - name="dense", - ) - with SpecializedCondition(b >= 32): strategy.add_implementation( wrap_compute_dense(topi.cuda.dense_large_batch), diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py index 3a58d40cb847..ad6635de0116 100644 --- a/python/tvm/topi/cuda/conv2d_alter_op.py +++ b/python/tvm/topi/cuda/conv2d_alter_op.py @@ -19,7 +19,7 @@ import logging import tvm -from tvm import te, relay, autotvm, auto_scheduler +from tvm import te, relay, autotvm from .. import nn from ..utils import get_const_tuple @@ -52,9 +52,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): # The best implementation is not an AutoTVM template. # It may be from the auto-scheduler - if impl.name == ( - "conv2d_nhwc.winograd" + auto_scheduler.relay_integration.auto_schedule_impl_suffix - ): + if impl.name.find("winograd") != -1: if dilation != (1, 1): logger.warning("Does not support weight pre-transform for dilated convolution.") return None diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index c8327de94232..1559d7edf35f 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -99,7 +99,12 @@ Array GetShape(const Array& shape) { class ScheduleGetter : public backend::MemoizedExprTranslator> { public: explicit ScheduleGetter(Target target) - : target_(target), device_copy_op_(Op::Get("device_copy")) {} + : target_(target), device_copy_op_(Op::Get("device_copy")) { + // Whether to use auto_scheduler schedule. + use_auto_scheduler_ = transform::PassContext::Current() + ->GetConfig("relay.backend.use_auto_scheduler", Bool(false)) + .value(); + } CachedFunc Create(const Function& prim_func) { auto cache_node = make_object(); @@ -145,11 +150,27 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> tensor_outs.push_back(tensor); } } + te::Schedule schedule; // No need to register schedule for device copy op. if (anchor_attrs_.as() == nullptr) { - ICHECK(anchor_implementation_.defined()); - schedule = anchor_implementation_.Schedule(anchor_attrs_, tensor_outs, target_); + if (use_auto_scheduler_) { + const auto* fauto_schedule = + runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute"); + ICHECK(fauto_schedule != nullptr) + << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered"; + bool has_complex_op = anchor_op_pattern_ >= kCommReduce; + ObjectRef obj = (*fauto_schedule)(tensor_outs, has_complex_op); + if (obj.defined()) { + schedule = Downcast(obj); + } + } + + // Use TOPI schdule if user specificed, or the function has no auto_scheduler schedule. 
+ if (!schedule.defined()) { + ICHECK(anchor_implementation_.defined()); + schedule = anchor_implementation_.Schedule(anchor_attrs_, tensor_outs, target_); + } for (const auto& scalar : scalars_) { if (schedule->Contain(scalar)) { schedule[scalar].compute_inline(); @@ -228,9 +249,9 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> } int op_pattern = fpattern[op]; - if (op_pattern >= kCommReduce) { + if (!use_auto_scheduler_ && op_pattern >= kCommReduce) { ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) - << "Two complicated op in a primitive function " + << "Cannot apply TOPI schedule to a primitive function with two complicated ops" << " anchor=" << anchor_op_ << " current=" << op; } if (op_pattern >= anchor_op_pattern_) { @@ -295,6 +316,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> OpImplementation anchor_implementation_; std::ostringstream readable_name_stream_; Array scalars_; + bool use_auto_scheduler_; // Cache device copy op for equivalence checking to reduce registry lookup // overhead for each invocation of call node when retrieving schedules. const Op& device_copy_op_; @@ -812,6 +834,8 @@ CompileEngine& CompileEngine::Global() { return *inst; } +TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool); + TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") .set_body_typed([](tvm::Array outputs, OpImplementation impl) { return LoweredOutput(outputs, impl); diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py b/tests/python/relay/test_auto_scheduler_task_extraction.py index 4ca2ddb3cf10..1899f9521013 100644 --- a/tests/python/relay/test_auto_scheduler_task_extraction.py +++ b/tests/python/relay/test_auto_scheduler_task_extraction.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
"""Test task extraction for auto-scheduler""" +import pytest + import tvm.relay.testing import tvm.testing from tvm import auto_scheduler, relay @@ -45,7 +47,6 @@ def get_network(name, batch_size=1, layout="NHWC"): ) elif name == "winograd-test": input_shape = [1, 7, 7, 64] - output_shape = input_shape data = relay.var("data", shape=input_shape, dtype="float32") net = relay.testing.layers.conv2d( @@ -96,7 +97,6 @@ def get_network(name, batch_size=1, layout="NHWC"): @tvm.testing.requires_cuda def test_task_extraction_cuda(): - auto_scheduler.enable_relay_integration() target = tvm.target.Target("cuda") mod, params = get_network("mlp") @@ -108,24 +108,122 @@ def test_task_extraction_cuda(): mod, params = get_network("resnet-18", layout=layout) tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - assert len(tasks) == 21 - assert sum(task_weights) == 22 + assert len(tasks) == 24 + assert sum(task_weights) == 25 mod, params = get_network("mobilenet", layout=layout) tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - assert len(tasks) == 20 - assert sum(task_weights) == 28 + assert len(tasks) == 22 + assert sum(task_weights) == 30 for layout in ["NCDHW", "NDHWC"]: mod, params = get_network("resnet3d-18", layout=layout) tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - assert len(tasks) == 21 - assert sum(task_weights) == 22 + assert len(tasks) == 23 + assert sum(task_weights) == 24, sum(task_weights) + + +def test_task_extraction(): + ishape = (1, 3, 224, 224) + w1shape = (32, 3, 3, 3) + w2shape = (32, 32, 3, 3) + dtype = "float32" + target = tvm.target.Target("llvm") + + def get_func(): + data = relay.var("data", shape=(ishape), dtype=dtype) + weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype) + weight2 = relay.var("weight2", shape=(w2shape), dtype=dtype) + + conv2d = relay.nn.conv2d(data, weight1, kernel_size=(3, 3), padding=(1, 1)) + relu = relay.nn.relu(conv2d) + conv2d = relay.nn.conv2d(relu, weight2, kernel_size=(3, 3), padding=(1, 1)) + out = relay.nn.relu(conv2d) + return relay.Function([data, weight1, weight2], out) + + def get_fused_func(): + data = relay.var("data", shape=(ishape), dtype=dtype) + weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype) + weight2 = relay.var("weight2", shape=(w2shape), dtype=dtype) + + fused_func = get_func() + + # Set to primitive to keep fuse_ops untouch. 
+ fused_func = fused_func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + + call = relay.Call(fused_func, [data, weight1, weight2]) + return relay.Function([data, weight1, weight2], call) + + def get_simple_func(): + data = relay.var("data", relay.TensorType((1, 2, 3), "float32")) + out = relay.image.affine_grid(data, (150, 150)) + return relay.Function([data], out) + + def get_func_with_unsupported_op(): + def get_postproc_func(): + data = relay.var("data", shape=((1, 3, 6)), dtype=dtype) + out = relay.nn.relu(data) + func = relay.Function([data], out) + func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + return func + + cls_prob = relay.var("cls_prob", relay.ty.TensorType((1, 3, 3), "float32")) + loc_pred = relay.var("loc_pred", relay.ty.TensorType((1, 3 * 4), "float32")) + anchors = relay.var("anchors", relay.ty.TensorType((1, 3, 4), "float32")) + + mtl = relay.vision.multibox_transform_loc( + cls_prob=cls_prob, loc_pred=loc_pred, anchor=anchors + ) + nms = relay.vision.non_max_suppression(mtl[0], mtl[1], mtl[0], return_indices=False) + out = relay.Call(get_postproc_func(), [nms]) + return relay.Function([cls_prob, loc_pred, anchors], out) + + func = get_func() + mod = tvm.IRModule.from_expr(func) + tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None, target) + + # Relay FuseOps puts two conv2ds to separate functions and results in two tasks. + assert len(tasks) == 2 + assert len(task_weights) == 2 + + func = get_fused_func() + mod = tvm.IRModule.from_expr(func) + tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None, target) + + # By setting the function to primitive, Relay FuseOps will not break it and result in one task. + assert len(tasks) == 1 + assert len(task_weights) == 1 + + func = get_simple_func() + mod = tvm.IRModule.from_expr(func) + tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None, target) - auto_scheduler.enable_relay_integration(False) + # The Relay function without complex ops will not form a task by default. + assert len(tasks) == 0 + assert len(task_weights) == 0 + + tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], None, target, include_simple_tasks=True + ) + + # Every Relay function becomes a task regardless what ops in its body. + assert len(tasks) == 1 + assert len(task_weights) == 1 + + # Func1 (with NMS) -> Func2 (injective). + func = get_func_with_unsupported_op() + mod = tvm.IRModule.from_expr(func) + tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], None, target, include_simple_tasks=True + ) + + # The function with NMS should fail, but the other function with ReLU should be a task. 
+ assert len(tasks) == 1 + assert len(task_weights) == 1 if __name__ == "__main__": test_task_extraction_cuda() + test_task_extraction() diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py index 089f51cdf047..d42373c86626 100644 --- a/tests/python/relay/test_auto_scheduler_tuning.py +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -24,8 +24,6 @@ def tune_network(network, target): - auto_scheduler.enable_relay_integration() - # Extract tasks mod, params = get_network(network) target = tvm.target.Target(target) @@ -50,15 +48,15 @@ def tune_network(network, target): # Compile with the history best with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext(opt_level=3): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): lib = relay.build(mod, target=target, params=params) # Todo(merrymercy): when the cpu backend is upstreamed, do the following things: # 1. compile without history to test the fallback mechanism # 2. check the correctness of layout rewrite / winograd pre-transform - auto_scheduler.enable_relay_integration(False) - @tvm.testing.requires_cuda def test_tuning_cuda(): diff --git a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json index 37a129844390..41b6c0e554ed 100644 --- a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json +++ b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json @@ -1,23 +1,26 @@ # Provide valid schedules for resnet-18. # This is used to run the tutorial on the documentation web server. -{"i": [["[\"d09dc1a6bb90d59c91b68989ad3492ff\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [2, 5, 2, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 2, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 1, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"]]]], "r": [[7.2561e-05], 0, 1.93892, 1605186325], "v": "v0.3"} -{"i": [["[\"8d5a93959138dc7b2ee1f1b3219dfa14\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 4, 1], 1], ["SP", 6, 10, 16, [4, 2, 1, 1], 1], ["SP", 6, 15, 512, [1, 16, 1, 1], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 
1], ["SP", 4, 2, 16, [8], 1], ["SP", 4, 4, 512, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000195701], 0, 2.67988, 1605186412], "v": "v0.3"} -{"i": [["[\"ac6920940de3797cc3f9f9c260675e5d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 2, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 16, [1, 16, 1, 1], 1], ["SP", 6, 15, 512, [2, 1, 4, 1], 1], ["SP", 6, 20, 512, [32, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000162045], 0, 2.32406, 1605186499], "v": "v0.3"} -{"i": [["[\"7e83a2ee5cd5d50282ed19310700046a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [1, 1, 8, 1], 1], ["SP", 6, 15, 512, [2, 64, 1, 1], 1], ["SP", 6, 20, 512, [16, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 
8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [4], 1], ["SP", 4, 4, 512, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [2], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000102843], 0, 2.42044, 1605186574], "v": "v0.3"} -{"i": [["[\"424ba83160af31badc0b098136e1a3b0\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [49], 1], ["SP", 8, 4, 256, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 2, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 49, [1, 7, 1, 7], 1], ["SP", 6, 15, 256, [1, 8, 1, 2], 1], ["SP", 6, 20, 256, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [2], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[9.61516e-05], 0, 2.69389, 1605186690], "v": "v0.3"} -{"i": [["[\"a169cd0053d3a7ca82998fcb62e42c58\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 
256, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 1, 1], 1], ["SP", 6, 15, 256, [1, 4, 8, 1], 1], ["SP", 6, 20, 256, [1, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [49], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 2, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000156995], 0, 2.11666, 1605186772], "v": "v0.3"} -{"i": [["[\"0141ffc4fbabc10cc5a94c954419055b\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [49], 1], ["SP", 8, 4, 256, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 4, 1], 1], ["SP", 6, 10, 49, [1, 49, 1, 1], 1], ["SP", 6, 15, 256, [4, 2, 1, 1], 1], ["SP", 6, 20, 256, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [4], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, 
"auto_unroll_max_step$16"]]]], "r": [[0.000131082], 0, 2.24166, 1605186844], "v": "v0.3"} -{"i": [["[\"c7a6b56bdc04b94c829fb2ef9874019e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 128, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 1, 1, 1], 1], ["SP", 6, 5, 4, [2, 2, 1, 1], 1], ["SP", 6, 10, 196, [2, 7, 2, 1], 1], ["SP", 6, 15, 128, [1, 32, 1, 4], 1], ["SP", 6, 20, 128, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [16], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [16], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000166673], 0, 2.43832, 1605186977], "v": "v0.3"} -{"i": [["[\"c035cc8b0568a8e054d06bd7f4950550\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 2, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 1], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [1, 1, 4, 8], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [49], 1], ["SP", 4, 4, 128, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 1024, [1], 1], ["AN", 7, 1, 2], 
["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000108367], 0, 3.89975, 1605187058], "v": "v0.3"} -{"i": [["[\"c5ee3e05edd9754492d0763aa41fd025\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 2, 2], 1], ["SP", 6, 10, 196, [1, 4, 7, 1], 1], ["SP", 6, 15, 128, [2, 16, 2, 1], 1], ["SP", 6, 20, 128, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.0137e-05], 0, 2.28468, 1605187134], "v": "v0.3"} -{"i": [["[\"022ebb6b7c55c5ed030421380ec83a04\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 2, 2, 1], 1], ["SP", 3, 10, 28, [1, 14, 1, 1], 1], ["SP", 3, 15, 128, [1, 2, 16, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 64, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 384, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 24, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 
2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[9.74847e-05], 0, 1.97907, 1605187182], "v": "v0.3"} -{"i": [["[\"de0df0893e01892cfe69f7bc2c24111f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 2, 1], 1], ["SP", 6, 10, 196, [1, 7, 14, 1], 1], ["SP", 6, 15, 64, [2, 4, 2, 1], 1], ["SP", 6, 20, 64, [1, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [16], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [8], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.09982e-05], 0, 3.52776, 1605187295], "v": "v0.3"} -{"i": [["[\"f2e3c09a00e7d0a9897f70497e089f1e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [7], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 1], 1], ["SP", 6, 10, 196, [1, 14, 1, 2], 1], ["SP", 6, 15, 64, [1, 2, 8, 2], 1], ["SP", 6, 20, 64, [4, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [4], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 512, [1], 1], ["AN", 7, 
1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[7.15745e-05], 0, 3.73944, 1605187404], "v": "v0.3"} -{"i": [["[\"fa26946d7ac51126bfa859cb183f9ca1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [7], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 3], 1], ["SP", 6, 5, 6, [1, 2, 3, 1], 1], ["SP", 6, 10, 196, [1, 4, 1, 7], 1], ["SP", 6, 15, 64, [1, 8, 2, 1], 1], ["SP", 6, 20, 64, [2, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [4], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 144, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 252, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[6.79478e-05], 0, 5.10446, 1605187506], "v": "v0.3"} -{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [2, 14, 1, 1], 1], ["SP", 3, 10, 112, [1, 8, 2, 1], 1], ["SP", 3, 15, 64, [2, 2, 2, 2], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [7, 1], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 1176, [21], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 189, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], 
["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[5.53397e-05], 0, 2.2607, 1605187548], "v": "v0.3"} -{"i": [["[\"bf78a7bf0209980f72953637dfd14a6f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [2, 28, 1, 1], 1], ["SP", 3, 10, 56, [1, 2, 2, 1], 1], ["SP", 3, 15, 64, [2, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [1, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 16, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[8.11163e-06], 0, 1.93343, 1605187596], "v": "v0.3"} -{"i": [["[\"6630936c26852f2b89dbfa2ff37fbb9c\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [2, 2, 1, 1], 1], ["SP", 3, 10, 28, [1, 2, 1, 1], 1], ["SP", 3, 15, 128, [2, 8, 4, 2], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [4, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 256, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 96, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[1.40126e-05], 0, 1.82931, 1605187624], "v": "v0.3"} -{"i": [["[\"ba5f918733ccbbd4a1d7fd3724665a2f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 7, 1, 2], 1], ["SP", 3, 10, 14, [1, 1, 1, 2], 1], ["SP", 3, 15, 256, [4, 64, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], 
["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 32, [16], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 324, [6], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[2.35384e-05], 0, 1.78652, 1605187663], "v": "v0.3"} -{"i": [["[\"21ad409d72953de188314010134e3acd\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 32, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [1, 64], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [4], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 4, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[3.09105e-05], 0, 1.85659, 1605187687], "v": "v0.3"} -{"i": [["[\"1f6cd3637ec856bf5cf5010a623eed05\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 7, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 8, 2, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 256, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 96, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 48, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000154153], 0, 2.18601, 1605187723], "v": "v0.3"} -{"i": [["[\"81aae4b8e2c076a4014d403e8a2c70a1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 2], 1], ["SP", 3, 10, 14, [1, 14, 1, 1], 1], ["SP", 3, 15, 256, [1, 32, 1, 2], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 128, [2, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 
12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 72, [24], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[5.97747e-05], 0, 2.13918, 1605187759], "v": "v0.3"} +{"i": [["[\"b32ed43fb351136894c322ee49097a1a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 4, 1, 1000, [50], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$0"], ["PR", 3, 0, "auto_unroll_max_step$1024"]]]], "r": [[4.54041e-06], 0, 1.27943, 1605490839], "v": "v0.3"} +{"i": [["[\"d09dc1a6bb90d59c91b68989ad3492ff\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 4], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 4, [4], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 4, [2], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.03431e-05], 0, 2.09134, 1605490924], "v": "v0.3"} +{"i": [["[\"7de313da0ca29a8c63f647791692430d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [8], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[5.51259e-06], 0, 1.30207, 1605491060], "v": "v0.3"} +{"i": [["[\"944921d3fd999ba7aa9ffe5a592a9241\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [56], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$512"]]]], "r": [[2.24305e-05], 0, 1.60311, 1605493879], "v": "v0.3"} +{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [2, 1, 1, 8], 1], ["SP", 3, 10, 112, [1, 8, 1, 1], 1], ["SP", 3, 15, 64, [2, 16, 2, 1], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 
1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 294, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 441, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[7.63468e-05], 0, 2.59544, 1605493932], "v": "v0.3"} +{"i": [["[\"bf78a7bf0209980f72953637dfd14a6f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [7, 4, 2, 1], 1], ["SP", 3, 10, 56, [1, 2, 2, 1], 1], ["SP", 3, 15, 64, [2, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [8, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 32, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 128, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.26775e-05], 0, 1.94247, 1605494103], "v": "v0.3"} +{"i": [["[\"6630936c26852f2b89dbfa2ff37fbb9c\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 1, 2], 1], ["SP", 3, 10, 28, [1, 1, 2, 1], 1], ["SP", 3, 15, 128, [1, 16, 1, 8], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 128, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.13004e-05], 0, 1.86312, 1605494224], "v": "v0.3"} +{"i": [["[\"ba5f918733ccbbd4a1d7fd3724665a2f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 2, 1], 1], 
["SP", 3, 10, 14, [1, 14, 1, 1], 1], ["SP", 3, 15, 256, [1, 8, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 48, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.29425e-05], 0, 1.70493, 1605494303], "v": "v0.3"} +{"i": [["[\"21ad409d72953de188314010134e3acd\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [2, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 16, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.04683e-05], 0, 1.80217, 1605494406], "v": "v0.3"} +{"i": [["[\"022ebb6b7c55c5ed030421380ec83a04\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 1, 7], 1], ["SP", 3, 10, 28, [1, 4, 1, 1], 1], ["SP", 3, 15, 128, [1, 32, 2, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 64, [1, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 72, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 348, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[4.93528e-05], 0, 1.74125, 
1605498773], "v": "v0.3"} +{"i": [["[\"ac6920940de3797cc3f9f9c260675e5d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [8], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 8, 1], 1], ["SP", 6, 15, 512, [1, 32, 2, 1], 1], ["SP", 6, 20, 512, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [49], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000129562], 0, 3.40317, 1605500470], "v": "v0.3"} +{"i": [["[\"1f6cd3637ec856bf5cf5010a623eed05\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 1, 7], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 16, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 256, [4, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 288, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 1440, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[7.57476e-05], 0, 2.59558, 1605501054], "v": "v0.3"} +{"i": [["[\"c5ee3e05edd9754492d0763aa41fd025\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 
8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 2, 1], 1], ["SP", 6, 10, 196, [4, 1, 1, 7], 1], ["SP", 6, 15, 128, [2, 32, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [49], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[6.77244e-05], 0, 2.67201, 1605501438], "v": "v0.3"} +{"i": [["[\"c035cc8b0568a8e054d06bd7f4950550\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 128, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 7, 1], 1], ["SP", 6, 15, 128, [8, 16, 1, 1], 1], ["SP", 6, 20, 128, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [16], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[6.23875e-05], 0, 
1.93274, 1605501606], "v": "v0.3"} +{"i": [["[\"f2e3c09a00e7d0a9897f70497e089f1e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 2, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 4], 1], ["SP", 6, 15, 64, [2, 16, 1, 1], 1], ["SP", 6, 20, 64, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[6.65448e-05], 0, 2.94376, 1605501803], "v": "v0.3"} +{"i": [["[\"81aae4b8e2c076a4014d403e8a2c70a1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 2], 1], ["SP", 3, 10, 14, [2, 7, 1, 1], 1], ["SP", 3, 15, 256, [1, 32, 2, 1], 1], ["SP", 3, 20, 3, [1, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 192, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 240, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[6.31245e-05], 0, 1.9322, 1605501903], "v": "v0.3"} +{"i": [["[\"7e83a2ee5cd5d50282ed19310700046a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], 
["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [1, 2, 4, 2], 1], ["SP", 6, 15, 512, [2, 32, 1, 1], 1], ["SP", 6, 20, 512, [16, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [8], 1], ["SP", 4, 4, 512, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000143154], 0, 2.20107, 1605502293], "v": "v0.3"} +{"i": [["[\"424ba83160af31badc0b098136e1a3b0\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 49, 1, 1], 1], ["SP", 6, 15, 256, [8, 2, 2, 2], 1], ["SP", 6, 20, 256, [2, 16], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [1], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [64], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": 
[[0.000115017], 0, 3.89122, 1605502608], "v": "v0.3"} +{"i": [["[\"c7a6b56bdc04b94c829fb2ef9874019e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [4], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [2, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 1, 2, 14], 1], ["SP", 6, 15, 128, [1, 32, 1, 2], 1], ["SP", 6, 20, 128, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [64], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[7.20936e-05], 0, 3.36582, 1605502968], "v": "v0.3"} +{"i": [["[\"0141ffc4fbabc10cc5a94c954419055b\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 49, 1, 1], 1], ["SP", 6, 15, 256, [8, 1, 2, 2], 1], ["SP", 6, 20, 256, [1, 32], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], 
["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [2], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000122349], 0, 4.2774, 1605503135], "v": "v0.3"} +{"i": [["[\"a169cd0053d3a7ca82998fcb62e42c58\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 1, 7], 1], ["SP", 6, 15, 256, [8, 4, 1, 1], 1], ["SP", 6, 20, 256, [1, 16], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 256, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[7.9277e-05], 0, 3.07064, 1605503350], "v": "v0.3"} +{"i": [["[\"fa26946d7ac51126bfa859cb183f9ca1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 1], 1], ["SP", 6, 5, 6, [1, 2, 1, 1], 1], ["SP", 6, 10, 196, [7, 7, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 64, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 
2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[7.64176e-05], 0, 5.45091, 1605503568], "v": "v0.3"} +{"i": [["[\"de0df0893e01892cfe69f7bc2c24111f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 64, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [14, 7, 1, 2], 1], ["SP", 6, 15, 64, [1, 16, 1, 2], 1], ["SP", 6, 20, 64, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [64], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 4, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[7.60496e-05], 0, 3.00771, 1605503805], "v": "v0.3"} +{"i": [["[\"8d5a93959138dc7b2ee1f1b3219dfa14\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [1, 1, 4, 4], 1], ["SP", 6, 15, 512, [1, 64, 1, 1], 1], ["SP", 6, 20, 512, [1, 32], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", 
[6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [8], 1], ["SP", 4, 4, 512, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [16], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [16], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135079], 0, 2.40957, 1605504233], "v": "v0.3"} diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 4756ea390b5c..723b8d15ea88 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -102,10 +102,10 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape ) elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" mod, params = relay.testing.squeezenet.get_workload( version="1.1", batch_size=batch_size, - layout=layout, dtype=dtype, image_shape=image_shape, ) @@ -148,9 +148,6 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # latency of a task and :code:`weight[t]` is the weight of the task. # The task scheduler will just optimize this objective. 
-# Enable auto-scheduler in relay -auto_scheduler.enable_relay_integration() - # Extract tasks from the network print("Extract tasks...") mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) @@ -219,29 +216,32 @@ def run_tuning(): # ---------------------------------------------------------------------- # | ID | Latency (ms) | Speed (GFLOPS) | Trials | # ------------------------------------------------- -# | 0 | 0.014 | 72.07 | 64 | -# | 1 | 0.185 | 1250.68 | 128 | -# | 2 | 0.142 | 1626.36 | 192 | -# | 3 | 0.137 | 1689.42 | 128 | -# | 4 | 0.097 | 1189.75 | 128 | -# | 5 | 0.092 | 2505.25 | 128 | -# | 6 | 0.080 | 2893.08 | 128 | -# | 7 | 0.119 | 1947.84 | 128 | -# | 8 | 0.090 | 1292.62 | 64 | -# | 9 | 0.107 | 2172.30 | 64 | -# | 10 | 0.095 | 2439.36 | 64 | -# | 11 | 0.077 | 3003.22 | 64 | -# | 12 | 0.068 | 1695.13 | 64 | -# | 13 | 0.058 | 3979.29 | 64 | -# | 14 | 0.048 | 4859.95 | 128 | -# | 15 | 0.073 | 3151.76 | 64 | -# | 16 | 0.056 | 4265.94 | 64 | -# | 17 | 0.009 | 2754.90 | 64 | -# | 18 | 0.011 | 1156.08 | 64 | -# | 19 | 0.013 | 955.80 | 64 | -# | 20 | 0.029 | 437.71 | 64 | +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | # ------------------------------------------------- -# Estimated total latency: 1.649 ms Trials: 1920 Used time : 3598 s Next ID: 9 +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 # # This table lists the latency and (estimated) speed of all tasks. # It also lists the allocation of measurement trials for all tasks. 
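A rough sketch of the objective behind the "Estimated total latency" line above (here `latency`, `weight` and `num_tasks` are illustrative names, not variables printed by the tutorial):

    # weighted sum of per-task latencies; weight[t] counts how often task t appears in the network
    estimated_total_latency = sum(latency[t] * weight[t] for t in range(num_tasks))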
@@ -276,7 +276,7 @@ def run_tuning(): # Compile with the history best print("Compile...") with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext(opt_level=3): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, params=params) # Create graph runtime From 82abda7021929f019fce2b673bf51a0800b336fb Mon Sep 17 00:00:00 2001 From: Taylor Zowtuk Date: Tue, 17 Nov 2020 01:14:01 -0700 Subject: [PATCH 179/258] Make AutoScheduler handling of errors during measure consistent with AutoTvm (#6909) * Match ansor handling of 'too many errors' during measure to that of autoTVM and match default level of logging * Set correct level of verbosity for debug mode Co-authored-by: Lianmin Zheng * Lint * trigger CI Co-authored-by: Lianmin Zheng Co-authored-by: Taylor Zowtuk 84152750 --- python/tvm/autotvm/tuner/tuner.py | 2 +- src/auto_scheduler/measure.cc | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index ba54291ada67..fa609306140b 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -176,7 +176,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr if error_ct > 150: logging.basicConfig() - logger.warning("Too many errors happen in the tuning. Now is in debug mode") + logger.warning("Too many errors happen in the tuning. Switching to debug mode.") logger.setLevel(logging.DEBUG) else: logger.setLevel(old_level) diff --git a/src/auto_scheduler/measure.cc b/src/auto_scheduler/measure.cc index c77bafc84e6e..03585ea40c03 100755 --- a/src/auto_scheduler/measure.cc +++ b/src/auto_scheduler/measure.cc @@ -218,6 +218,8 @@ Array ProgramMeasurerNode::Measure(const SearchTask& task, batch_size = builder->n_parallel * 2; } + int old_verbosity = verbose; + StdCout(verbose) << "Get " << inputs.size() << " programs to measure." << std::endl; for (size_t i = 0; i < inputs.size(); i += batch_size) { @@ -270,7 +272,11 @@ Array ProgramMeasurerNode::Measure(const SearchTask& task, } if (error_ct > max_continuous_error) { - LOG(FATAL) << "Too many errors happened during tuning"; + LOG(WARNING) << "Too many errors happened during tuning. Switching to debug mode." 
+ << std::endl; + verbose = 2; + } else { + verbose = old_verbosity; } } From ddc295f27085b5b568dcd561f84a133c207dca34 Mon Sep 17 00:00:00 2001 From: BhushanIMG <71267391+BhushanIMG@users.noreply.github.com> Date: Tue, 17 Nov 2020 05:04:12 -0800 Subject: [PATCH 180/258] [Relay] Add space_to_batch_nd and batch_to_space_nd operators (#6477) * [Relay] Add space_to_batch_nd and batch_to_space_nd operators * Correct python-format errors * correct lint errors * tflite frontend to use batch_to_space and space_to_batch operators * Add new pad_value parameter with default value 0 for space_to_batch_nd and correct variable names * Fix cppdocs - add documentation for pad_value --- include/tvm/relay/attrs/nn.h | 28 +++ include/tvm/topi/nn.h | 178 ++++++++++++++ python/tvm/relay/frontend/tensorflow.py | 88 +------ python/tvm/relay/frontend/tflite.py | 85 +------ python/tvm/relay/op/nn/_nn.py | 5 + python/tvm/relay/op/nn/nn.py | 61 +++++ python/tvm/relay/op/op_attrs.py | 10 + python/tvm/topi/nn/__init__.py | 2 + python/tvm/topi/nn/batch_to_space_nd.py | 49 ++++ python/tvm/topi/nn/space_to_batch_nd.py | 52 +++++ python/tvm/topi/testing/__init__.py | 2 + python/tvm/topi/testing/batch_to_space_nd.py | 97 ++++++++ python/tvm/topi/testing/space_to_batch_nd.py | 93 ++++++++ src/relay/op/nn/nn.cc | 218 ++++++++++++++++++ src/topi/nn.cc | 8 + tests/python/relay/test_op_level5.py | 56 +++++ .../python/test_topi_batch_to_space_nd.py | 70 ++++++ .../python/test_topi_space_to_batch_nd.py | 72 ++++++ 18 files changed, 1018 insertions(+), 156 deletions(-) create mode 100644 python/tvm/topi/nn/batch_to_space_nd.py create mode 100644 python/tvm/topi/nn/space_to_batch_nd.py create mode 100644 python/tvm/topi/testing/batch_to_space_nd.py create mode 100644 python/tvm/topi/testing/space_to_batch_nd.py create mode 100644 tests/python/topi/python/test_topi_batch_to_space_nd.py create mode 100644 tests/python/topi/python/test_topi_space_to_batch_nd.py diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index b2555de6d35e..e697ac45bd12 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -1324,6 +1324,34 @@ struct CorrelationAttrs : public tvm::AttrsNode<CorrelationAttrs> { } }; // struct CorrelationAttrs +/*! \brief Attributes used in SpaceToBatchND operator */ +struct SpaceToBatchNDAttrs : public tvm::AttrsNode<SpaceToBatchNDAttrs> { + Array<Integer> block_shape; + Array<Array<IndexExpr>> paddings; + double pad_value; + + TVM_DECLARE_ATTRS(SpaceToBatchNDAttrs, "relay.attrs.SpaceToBatchNDAttrs") { + TVM_ATTR_FIELD(block_shape) + .set_default(Array<Integer>({1, 1})) + .describe("1-D containing block size for each spatial dimension."); + TVM_ATTR_FIELD(paddings).describe("2-D containing paddings for each spatial dimension."); + TVM_ATTR_FIELD(pad_value).set_default(0.0).describe("The value used for padding."); + } +}; // struct SpaceToBatchNDAttrs + +/*!
\brief Attributes used in BatchToSpaceND operator */ +struct BatchToSpaceNDAttrs : public tvm::AttrsNode { + Array block_shape; + Array> crops; + + TVM_DECLARE_ATTRS(BatchToSpaceNDAttrs, "relay.attrs.BatchToSpaceNDAttrs") { + TVM_ATTR_FIELD(block_shape) + .set_default(Array({1, 1})) + .describe("1-D containing block size for each spatial dimension."); + TVM_ATTR_FIELD(crops).describe("2-D containing amount to crop from spatial dimension."); + } +}; // struct BatchToSpaceNDAttrs + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_NN_H_ diff --git a/include/tvm/topi/nn.h b/include/tvm/topi/nn.h index ba1be3424fcc..f958048f13c3 100644 --- a/include/tvm/topi/nn.h +++ b/include/tvm/topi/nn.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -459,6 +460,183 @@ inline tvm::te::Tensor group_conv2d_ngchw(const tvm::te::Tensor& I, const tvm::t return tvm::te::compute(output_shape, l, name, tag); } +/*! + * \brief Divide spatial dimensions of the input into a grid of blocks. + * + * \param data The input tensor. + * \param block_shape The size of the spatial block. + * \param pad_before The zero-padding size before each spatial dimension. + * \param pad_after The zero-padding size after each spatial dimension. + * \param pad_value The value used for padding. + * \param name The name of the operation. + * \param tag The tag to mark the operation. + * + * \return A Tensor whose op member is the space_to_batch_nd operation + */ +inline tvm::te::Tensor space_to_batch_nd(const tvm::te::Tensor& data, + const tvm::Array& block_shape, + const tvm::Array& pad_before, + const tvm::Array& pad_after, + PrimExpr pad_value = PrimExpr(), + std::string name = "space_to_batch_nd", + std::string tag = kInjective) { + tvm::te::Tensor padded_t; + CHECK_EQ(pad_before.size(), pad_after.size()); + CHECK_EQ(block_shape.size(), pad_before.size()) + << "Paddings must be provided for each spatial dimension"; + tvm::Array pad_before_int32; + tvm::Array pad_after_int32; + + // pad size for batch dimension is 0 + pad_before_int32.push_back(tvm::cast(tvm::DataType::Int(32), 0)); + pad_after_int32.push_back(tvm::cast(tvm::DataType::Int(32), 0)); + // insert pad sizes given for spatial dimensions + for (const auto& ele : pad_before) { + pad_before_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele)); + } + for (const auto& ele : pad_after) { + pad_after_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele)); + } + + // pad the input with paddings provided + if (!pad_value.defined()) { + pad_value = tvm::tir::make_const(data->dtype, 0); + } + padded_t = pad(data, pad_before_int32, pad_after_int32, pad_value); + + auto input_shape = data->shape; + auto padded_shape = padded_t->shape; + + // infer shapes + tvm::Array r_shape; + tvm::Array axis; + tvm::Array o_shape; + + size_t num_block_dims = block_shape.size(); + int batch = static_cast(GetConstInt(input_shape[0])); + tvm::PrimExpr block_shape_prod(1); + r_shape.push_back(batch); + + for (size_t i = 1; i <= num_block_dims; i++) { + int padded_input = static_cast(GetConstInt(padded_shape[i])); + int block_size = static_cast(GetConstInt(block_shape[i - 1])); + CHECK_EQ((padded_input % block_size), 0) + << "(" << i + << ")th " + "Input dimension after padding (" + << padded_input << ")" + << " must be divisible by its block size (" << block_size << ")"; + + r_shape.push_back(div(padded_shape[i], block_shape[i - 1])); + r_shape.push_back(block_shape[i - 1]); + block_shape_prod *= block_shape[i - 1]; + axis.push_back(Integer(r_shape.size() - 
1)); // index of block_shape[i - 1] + } + + size_t n = axis.size(); + axis.push_back(0); // batch is at index 0 + // index of (padded_shape[i] / block_shape[i - 1]) in r_shape + for (size_t i = 0; i < n; i++) { + axis.push_back(static_cast(GetConstInt(axis[i] - 1))); + } + o_shape.push_back(tvm::PrimExpr(batch) * block_shape_prod); + for (size_t i = 1; i <= num_block_dims; i++) { + o_shape.push_back(div(padded_shape[i], block_shape[i - 1])); + } + // append remaining shape + for (size_t i = num_block_dims + 1; i < input_shape.size(); i++) { + r_shape.push_back(input_shape[i]); + axis.push_back(Integer(r_shape.size() - 1)); // index of remaining shape in r_shape + o_shape.push_back(input_shape[i]); + } + + tvm::te::Tensor output = reshape(padded_t, r_shape); + output = transpose(output, axis); + output = reshape(output, o_shape); + + return output; +} + +/*! + * \brief Reshape the batch dimension into spatial dimensions. + * + * \param data The input tensor. + * \param block_shape The size of the spatial block. + * \param crop_begin_list The begin crop size for each spatial dimension. + * \param crop_end_list The end crop size for each spatial dimension. + * \param name The name of the operation. + * \param tag The tag to mark the operation. + * + * \return A Tensor whose op member is the batch_to_space_nd operation + */ +inline tvm::te::Tensor batch_to_space_nd(const tvm::te::Tensor& data, + const tvm::Array& block_shape, + const tvm::Array& crop_begin_list, + const tvm::Array& crop_end_list, + std::string name = "batch_to_space_nd", + std::string tag = kInjective) { + // Construct shapes for reshape and transpose operation + Array in_shape = data->shape; + Array r_shape; + Array axis; + size_t num_block_dims = block_shape.size(); + size_t num_input_dims = in_shape.size(); + tvm::PrimExpr block_shape_prod(1); + int batch = static_cast(GetConstInt(in_shape[0])); + + for (size_t i = 0; i < num_block_dims; i++) { + r_shape.push_back(block_shape[i]); + block_shape_prod *= block_shape[i]; + } + axis.push_back(Integer(r_shape.size())); // axis of (batch / block_shape_prod) + r_shape.push_back(batch / block_shape_prod); + + for (size_t i = 1; i < num_input_dims; i++) { + axis.push_back(Integer(r_shape.size())); // axis of in_shape[i] + if (axis.size() < (num_block_dims + num_input_dims)) { + axis.push_back(Integer(r_shape.size() - (num_block_dims + 1))); // axis of block_shape[i] + } + r_shape.push_back(in_shape[i]); + } + + Array r_p_shape; + r_p_shape.push_back(batch / block_shape_prod); + for (size_t i = 1; i <= num_block_dims; i++) { + r_p_shape.push_back(in_shape[i] * block_shape[i - 1]); + } + for (size_t i = num_block_dims + 1; i < num_input_dims; i++) { + r_p_shape.push_back(in_shape[i]); + } + + tvm::te::Tensor out; + out = reshape(data, r_shape); + out = transpose(out, axis); + out = reshape(out, r_p_shape); + + // Crop the start and end of dimensions of out + Array begin_idx, end_idx, strides; + for (size_t i = 0; i < r_p_shape.size(); ++i) { + strides.push_back(Integer(1)); + if (i > 0 && i <= num_block_dims) { + // prepare begin and end index for spatial dimensions + int begin_i = static_cast(GetConstInt(crop_begin_list[i - 1])); + int end_i = static_cast(GetConstInt(crop_end_list[i - 1])); + int out_i = static_cast(GetConstInt(r_p_shape[i])); + CHECK_GT(out_i, (begin_i + end_i)) + << "Incorrect crop sizes for (" << i << ")th dim, can not crop more than" + << " output size" << out_i << " vs " << (begin_i + end_i); + begin_idx.push_back(begin_i); + end_idx.push_back(out_i - end_i); + 
} else { + // ignore the batch and remaining dimension + begin_idx.push_back(Integer(0)); + end_idx.push_back(static_cast(GetConstInt(r_p_shape[i]))); + } + } + + out = strided_slice(out, begin_idx, end_idx, strides); + return out; +} } // namespace topi } // namespace tvm #endif // TVM_TOPI_NN_H_ diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 38431933f7cf..abdc1a0e3114 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -2060,8 +2060,6 @@ def _impl(inputs, attr, params, mod): def _space_to_batch_nd(): def _impl(inputs, attr, params, mod): - input_node = inputs[0] - input_shape = _infer_shape(input_node, mod) try: block_shape = _get_list_param(params, inputs[1]) except (IndexError, KeyError, AttributeError): @@ -2075,48 +2073,18 @@ def _impl(inputs, attr, params, mod): if len(paddings.shape) == 1: paddings = np.expand_dims(paddings, axis=0) paddings = paddings.tolist() - N = len(input_shape) - M = len(block_shape) - batch = input_shape[0] - remaining_shape_length = N - M - 1 - paddings = [(0, 0)] + paddings + [(0, 0)] * remaining_shape_length - # From https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/space-to-batch-n-d: - # Zero-pad the start and end of dimensions [1, ..., M] of the input according to paddings - # to produce padded of shape padded_shape. - padded = tvm.relay.nn.pad(input_node, pad_width=paddings) - # Reshape padded to reshaped_padded of shape: - # [batch] + [padded_shape[1] / block_shape[0], block_shape[0], ..., - # padded_shape[M] / block_shape[M-1], block_shape[M-1]] + remaining_shape - shape1 = [batch] + [item for i in range(M) for item in [-4, -1, block_shape[i]]] + [-2] - reshaped_padded = tvm.relay.reshape(padded, newshape=shape1) - # Permute dimensions of reshaped_padded to produce permuted_reshaped_padded of shape: - # block_shape + [batch] + [padded_shape[1] / block_shape[0], ..., - # padded_shape[M] / block_shape[M-1]] + remaining_shape - axes = ( - [2 * i + 2 for i in range(M)] - + [0] - + [2 * i + 1 for i in range(M)] - + list(range(1 + 2 * M, 1 + 2 * M + remaining_shape_length)) - ) - permuted_reshaped_padded = tvm.relay.transpose(reshaped_padded, axes=axes) - permuted_reshaped_padded_shape = _infer_shape(permuted_reshaped_padded, mod) - # Reshape permuted_reshaped_padded to flatten block_shape into the batch dimension, - # producing an output tensor of shape: - # [batch * prod(block_shape)] + [padded_shape[1] / block_shape[0], ..., - # padded_shape[M] / block_shape[M-1]] + remaining_shape - shape2 = [batch * np.prod(block_shape)] + list(permuted_reshaped_padded_shape)[M + 1 :] - reshaped_permuted_reshaped_padded = tvm.relay.reshape( - permuted_reshaped_padded, newshape=shape2 - ) - return reshaped_permuted_reshaped_padded + + attr["block_shape"] = block_shape + attr["paddings"] = paddings + out = AttrCvt("space_to_batch_nd", ignores=["Tblock_shape", "Tpaddings"])([inputs[0]], attr) + + return out return _impl def _batch_to_space_nd(): def _impl(inputs, attr, params, mod): - input_node = inputs[0] - input_shape = _infer_shape(input_node, mod) try: block_shape = _get_list_param(params, inputs[1]) except (IndexError, KeyError, AttributeError): @@ -2130,46 +2098,12 @@ def _impl(inputs, attr, params, mod): if len(crops.shape) == 1: crops = np.expand_dims(crops, axis=0) crops = crops.tolist() - M = len(block_shape) - batch = input_shape[0] - # From https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/batch-to-space-n-d: - # Reshape input to 
reshaped of shape: - # [block_shape[0], ..., block_shape[M-1], batch / prod(block_shape), - # input_shape[1], ..., input_shape[N-1]] - shape1 = block_shape + [batch // np.prod(block_shape)] + list(input_shape[1:]) - reshaped = tvm.relay.reshape(input_node, newshape=shape1) - # Permute dimensions of reshaped to produce permuted of shape - # [batch / prod(block_shape), input_shape[1], block_shape[0], ..., - # input_shape[M], block_shape[M-1], input_shape[M+1], ..., input_shape[N-1]] - axes = ( - [M] - + [axis for i in range(M) for axis in [M + i + 1, i]] - + list(range(2 * M + 1, len(shape1))) - ) - permuted = tvm.relay.transpose(reshaped, axes=axes) - # Reshape permuted to produce reshaped_permuted of shape - # [batch / prod(block_shape), input_shape[1] * block_shape[0], ..., - # input_shape[M] * block_shape[M-1], input_shape[M+1], ..., input_shape[N-1]] - shape2 = [0] + [-3] * M + [-2] - reshaped_permuted = tvm.relay.reshape(permuted, newshape=shape2) - # Crop the start and end of dimensions [1, ..., M] of reshaped_permuted according to crops - # to produce the output of shape: - # [batch / prod(block_shape), input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1], - # ..., input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1], - # input_shape[M+1], ..., input_shape[N-1]] - reshaped_permuted_shape = _infer_shape(reshaped_permuted, mod) - cropped = reshaped_permuted - for axis in range(1, M + 1): - crop = crops[axis - 1] - if crop != [0, 0]: - indices = tvm.relay.arange( - _expr.const(crop[0]), - _expr.const(reshaped_permuted_shape[axis] - crop[1]), - dtype="int32", - ) - cropped = tvm.relay.take(cropped, indices=indices, axis=axis) - return cropped + attr["block_shape"] = block_shape + attr["crops"] = crops + out = AttrCvt("batch_to_space_nd", ignores=["Tblock_shape", "Tcrops"])([inputs[0]], attr) + + return out return _impl diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 6da06ac4a20b..623aeee358a6 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -2570,46 +2570,12 @@ def convert_batch_to_space_nd(self, op): input_tensor_idx = input_tensor.tensor_idx in_expr = self.get_expr(input_tensor_idx) - input_shape = list(input_tensor.tensor.ShapeAsNumpy()) - batch = input_shape[0] - block_shape = list(self.get_tensor_value(input_tensors[1])) - M = len(block_shape) - - crops = list(self.get_tensor_value(input_tensors[2])) + crops = self.get_tensor_value(input_tensors[2]).tolist() - # From https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/batch-to-space-n-d: - # Reshape input to reshaped of shape - shape1 = block_shape + [batch // np.prod(block_shape)] + input_shape[1:] - reshaped = _op.reshape(in_expr, newshape=shape1) - - # Permute dimensions of reshaped to produce permuted of shape - axes = ( - [M] - + [axis for i in range(M) for axis in [M + i + 1, i]] - + list(range(2 * M + 1, len(shape1))) - ) - permuted = _op.transpose(reshaped, axes=axes) - - # Reshape permuted to produce reshaped_permuted of shape - shape2 = [0] + [-3] * M + [-2] - reshaped_permuted = _op.reshape(permuted, newshape=shape2) - - # Crop the start and end of dimensions [1, ..., M] of reshaped_permuted according to crops - # to produce the output of shape: - reshaped_permuted_shape = _infer_shape(reshaped_permuted) - cropped = reshaped_permuted - for axis in range(1, M + 1): - crop = crops[axis - 1] - if (crop != [0, 0]).any(): - indices = _op.arange( - _expr.const(crop[0]), - _expr.const(reshaped_permuted_shape[axis] - 
crop[1]), - dtype="int32", - ) - cropped = _op.take(cropped, indices=indices, axis=axis) + out = _op.nn.batch_to_space_nd(in_expr, block_shape, crops) - return cropped + return out def convert_space_to_batch_nd(self, op): """space_to_batch_nd implementation.""" @@ -2620,51 +2586,12 @@ def convert_space_to_batch_nd(self, op): input_tensor_idx = input_tensor.tensor_idx in_expr = self.get_expr(input_tensor_idx) - input_shape = list(input_tensor.tensor.ShapeAsNumpy()) - batch = input_shape[0] - N = len(input_shape) - block_shape = list(self.get_tensor_value(input_tensors[1])) - M = len(block_shape) - - paddings = list(self.get_tensor_value(input_tensors[2])) + paddings = self.get_tensor_value(input_tensors[2]).tolist() - # From https://www.tensorflow.org/api_docs/python/tf/space_to_batch_nd: - # Zero-pad the start and end of dimensions [1, ..., M] of the input according to paddings - # to produce padded of shape padded_shape. - remaining_shape_length = N - M - 1 - padded_list = [(0, 0)] + paddings + [(0, 0)] * remaining_shape_length + out = _op.nn.space_to_batch_nd(in_expr, block_shape, paddings) - padded_shape = [] - for element in padded_list: - if isinstance(element, np.ndarray): - element = element.tolist() - - padded_shape.append(element) - - padded_shape = tuple(padded_shape) - padded = _op.nn.pad(in_expr, pad_width=tuple(padded_shape)) - - # Reshape padded to reshaped_padded of shape: - shape1 = [batch] + [item for i in range(M) for item in [-4, -1, block_shape[i]]] + [-2] - reshaped_padded = _op.reshape(padded, newshape=shape1) - - # Permute dimensions of reshaped_padded to produce permuted_reshaped_padded of shape: - axes = ( - [2 * i + 2 for i in range(M)] - + [0] - + [2 * i + 1 for i in range(M)] - + list(range(1 + 2 * M, 1 + 2 * M + remaining_shape_length)) - ) - permuted_reshaped_padded = _op.transpose(reshaped_padded, axes=axes) - permuted_reshaped_padded_shape = _infer_shape(permuted_reshaped_padded) - - # Reshape permuted_reshaped_padded to flatten block_shape into the batch dimension, - # producing an output tensor of shape: - shape2 = [batch * np.prod(block_shape)] + list(permuted_reshaped_padded_shape)[M + 1 :] - reshaped_permuted_reshaped_padded = _op.reshape(permuted_reshaped_padded, newshape=shape2) - - return reshaped_permuted_reshaped_padded + return out def convert_depth_to_space(self, op): """Convert TFLite DEPTH_TO_SPACE""" diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index c235f87d1e99..f7115da64a52 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -746,6 +746,11 @@ def compute_space_to_depth(attrs, inputs, out_dtype): reg.register_pattern("nn.correlation", OpPattern.OUT_ELEMWISE_FUSABLE) +# space_to_batch_nd and batch_to_space_nd +reg.register_injective_schedule("nn.space_to_batch_nd") +reg.register_injective_schedule("nn.batch_to_space_nd") + + ##################### # Shape functions # ##################### diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 4810bdc35bbd..05ca777186bb 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -3158,3 +3158,64 @@ def correlation( return _make.correlation( data1, data2, kernel_size, max_displacement, stride1, stride2, padding, is_multiply, layout ) + + +def space_to_batch_nd(data, block_shape, paddings, pad_value=0): + r"""Divide spatial dimensions of the data into a grid of blocks + and interleave them into batch dim. 
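+ +    For illustration (shapes only, values assumed): with block_shape=[2, 2] and +    zero paddings, a (1, 4, 4, 3) input becomes a (4, 2, 2, 3) output.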
+ + Parameters + ---------- + data : tvm.te.Tensor + N-D with shape [batch, spatial_shape, remaining_shape] + + block_shape : relay.Expr + 1-D of size [M] where M is number of spatial dims, specifies block size + for each spatial dimension. + + paddings : relay.Expr + 2-D of shape [M, 2] where M is number of spatial dims, specifies + [before, after] paddings for each spatial dimension. + + pad_value : float, or relay.Expr, optional, default=0 + The value used for padding. + + Returns + ------- + result : relay.Expr + N-D Tensor with shape + [in_batch * prod(block_shape), + padded_data[1] / block_shape[0], ..., padded_data[M] / block_shape[M-1], + remaining_shape] + """ + + return _make.space_to_batch_nd(data, block_shape, paddings, pad_value) + + +def batch_to_space_nd(data, block_shape, crops): + r"""Reshape the batch dimension into spatial dimensions. + + Parameters + ---------- + data : tvm.te.Tensor + N-D with shape [batch, spatial_shape, remaining_shape] + + block_shape : relay.Expr + 1-D of size [M] where M is number of spatial dims, specifies block size + for each spatial dimension. + + crops : relay.Expr + 2-D of shape [M, 2] where M is number of spatial dims, specifies + [begin, end] crop size for each spatial dimension. + + Returns + ------- + result : relay.Expr + N-D Tensor with shape + [batch / prod(block_shape), + in_shape[1] * block_shape[0] - crops[0,0] - crops[0,1], ..., + in_shape[M] * block_shape[M-1] - crops[M-1, 0] - crops[M-1, 1], + remaining_shape] + """ + + return _make.batch_to_space_nd(data, block_shape, crops) diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 2c5f046bb7e8..bbaded431788 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -537,3 +537,13 @@ class TupleGetItemAttrs(Attrs): @tvm._ffi.register_object("relay.attrs.WithFuncIdAttrs") class WithFuncIdAttrs(Attrs): """Attributes used in with_funcid annotation operators""" + + +@tvm._ffi.register_object("relay.attrs.SpaceToBatchNDAttrs") +class SpaceToBatchNDAttrs(Attrs): + """Attributes used in SpaceToBatchND operators""" + + +@tvm._ffi.register_object("relay.attrs.BatchToSpaceNDAttrs") +class BatchToSpaceNDAttrs(Attrs): + """Attributes used in BatchToSpaceND operators""" diff --git a/python/tvm/topi/nn/__init__.py b/python/tvm/topi/nn/__init__.py index a035f6778c97..2ebbd1d67bd1 100644 --- a/python/tvm/topi/nn/__init__.py +++ b/python/tvm/topi/nn/__init__.py @@ -46,3 +46,5 @@ from .fifo_buffer import * from .depth_to_space import * from .space_to_depth import * +from .space_to_batch_nd import * +from .batch_to_space_nd import * diff --git a/python/tvm/topi/nn/batch_to_space_nd.py b/python/tvm/topi/nn/batch_to_space_nd.py new file mode 100644 index 000000000000..c61a90a7777b --- /dev/null +++ b/python/tvm/topi/nn/batch_to_space_nd.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied.
See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""TVM operator batch_to_space_nd compute.""" +from __future__ import absolute_import +from . import cpp + + +def batch_to_space_nd(data, block_shape, crop_begin_list, crop_end_list): + """Perform batch to space transformation on the data + + Parameters + ---------- + data : tvm.te.Tensor + N-D Tensor with shape [batch, spatial_shape, remaining_shapes], + where spatial_shape has M dimensions. + + block_shape : list of ints + list of size [M] where M is number of spatial dims, specifies block + size for each spatial dimension. + + crop_begin_list : list of ints + list of shape [M] where M is number of spatial dims, specifies + begin crop size for each spatial dimension. + + crop_end_list : list of ints + list of shape [M] where M is number of spatial dims, specifies + end crop size for each spatial dimension. + + Returns + ------- + output : tvm.te.Tensor + """ + + return cpp.nn.batch_to_space_nd(data, block_shape, crop_begin_list, crop_end_list) diff --git a/python/tvm/topi/nn/space_to_batch_nd.py b/python/tvm/topi/nn/space_to_batch_nd.py new file mode 100644 index 000000000000..149f2b6464c6 --- /dev/null +++ b/python/tvm/topi/nn/space_to_batch_nd.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""TVM operator space_to_batch_nd compute.""" +from __future__ import absolute_import +from . import cpp + + +def space_to_batch_nd(data, block_shape, pad_before, pad_after, pad_value=0.0): + """Perform space to batch transformation on the data + + Parameters + ---------- + data : tvm.te.Tensor + N-D Tensor with shape [batch, spatial_shape, remaining_shapes], + where spatial_shape has M dimensions. + + block_shape : list of ints + list of size [M] where M is number of spatial dims, specifies block + size for each spatial dimension. + + pad_before : list of ints + list of shape [M] where M is number of spatial dims, specifies + zero-padding size before each spatial dimension. + + pad_after : list of ints + list of shape [M] where M is number of spatial dims, specifies + zero-padding size after each spatial dimension. + + pad_value : float, optional + The value used for padding.
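+        Note that the padded border is filled with pad_value, so it is +        zero-padding only for the default value of 0.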
+ + Returns + ------- + output : tvm.te.Tensor + """ + + return cpp.nn.space_to_batch_nd(data, block_shape, pad_before, pad_after, pad_value) diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py index 5b23e8f4600e..4f905500d3f1 100644 --- a/python/tvm/topi/testing/__init__.py +++ b/python/tvm/topi/testing/__init__.py @@ -67,3 +67,5 @@ from .adaptive_pool_python import adaptive_pool from .grid_sample_python import affine_grid_python, grid_sample_nchw_python from .matrix_set_diag import matrix_set_diag +from .space_to_batch_nd import space_to_batch_nd_python +from .batch_to_space_nd import batch_to_space_nd_python diff --git a/python/tvm/topi/testing/batch_to_space_nd.py b/python/tvm/topi/testing/batch_to_space_nd.py new file mode 100644 index 000000000000..80af79b8cacb --- /dev/null +++ b/python/tvm/topi/testing/batch_to_space_nd.py @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals +"""Batch to space ND in python""" +import numpy as np +from . import strided_slice_python + + +def batch_to_space_nd_python(data, block_shape, crop_begin_list, crop_end_list): + """Batch to Space operator in python for NHWC layout. + + Parameters + ---------- + data : np.ndarray + N-D with shape [batch, spatial_shape, remaining_shapes], + where spatial_shape has M dimensions. + + block_shape : list of ints + 1-D array of size [M] where M is number of spatial dims, specifies block + size for each spatial dimension. + + crop_begin_list : list of ints + list of shape [M] where M is number of spatial dims, specifies + begin crop size for each spatial dimension. + + crop_end_list : list of ints + list of shape [M] where M is number of spatial dims, specifies + end crop size for each spatial dimension. 
+ + Returns + ------- + b2s_out : np.ndarray + N-D with shape + [batch / prod(block_shape), + in_shape[1] * block_shape[0] - crop_begin_list[0] - crop_end_list[0], ..., + in_shape[M] * block_shape[M-1] - crop_begin_list[M-1] - crop_end_list[M-1], + remaining_shape] + """ + in_shape = data.shape + N = len(in_shape) + M = len(block_shape) + block_shape_prod = np.prod(block_shape) + in_batch = data.shape[0] + axis = [] + r_p_shape = [] + + r_shape = [block_shape[i] for i in range(0, M)] + axis.append(len(r_shape)) + r_shape.append(in_batch // block_shape_prod) + + for i in range(1, N): + axis.append(len(r_shape)) + if len(axis) < (M + N): + axis.append(len(r_shape) - (M + 1)) + r_shape.append(in_shape[i]) + + r_p_shape.append(int((in_batch / block_shape_prod))) + for i in range(1, M + 1): + r_p_shape.append(in_shape[i] * block_shape[i - 1]) + for i in range(M + 1, N): + r_p_shape.append(in_shape[i]) + + b2s_out = np.reshape(data, newshape=r_shape) + b2s_out = np.transpose(b2s_out, axes=axis) + b2s_out = np.reshape(b2s_out, newshape=r_p_shape) + + # Crop the start and end of dimensions of b2s_out + begin_idx = [] + end_idx = [] + strides = [] + + for i, _ in enumerate(r_p_shape): + strides.append(1) + if 0 < i <= M: + # begin and end index for spatial dimensions + begin_idx.append(crop_begin_list[i - 1]) + end_idx.append(r_p_shape[i] - crop_end_list[i - 1]) + else: + begin_idx.append(0) + end_idx.append(r_p_shape[i]) + + b2s_out = strided_slice_python(b2s_out, begin_idx, end_idx, strides) + return b2s_out diff --git a/python/tvm/topi/testing/space_to_batch_nd.py b/python/tvm/topi/testing/space_to_batch_nd.py new file mode 100644 index 000000000000..de88c27e56d6 --- /dev/null +++ b/python/tvm/topi/testing/space_to_batch_nd.py @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals +"""Space to batch ND in python""" +import numpy as np + + +def space_to_batch_nd_python(data, block_shape, pad_before, pad_after, pad_value=0): + """Space to Batch operator in python for NHWC layout. + + Parameters + ---------- + data : np.ndarray + N-D with shape [batch, spatial_shape, remaining_shapes], + where spatial_shape has M dimensions. + + block_shape : list of ints + 1-D array of size [M] where M is number of spatial dims, specifies block + size for each spatial dimension. + + pad_before : list of ints + list of shape [M] where M is number of spatial dims, specifies + zero-padding size before each spatial dimension. + + pad_after : list of ints + list of shape [M] where M is number of spatial dims, specifies + zero-padding size after each spatial dimension. + + pad_value : float, optional + the value used for padding. Defaults to 0. 
+ + Returns + ------- + s2b_out : np.ndarray + N-D with shape [batch * prod(block_shape), + padded_data[1] / block_shape[0], ..., padded_data[M] / block_shape[M-1], + remaining_shape] + """ + M = len(block_shape) + in_batch = data.shape[0] + block_shape_prod = np.prod(block_shape) + + # Apply padding to input data + input_shape = data.shape + # Add the paddings for batch and remaining dims + paddings = map(list, zip(pad_before, pad_after)) + paddings = [[0, 0]] + list(paddings) + [[0, 0]] * (data.ndim - 1 - M) + padded_data = np.pad(data, paddings, mode="constant", constant_values=pad_value) + padded_shape = padded_data.shape + + # Get the reshape shape and transpose axes + r_shape = [] + trans_axis = [] + r_shape.append(in_batch) + for i in range(1, M + 1): + r_shape.append((int(padded_shape[i] // block_shape[i - 1]))) + r_shape.append(block_shape[i - 1]) + trans_axis.append(len(r_shape) - 1) + + axis_len = len(trans_axis) + trans_axis.append(0) + for i in range(axis_len): + trans_axis.append(trans_axis[i] - 1) + + out_shape = [] + out_shape.append(int((in_batch * block_shape_prod))) + for i in range(1, M + 1): + out_shape.append(int(padded_shape[i] // block_shape[i - 1])) + + for i in range(M + 1, len(input_shape)): + r_shape.append(input_shape[i]) + trans_axis.append(len(r_shape) - 1) + out_shape.append(input_shape[i]) + + s2b_out = np.reshape(padded_data, newshape=r_shape) + s2b_out = np.transpose(s2b_out, axes=trans_axis) + s2b_out = np.reshape(s2b_out, newshape=out_shape) + + return s2b_out diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index ea25c1a9c0f9..816b98038e46 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -1145,5 +1145,223 @@ RELAY_REGISTER_OP("nn.space_to_depth") .set_support_level(5) .add_type_rel("SpaceToDepth", SpaceToDepthRel); +// Positional relay function to create SpaceToBatchND operator +// used by frontend FFI +TVM_REGISTER_NODE_TYPE(SpaceToBatchNDAttrs); + +Expr MakeSpaceToBatchND(Expr data, Array block_shape, Array> paddings, + double pad_value) { + auto attrs = make_object(); + attrs->block_shape = std::move(block_shape); + attrs->paddings = std::move(paddings); + attrs->pad_value = pad_value; + static const Op& op = Op::Get("nn.space_to_batch_nd"); + return Call(op, {data}, Attrs(attrs), {}); +} + +bool SpaceToBatchNDRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + + auto* input = types[0].as(); + // Input must be a TensorType + if (input == nullptr) { + CHECK(types[0].as()) + << "SpaceToBatchND: expect input type to be TensorType but got " << types[0]; + return false; + } + + if (input->shape.size() <= 1) return false; + + const auto* param = attrs.as(); + CHECK(param != nullptr); + + auto block_shape = param->block_shape; + auto paddings = param->paddings; + const int bdims = static_cast(block_shape.size()); + const int pdims = static_cast(paddings.size()); + // Paddings must be provided for each spatial dim. 
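+  // Shape relation checked and produced below (sketch): out[0] = in[0] * prod(block_shape), and
+  // out[i] = (in[i] + paddings[i-1][0] + paddings[i-1][1]) / block_shape[i-1] for spatial dims i = 1..M.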
+ CHECK(pdims == bdims) << "SpaceToBatchND: Paddings must be provided for each spatial dim"; + + // Apply paddings to input + auto in_shape = input->shape; + std::vector padded_shape(input->shape.begin(), input->shape.end()); + for (size_t i = 0; i < paddings.size(); i++) { + CHECK_EQ(paddings[i].size(), 2U); + auto pad_before = tir::as_const_int(param->paddings[i][0]); + auto pad_after = tir::as_const_int(param->paddings[i][1]); + auto padding = tir::make_const(input->shape[i].dtype(), *pad_before + *pad_after); + padded_shape[i + 1] = in_shape[i + 1] + padding; + } + + auto block_shape_numele = tir::make_const(DataType::Int(32), 1); + for (size_t i = 0; i < block_shape.size(); i++) { + block_shape_numele *= block_shape[i]; + } + + // Construct output shape + std::vector out_shape(padded_shape); + out_shape[0] = in_shape[0] * block_shape_numele; + for (size_t i = 1; i <= block_shape.size(); i++) { + out_shape[i] = div(padded_shape[i], block_shape[i - 1]); + } + + // Assign output shape + reporter->Assign(types[1], TensorType(Array(out_shape), input->dtype)); + return true; +} + +Array SpaceToBatchNDCompute(const Attrs& attrs, const Array& inputs, + const Type& out_type) { + const auto* param = attrs.as(); + CHECK(param != nullptr); + + auto b_shape = param->block_shape; + auto paddings = param->paddings; + Array pad_before; + Array pad_after; + + for (size_t i = 0; i < paddings.size(); ++i) { + pad_before.push_back(paddings[i][0]); + } + for (size_t i = 0; i < paddings.size(); ++i) { + pad_after.push_back(paddings[i][1]); + } + const auto* out_ttype = out_type.as(); + return Array{ + topi::space_to_batch_nd(inputs[0], b_shape, pad_before, pad_after, + tvm::tir::make_const(out_ttype->dtype, param->pad_value))}; +} + +TVM_REGISTER_GLOBAL("relay.op.nn._make.space_to_batch_nd").set_body_typed(MakeSpaceToBatchND); + +RELAY_REGISTER_OP("nn.space_to_batch_nd") + .describe(R"code(Divide spatial dimensions of the input into a grid of blocks +and interleave them into batch dim. + +- **data**: data is a ND array of shape + (batch, spatial_shapes, remaining_shapes) for NHWC + +- **out**: Output is a ND array of shape + (batch * prod(block_shape), padded_data[1] / block_shape[0], ..., padded_data[M] / block_shape[M-1], + remaining_shape) for NHWC, where M is the number of spatial dimensions. 
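+ +This follows the semantics of TensorFlow's space_to_batch_nd; nn.batch_to_space_nd below is its inverse.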
+ +Example:: + + x = [[[[1], [2]], [[3], [4]]]] + + space_to_batch_nd(x, block_shape = [2, 2]) = + [[[[1]]], [[[2]]], [[[3]]], [[[4]]]] + +)code" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input tensor.") + .set_attrs_type() + .set_support_level(5) + .add_type_rel("SpaceToBatchND", SpaceToBatchNDRel) + .set_attr("FTVMCompute", SpaceToBatchNDCompute) + .set_attr("TOpPattern", kInjective); + +/*****************************************************************/ + +// Positional relay function to create BatchToSpaceND operator +// used by frontend FFI +TVM_REGISTER_NODE_TYPE(BatchToSpaceNDAttrs); + +Expr MakeBatchToSpaceND(Expr data, Array block_shape, Array> crops) { + auto attrs = make_object(); + attrs->block_shape = std::move(block_shape); + attrs->crops = std::move(crops); + static const Op& op = Op::Get("nn.batch_to_space_nd"); + return Call(op, {data}, Attrs(attrs), {}); +} + +bool BatchToSpaceNDRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + + auto* input = types[0].as(); + // Input must be a TensorType + if (input == nullptr) { + CHECK(types[0].as()) + << "BatchToSpaceND: expect input type to be TensorType but got " << types[0]; + return false; + } + + if (input->shape.size() <= 1) return false; + + const auto* param = attrs.as(); + CHECK(param != nullptr); + + auto block_shape = param->block_shape; + auto crops = param->crops; + const int bdims = static_cast(block_shape.size()); + const int cdims = static_cast(crops.size()); + const int indims = static_cast(input->shape.size()); + // crops must be provided for each spatial dim. + CHECK(cdims == bdims) << "BatchToSpaceND: crops must be provided for each spatial dim"; + CHECK(bdims < indims) << "BatchToSpaceND: block_shape must be less than input shape"; + + auto block_shape_numele = tir::make_const(DataType::Int(32), 1); + for (size_t i = 0; i < block_shape.size(); i++) { + block_shape_numele *= block_shape[i]; + } + + auto in_shape = input->shape; + + // Construct output shape + // Start with input shape, only batch and spatial dims shapes are modified. + std::vector out_shape(input->shape.begin(), input->shape.end()); + out_shape[0] = in_shape[0] / block_shape_numele; + for (size_t i = 1; i <= block_shape.size(); i++) { + out_shape[i] = (in_shape[i] * block_shape[i - 1]) - crops[i - 1][0] - crops[i - 1][1]; + } + for (int i = bdims + 1; i < indims; i++) { + out_shape[i] = in_shape[i]; + } + + // Assign output shape + reporter->Assign(types[1], TensorType(Array(out_shape), input->dtype)); + return true; +} + +Array BatchToSpaceNDCompute(const Attrs& attrs, const Array& inputs, + const Type& out_type) { + const auto* param = attrs.as(); + CHECK(param != nullptr); + + auto b_shape = param->block_shape; + auto crops = param->crops; + Array crop_begin_list, crop_end_list; + for (size_t i = 0; i < crops.size(); ++i) { + crop_begin_list.push_back(crops[i][0]); + crop_end_list.push_back(crops[i][1]); + } + + return Array{ + topi::batch_to_space_nd(inputs[0], b_shape, crop_begin_list, crop_end_list)}; +} + +TVM_REGISTER_GLOBAL("relay.op.nn._make.batch_to_space_nd").set_body_typed(MakeBatchToSpaceND); + +RELAY_REGISTER_OP("nn.batch_to_space_nd") + .describe(R"code(Reshape the batch dimension into spatial dimensions. 
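+ +This is the inverse of space_to_batch_nd: blocks that were interleaved into the batch dimension are moved back into the spatial dimensions, and the result is cropped according to crops.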
+ +Example:: + + x = [[[[1]]], [[[2]]], [[[3]]], [[[4]]]] + + batch_to_space_nd(x, block_shape = [2, 2]) = + [[[[1], [2]], [[3], [4]]]] + +)code" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input tensor.") + .set_attrs_type() + .set_support_level(5) + .add_type_rel("BatchToSpaceND", BatchToSpaceNDRel) + .set_attr("FTVMCompute", BatchToSpaceNDCompute) + .set_attr("TOpPattern", kInjective); + } // namespace relay } // namespace tvm diff --git a/src/topi/nn.cc b/src/topi/nn.cc index 2c9546507de6..092fe65e19dc 100644 --- a/src/topi/nn.cc +++ b/src/topi/nn.cc @@ -57,6 +57,14 @@ TVM_REGISTER_GLOBAL("topi.nn.pad").set_body([](TVMArgs args, TVMRetValue* rv) { *rv = pad(args[0], args[1], args[2], args[3]); }); +TVM_REGISTER_GLOBAL("topi.nn.space_to_batch_nd").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = space_to_batch_nd(args[0], args[1], args[2], args[3], args[4]); +}); + +TVM_REGISTER_GLOBAL("topi.nn.batch_to_space_nd").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = batch_to_space_nd(args[0], args[1], args[2], args[3]); +}); + /* Ops from nn/dense.h */ TVM_REGISTER_GLOBAL("topi.nn.dense").set_body([](TVMArgs args, TVMRetValue* rv) { *rv = nn::dense(args[0], args[1], args[2], args[3]); diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index cfb85b6d1e91..5a5a12c9efe0 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -1141,6 +1141,60 @@ def verify_grid_sample(data_shape, grid_shape): verify_grid_sample((4, 4, 16, 32), (4, 2, 32, 32)) +@tvm.testing.uses_gpu +def test_space_to_batch_nd(): + def verify_space_to_batch_nd(dshape, block_shape, paddings): + x_data = np.random.uniform(size=dshape).astype("float32") + pad_before, pad_after = map(list, zip(*paddings)) + ref_res = tvm.topi.testing.space_to_batch_nd_python( + x_data, block_shape, pad_before, pad_after + ) + + x = relay.var("x", relay.TensorType(dshape, "float32")) + z = relay.nn.space_to_batch_nd(x, block_shape, paddings) + assert "block_shape=" in z.astext() + assert "paddings=" in z.astext() + zz = run_infer_type(z) + assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") + func = relay.Function([x], z) + + for target, ctx in tvm.testing.enabled_targets(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) + + verify_space_to_batch_nd([3, 3, 2, 1], [3], [[0, 0]]) + verify_space_to_batch_nd([2, 2, 4, 1], [2, 2], [[0, 0], [2, 0]]) + + +@tvm.testing.uses_gpu +def test_batch_to_space_nd(): + def verify_batch_to_space_nd(dshape, block_shape, crops): + x_data = np.random.uniform(size=dshape).astype("float32") + crop_begin_list, crop_end_list = map(list, zip(*crops)) + ref_res = tvm.topi.testing.batch_to_space_nd_python( + x_data, block_shape, crop_begin_list, crop_end_list + ) + + x = relay.var("x", relay.TensorType(dshape, "float32")) + z = relay.nn.batch_to_space_nd(x, block_shape, crops) + assert "block_shape=" in z.astext() + assert "crops=" in z.astext() + zz = run_infer_type(z) + assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") + func = relay.Function([x], z) + + for target, ctx in tvm.testing.enabled_targets(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) + + 
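+    # e.g. the first case below: a [4, 1, 1, 3] input with block_shape [2, 2] and no crops folds the batch back into a [1, 2, 2, 3] tensor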
verify_batch_to_space_nd([4, 1, 1, 3], [2, 2], [[0, 0], [0, 0]]) + verify_batch_to_space_nd([8, 1, 3, 1], [2, 2], [[0, 0], [2, 0]]) + + if __name__ == "__main__": test_resize_infer_type() test_resize() @@ -1163,3 +1217,5 @@ def verify_grid_sample(data_shape, grid_shape): test_dilation2d_run() test_affine_grid() test_grid_sample() + test_space_to_batch_nd() + test_batch_to_space_nd() diff --git a/tests/python/topi/python/test_topi_batch_to_space_nd.py b/tests/python/topi/python/test_topi_batch_to_space_nd.py new file mode 100644 index 000000000000..89d044fed963 --- /dev/null +++ b/tests/python/topi/python/test_topi_batch_to_space_nd.py @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test code for batch to space""" +import numpy as np +import tvm +from tvm import te +from tvm import topi +import tvm.testing +import tvm.topi.testing + + +def verify_batch_to_space_nd(input_shape, block_shape, crop_begin_list, crop_end_list): + out_shape = [] + out_shape.append(int((input_shape[0] / np.prod(block_shape)))) + for i in range(1, len(block_shape) + 1): + crop = crop_begin_list[i - 1] + crop_end_list[i - 1] + out_shape.append(input_shape[i] * block_shape[i - 1] - crop) + for i in range(len(block_shape) + 1, len(input_shape)): + out_shape.append(input_shape[i]) + + A = te.placeholder(input_shape, name="A", dtype="float32") + dtype = A.dtype + a_np = np.random.uniform(size=input_shape).astype(dtype) + + B = topi.nn.batch_to_space_nd(A, block_shape, crop_begin_list, crop_end_list) + + b_np = tvm.topi.testing.batch_to_space_nd_python( + a_np, block_shape, crop_begin_list, crop_end_list + ) + + def check_device(device, ctx): + print("Running on target: %s" % device) + with tvm.target.create(device): + s = tvm.topi.testing.get_injective_schedule(device)(B) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) + f = tvm.build(s, [A, B], device) + f(a, b) + tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) + + for device, ctx in tvm.testing.enabled_targets(): + check_device(device, ctx) + + +@tvm.testing.uses_gpu +def test_batch_to_space(): + # Without crops + verify_batch_to_space_nd([4, 1, 1, 1], [2, 2], [0, 0], [0, 0]) + # With crops + verify_batch_to_space_nd([8, 1, 3, 1], [2, 2], [0, 2], [0, 0]) + verify_batch_to_space_nd([18, 2, 1, 2], [2, 3], [1, 1], [0, 0]) + verify_batch_to_space_nd([20, 5, 8, 7], [2, 2], [1, 1], [1, 1]) + + +if __name__ == "__main__": + test_batch_to_space() diff --git a/tests/python/topi/python/test_topi_space_to_batch_nd.py b/tests/python/topi/python/test_topi_space_to_batch_nd.py new file mode 100644 index 000000000000..6f969f391002 --- /dev/null +++ b/tests/python/topi/python/test_topi_space_to_batch_nd.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation 
(ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test code for space to batch""" +import numpy as np +import tvm +from tvm import te +from tvm import topi +import tvm.testing +import tvm.topi.testing + + +def verify_space_to_batch_nd(input_shape, block_shape, pad_before, pad_after, pad_value=0): + out_shape = [] + out_shape.append(int((input_shape[0] * np.prod(block_shape)))) + for i in range(1, len(block_shape) + 1): + pad = pad_before[i - 1] + pad_after[i - 1] + out_shape.append(int((input_shape[i] + pad) // block_shape[i - 1])) + for i in range(len(block_shape) + 1, len(input_shape)): + out_shape.append(input_shape[i]) + + A = te.placeholder(input_shape, name="A", dtype="float32") + dtype = A.dtype + a_np = np.random.uniform(size=input_shape).astype(dtype) + + B = topi.nn.space_to_batch_nd(A, block_shape, pad_before, pad_after, pad_value) + + b_np = tvm.topi.testing.space_to_batch_nd_python( + a_np, block_shape, pad_before, pad_after, pad_value + ) + + def check_device(device, ctx): + print("Running on target: %s" % device) + with tvm.target.create(device): + s = tvm.topi.testing.get_injective_schedule(device)(B) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) + f = tvm.build(s, [A, B], device) + f(a, b) + tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) + + for device, ctx in tvm.testing.enabled_targets(): + check_device(device, ctx) + + +@tvm.testing.uses_gpu +def test_space_to_batch(): + # Without paddings + verify_space_to_batch_nd([3, 3, 2, 1], [3], [0], [0]) + # With paddings + verify_space_to_batch_nd([3, 3, 2, 1], [3], [1], [2]) + # Multiple spatial dims + verify_space_to_batch_nd([3, 3, 4, 5, 2], [3, 4, 2], [1, 0, 3], [2, 0, 0]) + # No remaining dims + verify_space_to_batch_nd([3, 3, 4, 5, 2], [3, 4, 2, 2], [1, 4, 0, 0], [2, 0, 1, 0]) + + +if __name__ == "__main__": + test_space_to_batch() From 9978f7f226908675e48dc3246fcb596da6080e51 Mon Sep 17 00:00:00 2001 From: Bernhard Klein <62101642+bernhardklein@users.noreply.github.com> Date: Tue, 17 Nov 2020 17:56:48 +0100 Subject: [PATCH 181/258] [AutoTVM][RPCRunner] timeout is not passed correctly (#6924) * [AutoTVM][RPCRunner] timeout is not passed correctly * like @merrymercy suggests, scale timeout with (n_parallel + 1) * Apply suggestions from code review * Apply suggestions from code review Co-authored-by: Lianmin Zheng --- python/tvm/autotvm/measure/measure_methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 913d62b94427..4d6c5daad378 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -168,7 +168,7 @@ class RPCRunner(Runner): Parameters ---------- timeout: float - The timeout of a 
compilation + The timeout of a RPCRunner measurement task n_parallel: int The number of tasks run in parallel. "None" will use all cpu cores key: str @@ -240,7 +240,7 @@ def __init__( self.check_correctness = check_correctness self.cooldown_interval = cooldown_interval - self.executor = LocalExecutor() + self.executor = LocalExecutor(timeout=timeout * (self.n_parallel + 1)) def set_task(self, task): self.task = task From 25c71abd812259349d6a48b221b4fca842ef11aa Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Wed, 18 Nov 2020 03:12:39 +0900 Subject: [PATCH 182/258] [CI] Install libc6-dev-i386 to compile wasm32 (#6886) * [CI] Pin wasmtime version to 0.16.0 * Keep the wasmtime version to the latest --- docker/install/ubuntu_install_rust.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh index 310e6507e3f3..6b5b4379cc9c 100755 --- a/docker/install/ubuntu_install_rust.sh +++ b/docker/install/ubuntu_install_rust.sh @@ -30,6 +30,7 @@ curl -s -S -L https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default rustup component add rustfmt # install wasmtime +apt-get install -y --no-install-recommends libc6-dev-i386 export WASMTIME_HOME=/opt/wasmtime curl https://wasmtime.dev/install.sh -sSf | bash export PATH="${WASMTIME_HOME}/bin:${PATH}" From b12a9db981ba58f1c409d682becd39b81577eb6d Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Tue, 17 Nov 2020 15:46:31 -0800 Subject: [PATCH 183/258] [TensorFlow] Support NonMaxSuppressionV5 (#6933) --- python/tvm/relay/frontend/tensorflow.py | 13 +++++++- .../frontend/tensorflow/test_forward.py | 30 +++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index abdc1a0e3114..861a73aa2ad8 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -665,7 +665,7 @@ def _impl(inputs, attr, params, mod): return _impl -def _nms(): +def _nms(return_scores=False): def _impl(inputs, attr, params, mod): # Get parameter values try: @@ -724,6 +724,16 @@ def _impl(inputs, attr, params, mod): ret = get_relay_op("strided_slice")( data_slice, begin=_expr.const([0]), end=size, slice_mode="size" ) + + # NonMaxSuppressionV5 returns scores. pad_output is always False for NMSv5. 
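# A minimal NumPy model of the score gather performed for the V5 variant
# (values are made up for illustration; the converter below uses relay's
# _op.take on inputs[1] rather than NumPy):
#
#     import numpy as np
#
#     scores = np.array([0.9, 0.1, 0.8, 0.4], dtype="float32")
#     kept = np.array([0, 2])  # indices of boxes surviving NMS
#     selected = np.take(scores, kept, axis=0)
#     np.testing.assert_allclose(selected, [0.9, 0.8], rtol=1e-6)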
+ if return_scores: + if "soft_nms_sigma" in attr and attr["soft_nms_sigma"] != 0.0: + raise tvm.error.OpAttributeUnImplemented( + "soft_nms_sigma for NonMaxSuppressionV5 is not supported" + ) + ret_scores = _op.take(inputs[1], ret, axis=0) + return _expr.TupleWrapper(_expr.Tuple([ret, ret_scores, size]), 3) + return ret return _impl @@ -2354,6 +2364,7 @@ def _impl(inputs, attr, params, mod): "NonMaxSuppressionV2": _nms(), "NonMaxSuppressionV3": _nms(), "NonMaxSuppressionV4": _nms(), + "NonMaxSuppressionV5": _nms(True), "NoOp": _no_op(), "NotEqual": _broadcast("not_equal"), "OneHot": _one_hot(), diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 23a4b7abe5ab..fc1b191a1ecd 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -2662,9 +2662,35 @@ def _test_forward_nms_v4( ) +def _test_forward_nms_v5( + bx_shape, score_shape, iou_threshold, score_threshold, out_size, dtype="float32" +): + boxes = np.random.uniform(0, 10, size=bx_shape).astype(dtype) + scores = np.random.uniform(size=score_shape).astype(dtype) + max_output_size = np.int32(out_size) + tf.reset_default_graph() + in_data_1 = tf.placeholder(dtype, boxes.shape, name="in_data_1") + in_data_2 = tf.placeholder(dtype, scores.shape, name="in_data_2") + in_data_3 = tf.placeholder(tf.int32, name="in_data_3") + tf.image.non_max_suppression_with_scores( + boxes=in_data_1, + scores=in_data_2, + max_output_size=in_data_3, + iou_threshold=iou_threshold, + score_threshold=score_threshold, + name="nms", + ) + compare_tf_with_tvm( + [boxes, scores, max_output_size], + ["in_data_1:0", "in_data_2:0", "in_data_3:0"], + ["nms/NonMaxSuppressionV5:0", "nms/NonMaxSuppressionV5:1"], + mode="vm", + ) + + def test_forward_nms(): - """ NonMaxSuppressionV3,4 """ - for _test_forward_nms in [_test_forward_nms_v3]: + """ NonMaxSuppressionV3,5 """ + for _test_forward_nms in [_test_forward_nms_v3, _test_forward_nms_v5]: _test_forward_nms((5, 4), (5,), 0.7, 0.5, 5) _test_forward_nms((20, 4), (20,), 0.5, 0.6, 10) _test_forward_nms((1000, 4), (1000,), 0.3, 0.7, 1000) From 0c5df019453e966ab5cec0479c8125baf04862a3 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Tue, 17 Nov 2020 17:03:37 -0800 Subject: [PATCH 184/258] [DOC] Fix typo (#6920) --- python/tvm/contrib/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/contrib/utils.py b/python/tvm/contrib/utils.py index f3397ce186ba..6451896c6bd1 100644 --- a/python/tvm/contrib/utils.py +++ b/python/tvm/contrib/utils.py @@ -112,7 +112,7 @@ def __init__(self, custom_path=None): self.TEMPDIRS.add(self.temp_dir) def remove(self): - """Remote the tmp dir""" + """Remove the tmp dir""" if self.temp_dir: if not self._created_with_keep_for_debug: shutil.rmtree(self.temp_dir, ignore_errors=True) From 6aa3d964893747687298501faf19035589b89170 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 17 Nov 2020 23:20:18 -0800 Subject: [PATCH 185/258] [AutoScheduler] Fix task scheduler restoring (#6934) * [AutoScheduler] Fix task scheduler restore * miner fix --- python/tvm/auto_scheduler/task_scheduler.py | 8 ++++++-- .../python/unittest/test_auto_scheduler_task_scheduler.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index c81a4b680b95..884741bd08cc 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ 
-283,8 +283,10 @@ def tune(self, tune_option, search_policy="default"): ) # do a round robin first to warm up - for i in range(len(self.tasks)): - self._tune_task(i) + for idx in range(len(self.tasks)): + # skip warming up this task if it has been tuned before (restored from the log file) + if not self.task_cts[idx]: + self._tune_task(idx) self.best_ct = self.ct self.best_score = self.cur_score @@ -473,4 +475,6 @@ def _restore_status(self, log_file, num_measures_per_round): self.task_cts[i] = int(self.task_cts[i] / num_measures_per_round + 0.5) self.task_costs_history[i].append(self.best_costs[i]) + self.cur_score = self._compute_score(self.best_costs) + logger.info("TaskScheduler: Loaded %d measurement records from %s", total_ct + 1, log_file) diff --git a/tests/python/unittest/test_auto_scheduler_task_scheduler.py b/tests/python/unittest/test_auto_scheduler_task_scheduler.py index b0fb37a830f7..680a783e25f4 100644 --- a/tests/python/unittest/test_auto_scheduler_task_scheduler.py +++ b/tests/python/unittest/test_auto_scheduler_task_scheduler.py @@ -54,7 +54,7 @@ def test_task_scheduler_round_robin(): for task in tasks: counters[task.workload_key] = 0 - for inp, res in auto_scheduler.load_records(log_file): + for inp, _ in auto_scheduler.load_records(log_file): counters[inp.task.workload_key] += 1 for task in tasks: @@ -121,7 +121,7 @@ def objective_func(costs): for task in tasks: counters[task.workload_key] = 0 - for inp, res in auto_scheduler.load_records(log_file): + for inp, _ in auto_scheduler.load_records(log_file): counters[inp.task.workload_key] += 1 assert counters[tasks[0].workload_key] == n_trials - 1 From 7ecb7391da3d565850ad364648f19b9c683a1e55 Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Wed, 18 Nov 2020 08:29:21 -0800 Subject: [PATCH 186/258] [COMMUNITY] New committer -- @mbrookhart (#6936) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 8328e1c625e2..5f01340f095d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -41,6 +41,7 @@ We do encourage everyone to work anything they are interested in. - [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm - [Matthew Barrett](https://github.com/mbaret): @mbaret - byoc, arm +- [Matthew Brookhart](https://github.com/mbrookhart): @mbrookhart - relay, frontends - [Tianqi Chen](https://github.com/tqchen) (PPMC): @tqchen - topi, compiler, relay, docs - [Liangfu Chen](https://github.com/liangfu): @liangfu - vta, chisel, intel FPGA, c runtime - [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm From e21424cfde87bf6b7cb71e718337abcf455c8759 Mon Sep 17 00:00:00 2001 From: Ritwik Das Date: Wed, 18 Nov 2020 09:46:26 -0800 Subject: [PATCH 187/258] Add Handling of Zero Len Arguments (#6923) * Update tensorrt.py * Update tensorrt.py * Update tensorrt.py --- python/tvm/relay/op/contrib/tensorrt.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index 739d49c412e8..44336073d842 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -292,6 +292,10 @@ def add_annotate_fn(expr): # pylint: disable=unused-variable """Check if add is supported by TensorRT.""" args = expr.args + # RelayVM + TRT doesn't support scalar addition yet. 
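# A rank-0 (scalar) tensor type has an empty shape, which is exactly what the
# emptiness test below screens out. A tiny stand-alone model of the check,
# with plain tuples standing in for relay tensor shapes:
#
#     scalar_shape, tensor_shape = (), (2, 2)
#     assert not scalar_shape  # empty shape -> scalar, rejected for TRT
#     assert tensor_shape      # non-empty shape -> eligible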
+ for arg in args: + if not arg.checked_type.shape: + return False if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False From 110f268fefa2baf1b13400cd0c68d27d85e801a1 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 18 Nov 2020 11:13:27 -0800 Subject: [PATCH 188/258] [AutoScheduler] Improve warning messages (#6935) * [AutoScheduler] Improve warning messages * fix lint --- python/tvm/auto_scheduler/dispatcher.py | 7 ++++--- python/tvm/relay/op/strategy/cuda.py | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py index 19bae8622355..b0b98d8d0f56 100644 --- a/python/tvm/auto_scheduler/dispatcher.py +++ b/python/tvm/auto_scheduler/dispatcher.py @@ -258,10 +258,11 @@ def query(self, target, workload_key, has_complex_op, dag): if self.verbose == 2 or (has_complex_op and self.verbose == 1): msg = ( - "Cannot find tuned schedules for target=%s, workload_key=%s, compute:\n%s" + "-----------------------------------\n" + "Cannot find tuned schedules for target=%s, workload_key=%s. " "A fallback TOPI schedule is used, " - "which may bring great performance regression or even compilation failure." - % (target, workload_key, dag) + "which may bring great performance regression or even compilation failure. " + "Compute DAG info:\n%s" % (target, workload_key, dag) ) if msg not in self.messages: self.messages.add(msg) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 105f50116c3e..ceaf9ddb84b0 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -101,6 +101,18 @@ def schedule_lrn_cuda(attrs, outs, target): return topi.cuda.schedule_lrn(outs) +def naive_schedule(_, outs, target): + """Return the naive default schedule""" + if "gpu" in target.keys: + # For GPU, we at least need thread binding to make a valid schedule. + # So the naive schedule cannot be compiled. + raise RuntimeError( + "Cannot compile for GPU targets if no tuned schedule is found." + "Please see the warning messages above for more information about the failed workloads." 
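# A stand-alone sketch of why this guard exists: a default TE schedule builds
# for CPU as-is, while CUDA additionally requires thread bindings (all names
# below are local to this sketch, not part of the patch):
#
#     import tvm
#     from tvm import te
#
#     A = te.placeholder((1024,), name="A")
#     B = te.compute((1024,), lambda i: A[i] + 1.0, name="B")
#     s = te.create_schedule(B.op)
#     tvm.build(s, [A, B], target="llvm")  # fine on CPU with no tuning
#     s[B].bind(s[B].op.axis[0], te.thread_axis("threadIdx.x"))
#     tvm.build(s, [A, B], target="cuda")  # only valid after the bind above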
+ ) + return tvm.te.create_schedule(outs[-1].op) + + @conv2d_strategy.register(["cuda", "gpu"]) def conv2d_strategy_cuda(attrs, inputs, out_type, target): """conv2d cuda strategy""" @@ -224,7 +236,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): if use_auto_scheduler and judge_winograd_auto_scheduler: strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), - wrap_topi_schedule(tvm.te.create_schedule), + naive_schedule, # this implementation should never be picked by autotvm name="conv2d_nhwc.winograd", plevel=15, ) @@ -451,7 +463,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty if PassContext.current().config.get("relay.backend.use_auto_scheduler", False): strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), - wrap_topi_schedule(tvm.te.create_schedule), + naive_schedule, # this implementation should never be picked by autotvm name="conv2d_nhwc_winograd_without_weight_transform", plevel=15, ) From 792e4f2cf44ef3b4486be2a055183e29378ef62b Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Thu, 19 Nov 2020 08:19:15 -0800 Subject: [PATCH 189/258] explicitly use new to avoid exit-time destruction of global state for VM (#6938) --- src/runtime/vm/memory_manager.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/runtime/vm/memory_manager.cc b/src/runtime/vm/memory_manager.cc index 4e480507e71a..960b2e20145a 100644 --- a/src/runtime/vm/memory_manager.cc +++ b/src/runtime/vm/memory_manager.cc @@ -110,8 +110,10 @@ NDArray StorageObj::AllocNDArray(size_t offset, std::vector shape, DLDa } MemoryManager* MemoryManager::Global() { - static MemoryManager memory_manager; - return &memory_manager; + // NOTE: explicitly use new to avoid exit-time destruction of global state + // Global state will be recycled by OS as the process exits. + static auto* inst = new MemoryManager(); + return inst; } Allocator* MemoryManager::GetOrCreateAllocator(TVMContext ctx, AllocatorType type) { From 34d921ce0beafea87798f4fb3d616f1a5d4802ae Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 19 Nov 2020 08:34:20 -0800 Subject: [PATCH 190/258] Lazy import XGBoost (#6939) --- .../tvm/auto_scheduler/cost_model/xgb_model.py | 16 ++++++++-------- python/tvm/autotvm/tuner/xgboost_cost_model.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py index ef5472d6b77e..d6503918ee5e 100644 --- a/python/tvm/auto_scheduler/cost_model/xgb_model.py +++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py @@ -23,16 +23,13 @@ import numpy as np -try: - import xgboost as xgb -except ImportError: - xgb = None - from tvm.autotvm.tuner.metric import max_curve from .cost_model import PythonBasedModel from ..feature import get_per_store_features_from_measure_pairs, get_per_store_features_from_states from ..measure_record import RecordReader +xgb = None + logger = logging.getLogger("auto_scheduler") @@ -92,9 +89,12 @@ class XGBModel(PythonBasedModel): """ def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None): - - if xgb is None: - raise ImportError( + global xgb + try: + if xgb is None: + xgb = __import__("xgboost") + except ImportError: + print( "XGBoost is required for XGBModel. " "Please install its python package first. 
" "Help: (https://xgboost.readthedocs.io/en/latest/) " diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index 14bc683c10b1..a144617596b5 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -23,16 +23,13 @@ import numpy as np -try: - import xgboost as xgb -except ImportError: - xgb = None - from .. import feature from ..utils import get_rank from .metric import max_curve, recall_curve, cover_curve from .model_based_tuner import CostModel, FeatureCache +xgb = None + logger = logging.getLogger("autotvm") @@ -75,10 +72,13 @@ class XGBoostCostModel(CostModel): def __init__( self, task, feature_type, loss_type, num_threads=None, log_interval=25, upper_model=None ): + global xgb super(XGBoostCostModel, self).__init__() - - if xgb is None: - raise RuntimeError( + try: + if xgb is None: + xgb = __import__("xgboost") + except ImportError: + print( "XGBoost is required for XGBoostCostModel. " "Please install its python package first. " "Help: (https://xgboost.readthedocs.io/en/latest/) " From 8e1deb0869fcc7598fe39432bc542487ed737150 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 19 Nov 2020 11:32:32 -0800 Subject: [PATCH 191/258] =?UTF-8?q?[=C2=B5TVM]=20Fix=20problems=20with=20t?= =?UTF-8?q?he=20debug=20flow=20(#6930)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Allow blocking read and write in micro transport, for debugging. * add support for None timeout to micro transport, add tests * fix GdbTransport and friends. * GDB itself was just busted (would not launch inferior properly) * GdbDebugger would kill the debugger without waiting for user input. change to always wait for an explicit user quit. * immediately resurrect Ctrl+C handler when debugger dies. * remove on-terminate callback complexity, unnecessary --- python/tvm/micro/debugger.py | 194 +++++++++++------- python/tvm/micro/session.py | 4 +- python/tvm/micro/transport/__init__.py | 1 + python/tvm/micro/transport/base.py | 55 ++--- python/tvm/micro/transport/debug.py | 1 - python/tvm/micro/transport/file_descriptor.py | 9 + python/tvm/micro/transport/serial.py | 4 + python/tvm/micro/transport/wakeup.py | 9 +- src/runtime/micro/micro_session.cc | 169 ++++++++++----- tests/python/unittest/test_micro_transport.py | 187 +++++++++++++++++ 10 files changed, 481 insertions(+), 152 deletions(-) create mode 100644 tests/python/unittest/test_micro_transport.py diff --git a/python/tvm/micro/debugger.py b/python/tvm/micro/debugger.py index b76d46a04db6..8119940a018c 100644 --- a/python/tvm/micro/debugger.py +++ b/python/tvm/micro/debugger.py @@ -19,19 +19,23 @@ import atexit import abc +import errno import logging import os +import shlex import signal import subprocess import sys import termios import threading +import time import psutil from .._ffi import register_func from . import class_factory from . import transport +from .transport.file_descriptor import FdTransport _LOG = logging.getLogger(__name__) @@ -40,9 +44,6 @@ class Debugger(metaclass=abc.ABCMeta): """An interface for controlling micro TVM debuggers.""" - def __init__(self): - self.on_terminate_callbacks = [] - @abc.abstractmethod def start(self): """Start the debugger, but do not block on it. 
@@ -56,13 +57,6 @@ def stop(self): """Terminate the debugger.""" raise NotImplementedError() - def _run_on_terminate_callbacks(self): - for callback in self.on_terminate_callbacks: - try: - callback() - except Exception: # pylint: disable=broad-except - _LOG.warning("on_terminate_callback raised exception", exc_info=True) - class GdbDebugger(Debugger): """Handles launching, suspending signals, and potentially dealing with terminal issues.""" @@ -82,73 +76,90 @@ def _stop_all(cls): def __init__(self): super(GdbDebugger, self).__init__() self._is_running = False - self._child_alive_lock = threading.RLock() - self._is_child_alive = False + self._is_running_lock = threading.RLock() + self._child_exited_event = threading.Event() + self._signals_reset_event = threading.Event() @abc.abstractmethod def popen_kwargs(self): raise NotImplementedError() - def _wait_for_child(self): - self.popen.wait() - with self._child_alive_lock: - self._is_child_alive = True - - @classmethod - def _sigint_handler(cls, signum, stack_frame): # pylint: disable=unused-argument - if cls._STARTED_INSTANCE is not None: - with cls._STARTED_INSTANCE._child_alive_lock: - exists = cls._STARTED_INSTANCE._is_child_alive - if exists: - try: - os.killpg(cls._STARTED_INSTANCE.child_pgid, signal.SIGINT) - return - except ProcessLookupError: - pass - - raise Exception() - - def start(self): - assert not self._is_running - assert not self._STARTED_INSTANCE - - kwargs = self.popen_kwargs() - self.did_start_new_session = kwargs.setdefault("start_new_session", True) - - self.old_termios = termios.tcgetattr(sys.stdin.fileno()) - self.popen = subprocess.Popen(**kwargs) - self._is_running = True - self.__class__._STARTED_INSTANCE = self - try: - self.child_pgid = os.getpgid(self.popen.pid) - except Exception: - self.stop() - raise - with self._child_alive_lock: - self._is_child_alive = True - self.old_sigint_handler = signal.signal(signal.SIGINT, self._sigint_handler) - t = threading.Thread(target=self._wait_for_child) - t.daemon = True - t.start() - - def stop(self): + def _internal_stop(self): if not self._is_running: return - signal.signal(signal.SIGINT, self.old_sigint_handler) + os.kill(os.getpid(), signal.SIGUSR1) + self._signals_reset_event.wait() termios.tcsetattr(sys.stdin.fileno(), termios.TCSAFLUSH, self.old_termios) try: children = psutil.Process(self.popen.pid).children(recursive=True) for c in children: c.terminate() - _, alive = psutil.wait_procs(children, timeout=self._GRACEFUL_SHUTDOWN_TIMEOUT_SEC) - for a in alive: - a.kill() + _, alive = psutil.wait_procs(children, timeout=self._GRACEFUL_SHUTDOWN_TIMEOUT_SEC) + for a in alive: + a.kill() + except psutil.NoSuchProcess: + pass finally: self.__class__._STARTED_INSTANCE = None self._is_running = False - self._run_on_terminate_callbacks() + self._child_exited_event.set() + + def _wait_for_child(self): + self.popen.wait() + with self._is_running_lock: + self._internal_stop() + + @classmethod + def _sigusr1_handler(cls, signum, stack_frame): # pylint: disable=unused-argument + assert ( + cls._STARTED_INSTANCE is not None + ), "overridden sigusr1 handler should not be invoked when GDB not started" + signal.signal(signal.SIGINT, cls._STARTED_INSTANCE.old_sigint_handler) + signal.signal(signal.SIGUSR1, cls._STARTED_INSTANCE.old_sigusr1_handler) + cls._STARTED_INSTANCE._signals_reset_event.set() + + @classmethod + def _sigint_handler(cls, signum, stack_frame): # pylint: disable=unused-argument + assert ( + cls._STARTED_INSTANCE is not None + ), "overridden sigint handler should not 
be invoked when GDB not started" + with cls._STARTED_INSTANCE._is_running_lock: + exists = cls._STARTED_INSTANCE._is_running + if exists: + try: + os.killpg(cls._STARTED_INSTANCE.child_pgid, signal.SIGINT) + except ProcessLookupError: + pass + + def start(self): + with self._is_running_lock: + assert not self._is_running + assert not self._STARTED_INSTANCE + + kwargs = self.popen_kwargs() + self.did_start_new_session = kwargs.setdefault("start_new_session", True) + + self.old_termios = termios.tcgetattr(sys.stdin.fileno()) + self.popen = subprocess.Popen(**kwargs) + self._is_running = True + self.old_sigint_handler = signal.signal(signal.SIGINT, self._sigint_handler) + self.old_sigusr1_handler = signal.signal(signal.SIGUSR1, self._sigusr1_handler) + self.__class__._STARTED_INSTANCE = self + try: + self.child_pgid = os.getpgid(self.popen.pid) + except Exception: + self.stop() + raise + with self._is_running_lock: + self._is_child_alive = True + t = threading.Thread(target=self._wait_for_child) + t.daemon = True + t.start() + + def stop(self): + self._child_exited_event.wait() atexit.register(GdbDebugger._stop_all) @@ -189,13 +200,22 @@ def popen_kwargs(self): ["-O", "settings set target.run-args {}".format(" ".join(self.args[1:]))] ) elif sysname == "Linux": - args = ( - ["gdb", "--args"] + self.args + ["/dev/fd/{stdout_write}"] - ) + args = [ + "gdb", + "-ex", + f"file {self.args[0]}", + "-ex", + ( + f"set args {' '.join(shlex.quote(a) for a in self.args[1:])} " + f"/dev/fd/{stdout_write}" + ), + ] else: raise NotImplementedError(f"System {sysname} is not yet supported") - self.fd_transport = fd.FdTransport(stdout_read, stdin_write) + self.fd_transport = FdTransport( + stdout_read, stdin_write, transport.debug_transport_timeouts() + ) self.fd_transport.open() return { @@ -203,18 +223,9 @@ def popen_kwargs(self): "pass_fds": [stdin_read, stdout_write], } - def _wait_for_process_death(self): - self.popen.wait() - self.fd_transport.close() - - def start(self): - to_return = super(GdbTransportDebugger, self).start() - threading.Thread(target=self._wait_for_process_death, daemon=True).start() - return to_return - - def stop(self): + def _internal_stop(self): self.fd_transport.close() - super(GdbTransportDebugger, self).stop() + super(GdbTransportDebugger, self)._internal_stop() class _Transport(transport.Transport): def __init__(self, gdb_transport_debugger): @@ -227,10 +238,38 @@ def open(self): pass # Pipes opened by parent class. def write(self, data, timeout_sec): - return self.gdb_transport_debugger.fd_transport.write(data, timeout_sec) + end_time = time.monotonic() + timeout_sec if timeout_sec is not None else None + while True: + try: + return self.gdb_transport_debugger.fd_transport.write(data, timeout_sec) + except OSError as exc: + # NOTE: this error sometimes happens when writes are initiated before the child + # process launches. + if exc.errno == errno.EAGAIN: + if end_time is None or time.monotonic() < end_time: + time.sleep(0.1) # sleep to avoid excessive CPU usage + continue + + raise exc + + raise base.IoTimeoutError() def read(self, n, timeout_sec): - return self.gdb_transport_debugger.fd_transport.read(n, timeout_sec) + end_time = time.monotonic() + timeout_sec if timeout_sec is not None else None + while True: + try: + return self.gdb_transport_debugger.fd_transport.read(n, timeout_sec) + except OSError as exc: + # NOTE: this error sometimes happens when reads are initiated before the child + # process launches. 
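# The EAGAIN handling here amounts to "try, then retry until the deadline".
# A compact stand-alone model of that loop (attempt_io is a placeholder for
# the underlying fd read/write; IoTimeoutError as in tvm.micro.transport):
#
#     import errno
#     import time
#
#     def retry_io(attempt_io, timeout_sec):
#         end = None if timeout_sec is None else time.monotonic() + timeout_sec
#         while True:
#             try:
#                 return attempt_io()
#             except OSError as exc:
#                 if exc.errno != errno.EAGAIN:
#                     raise
#                 if end is not None and time.monotonic() >= end:
#                     raise IoTimeoutError()
#                 time.sleep(0.1)  # child process may not have launched yet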
+ if exc.errno == errno.EAGAIN: + if end_time is None or time.monotonic() < end_time: + time.sleep(0.1) # sleep to avoid excessive CPU usage + continue + + raise exc + + raise base.IoTimeoutError() def close(self): pass # Pipes closed by parent class. @@ -343,7 +382,6 @@ def start(self): def stop(self): try: self.stop_debugger() - self._run_on_terminate_callbacks() finally: if self.wrapping_context_manager is not None: self.wrapping_context_manager.__exit__(None, None, None) diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 3f84f3beab5b..88bdf6cd8b5a 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -99,7 +99,7 @@ def get_system_lib(self): def _wrap_transport_read(self, n, timeout_microsec): try: return self.transport.read( - n, float(timeout_microsec) / 1e6 if timeout_microsec is not None else 0 + n, float(timeout_microsec) / 1e6 if timeout_microsec is not None else None ) except IoTimeoutError: return bytes([]) @@ -107,7 +107,7 @@ def _wrap_transport_read(self, n, timeout_microsec): def _wrap_transport_write(self, data, timeout_microsec): try: return self.transport.write( - data, float(timeout_microsec) / 1e6 if timeout_microsec is not None else 0 + data, float(timeout_microsec) / 1e6 if timeout_microsec is not None else None ) except IoTimeoutError: return 0 diff --git a/python/tvm/micro/transport/__init__.py b/python/tvm/micro/transport/__init__.py index 1e1709707568..dffe9ae32792 100644 --- a/python/tvm/micro/transport/__init__.py +++ b/python/tvm/micro/transport/__init__.py @@ -22,5 +22,6 @@ from .base import TransportClosedError from .base import TransportLogger from .base import TransportTimeouts +from .base import debug_transport_timeouts from .debug import DebugWrapperTransport from .subprocess import SubprocessTransport diff --git a/python/tvm/micro/transport/base.py b/python/tvm/micro/transport/base.py index f8951f6226a5..07a6a6ac7fdc 100644 --- a/python/tvm/micro/transport/base.py +++ b/python/tvm/micro/transport/base.py @@ -108,11 +108,12 @@ def read(self, n, timeout_sec): ---------- n : int Maximum number of bytes to read from the transport. - timeout_sec : float + timeout_sec : Union[float, None] Number of seconds to wait for all `n` bytes to be received before timing out. The transport can wait additional time to account for transport latency or bandwidth limitations based on the selected configuration and number of bytes being received. If timeout_sec is 0, read should attempt to service the request in a non-blocking fashion. + If timeout_sec is None, read should block until at least 1 byte of data can be returned. Returns ------- @@ -142,17 +143,19 @@ def write(self, data, timeout_sec): ---------- data : bytes The data to write over the channel. - timeout_sec : float - Number of seconds to wait for all `n` bytes to be received before timing out. The + timeout_sec : Union[float, None] + Number of seconds to wait for at least one byte to be written before timing out. The transport can wait additional time to account for transport latency or bandwidth limitations based on the selected configuration and number of bytes being received. If - timeout_sec is 0, read should attempt to service the request in a non-blocking fashion. + timeout_sec is 0, write should attempt to service the request in a non-blocking fashion. + If timeout_sec is None, write should block until at least 1 byte of data can be + returned. Returns ------- int : The number of bytes written to the underlying channel. 
This can be less than the length - of `data`, but cannot be 0. + of `data`, but cannot be 0 (raise an exception instead). Raises ------ @@ -200,34 +203,35 @@ def timeouts(self): return self.child.timeouts() def open(self): - self.logger.log(self.level, "opening transport") + self.logger.log(self.level, "%s: opening transport", self.name) self.child.open() def close(self): - self.logger.log(self.level, "closing transport") + self.logger.log(self.level, "%s: closing transport", self.name) return self.child.close() def read(self, n, timeout_sec): + timeout_str = f"{timeout_sec:5.2f}s" if timeout_sec is not None else " None " try: data = self.child.read(n, timeout_sec) except IoTimeoutError: self.logger.log( self.level, - "%s read {%5.2fs} %4d B -> [IoTimeoutError %.2f s]", + "%s: read {%s} %4d B -> [IoTimeoutError %s]", self.name, - timeout_sec, + timeout_str, n, - timeout_sec, + timeout_str, ) raise except Exception as err: self.logger.log( self.level, - "%s read {%5.2fs} %4d B -> [err: %s]", + "%s: read {%s} %4d B -> [err: %s]", self.name, - timeout_sec, + timeout_str, n, - str(err), + err.__class__.__name__, exc_info=1, ) raise err @@ -236,9 +240,9 @@ def read(self, n, timeout_sec): if len(hex_lines) > 1: self.logger.log( self.level, - "%s read {%5.2fs} %4d B -> [%3d B]:\n%s", + "%s: read {%s} %4d B -> [%3d B]:\n%s", self.name, - timeout_sec, + timeout_str, n, len(data), "\n".join(hex_lines), @@ -246,9 +250,9 @@ def read(self, n, timeout_sec): else: self.logger.log( self.level, - "%s read {%5.2fs} %4d B -> [%3d B]: %s", + "%s: read {%s} %4d B -> [%3d B]: %s", self.name, - timeout_sec, + timeout_str, n, len(data), hex_lines[0], @@ -257,24 +261,27 @@ def read(self, n, timeout_sec): return data def write(self, data, timeout_sec): + timeout_str = f"{timeout_sec:5.2f}s" if timeout_sec is not None else " None " try: bytes_written = self.child.write(data, timeout_sec) except IoTimeoutError: self.logger.log( self.level, - "%s write <- [%3d B]: [IoTimeoutError %.2f s]", + "%s: write {%s} <- [%3d B]: [IoTimeoutError %s]", self.name, + timeout_str, len(data), - timeout_sec, + timeout_str, ) raise except Exception as err: self.logger.log( self.level, - "%s write <- [%3d B]: [err: %s]", + "%s: write {%s} <- [%3d B]: [err: %s]", self.name, + timeout_str, len(data), - str(err), + err.__class__.__name__, exc_info=1, ) raise err @@ -283,16 +290,18 @@ def write(self, data, timeout_sec): if len(hex_lines) > 1: self.logger.log( self.level, - "%s write <- [%3d B]:\n%s", + "%s: write {%s} <- [%3d B]:\n%s", self.name, + timeout_str, bytes_written, "\n".join(hex_lines), ) else: self.logger.log( self.level, - "%s write <- [%3d B]: %s", + "%s: write {%s} <- [%3d B]: %s", self.name, + timeout_str, bytes_written, hex_lines[0], ) diff --git a/python/tvm/micro/transport/debug.py b/python/tvm/micro/transport/debug.py index e897b3d99df8..71e12c7ed391 100644 --- a/python/tvm/micro/transport/debug.py +++ b/python/tvm/micro/transport/debug.py @@ -31,7 +31,6 @@ def __init__(self, debugger, transport, disable_session_start_retry=False): self.debugger = debugger self.transport = transport self.disable_session_start_retry = disable_session_start_retry - self.debugger.on_terminate_callbacks.append(self.transport.close) def timeouts(self): child_timeouts = self.transport.timeouts() diff --git a/python/tvm/micro/transport/file_descriptor.py b/python/tvm/micro/transport/file_descriptor.py index 3f69c4c26751..6df6cd425eff 100644 --- a/python/tvm/micro/transport/file_descriptor.py +++ b/python/tvm/micro/transport/file_descriptor.py 
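The timeout convention these transports adopt is: None blocks indefinitely, 0
services the request without blocking, and any other value is tracked as a
deadline. A minimal sketch of the deadline bookkeeping, mirroring the
_time_remaining helper added to wakeup.py below:

    import time

    def make_time_remaining(timeout_sec):
        end_time = None if timeout_sec is None else time.monotonic() + timeout_sec

        def _time_remaining():
            if end_time is None:
                return None  # no deadline: callers may block forever
            return max(0, end_time - time.monotonic())

        return _time_remaining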
@@ -62,8 +62,11 @@ def open(self): def close(self): if self.read_fd is not None: os.close(self.read_fd) + self.read_fd = None + if self.write_fd is not None: os.close(self.write_fd) + self.write_fd = None def _await_ready(self, rlist, wlist, timeout_sec=None, end_time=None): if end_time is None: @@ -78,6 +81,9 @@ def _await_ready(self, rlist, wlist, timeout_sec=None, end_time=None): return True def read(self, n, timeout_sec): + if self.read_fd is None: + raise base.TransportClosedError() + end_time = None if timeout_sec is None else time.monotonic() + timeout_sec self._await_ready([self.read_fd], [], end_time=end_time) @@ -90,6 +96,9 @@ def read(self, n, timeout_sec): return to_return def write(self, data, timeout_sec): + if self.write_fd is None: + raise base.TransportClosedError() + end_time = None if timeout_sec is None else time.monotonic() + timeout_sec data_len = len(data) diff --git a/python/tvm/micro/transport/serial.py b/python/tvm/micro/transport/serial.py index 3b36f1e0e83f..6640bb5a8a0c 100644 --- a/python/tvm/micro/transport/serial.py +++ b/python/tvm/micro/transport/serial.py @@ -90,6 +90,10 @@ def close(self): self._port = None def read(self, n, timeout_sec): + if timeout_sec is None: + self._port.timeout = None + return self._port.read(n) + end_time = time.monotonic() + timeout_sec to_return = bytearray() while True: diff --git a/python/tvm/micro/transport/wakeup.py b/python/tvm/micro/transport/wakeup.py index 4e5427939263..418f8bdbb27a 100644 --- a/python/tvm/micro/transport/wakeup.py +++ b/python/tvm/micro/transport/wakeup.py @@ -45,9 +45,14 @@ def timeouts(self): return self.child_transport.timeouts() def _await_wakeup(self, end_time): + def _time_remaining(): + if end_time is None: + return None + return max(0, end_time - time.monotonic()) + if not self.found_wakeup_sequence: while self.wakeup_sequence not in self.wakeup_sequence_buffer: - x = self.child_transport.read(1, max(0, end_time - time.monotonic())) + x = self.child_transport.read(1, _time_remaining()) self.wakeup_sequence_buffer.extend(x) if x[0] in (b"\n", b"\xff"): _LOG.debug("%s", self.wakeup_sequence_buffer[self.line_start_index : -1]) @@ -57,7 +62,7 @@ def _await_wakeup(self, end_time): self.found_wakeup_sequence = True time.sleep(0.2) - return max(0, end_time - time.monotonic()) + return _time_remaining() def read(self, n, timeout_sec): if not self.found_wakeup_sequence: diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 662597086d8a..f054c3afde5c 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -55,8 +55,11 @@ class CallbackWriteStream : public WriteStream { TVMByteArray bytes; bytes.data = (const char*)data; bytes.size = data_size_bytes; - int64_t n = fsend_(bytes, write_timeout_.count()); - return n; + if (write_timeout_ == ::std::chrono::microseconds::zero()) { + return static_cast(fsend_(bytes, nullptr)); + } else { + return static_cast(fsend_(bytes, write_timeout_.count())); + } } void PacketDone(bool is_valid) override {} @@ -76,6 +79,21 @@ class MicroTransportChannel : public RPCChannel { kSessionEstablished = 2, // session is alive. }; + /*! + * \brief Construct a new MicroTransportChannel. + * \param fsend A PackedFunc accepting (data_bytes, timeout_usec) and returning the number of + * bytes sent. If a timeout_usec elapses before all data is sent, it should return 0. + * \param frecv A PackedFunc accepting (num_bytes, timeout_usec) and returning a string containing + * the received data. 
Must not return an empty string, except to indicate a timeout. + * \param session_start_retry_timeout During session initialization, the session start message is + * re-sent after this many microseconds elapse without a reply. If 0, the session start message + * is sent only once. + * \param session_start_timeout Session initialization is considered "timed out" if no reply is + * received this many microseconds after the session start is sent. If 0, a session start never + * times out. + * \param session_established_timeout Timeout used for the Recv() function. This is used for + * messages sent after a session is already established. If 0, Recv() never times out. + */ MicroTransportChannel(PackedFunc fsend, PackedFunc frecv, ::std::chrono::microseconds session_start_retry_timeout, ::std::chrono::microseconds session_start_timeout, @@ -93,41 +111,46 @@ class MicroTransportChannel : public RPCChannel { frecv_{frecv}, message_buffer_{nullptr} {} - bool ReceiveUntil(TypedPackedFunc pf, ::std::chrono::microseconds timeout) { - size_t bytes_received = 0; + private: + static constexpr const size_t kReceiveBufferSizeBytes = 128; + + /* + * \brief Receive data until either pf() returns true or a timeout occurs. + * + * The condition function is called first, so this function may return without performing a read. + * Following this call, received data is consumed and frecv_ is invoked until the timeout occurs + * or the condition function passes. + * + * \param pf A condition function that returns true when enough data has been received for the + * caller to proceed. + * \param timeout Pointer to number of microseconds to wait before timing out. If nullptr, no + * timeout ever occurs in this function, so it may block forever. If 0, a single non-blocking + * read is performed, and any data returned is processed. + * \return true if the condition passed, false if the timeout expired. 
+ */ + bool ReceiveUntil(TypedPackedFunc pf, ::std::chrono::microseconds* timeout) { if (pf()) { return true; } - auto end_time = ::std::chrono::steady_clock::now() + timeout; + auto end_time = ::std::chrono::steady_clock::now(); + if (timeout != nullptr) { + end_time += *timeout; + } for (;;) { - while (pending_chunk_.size() > 0) { - size_t bytes_consumed = 0; - int unframer_error = unframer_.Write((const uint8_t*)pending_chunk_.data(), - pending_chunk_.size(), &bytes_consumed); - - ICHECK(bytes_consumed <= pending_chunk_.size()) - << "consumed " << bytes_consumed << " want <= " << pending_chunk_.size(); - pending_chunk_ = pending_chunk_.substr(bytes_consumed); - bytes_received += bytes_consumed; - if (unframer_error < 0) { - LOG(ERROR) << "unframer got error code: " << unframer_error; - } else { - if (pf()) { - return true; - } - } + if (ConsumeReceivedPayload(pf)) { + return true; } ::std::string chunk; - if (timeout != ::std::chrono::microseconds::zero()) { + if (timeout != nullptr) { ::std::chrono::microseconds iter_timeout{ ::std::max(::std::chrono::microseconds{0}, ::std::chrono::duration_cast<::std::chrono::microseconds>( end_time - ::std::chrono::steady_clock::now()))}; - chunk = frecv_(128, iter_timeout.count()).operator std::string(); + chunk = frecv_(kReceiveBufferSizeBytes, iter_timeout.count()).operator std::string(); } else { - chunk = frecv_(128, nullptr).operator std::string(); + chunk = frecv_(kReceiveBufferSizeBytes, nullptr).operator std::string(); } pending_chunk_ = chunk; if (pending_chunk_.size() == 0) { @@ -137,41 +160,61 @@ class MicroTransportChannel : public RPCChannel { } } - bool StartSession() { - ICHECK(state_ == State::kReset) - << "MicroSession: state_: expected kReset, got " << uint8_t(state_); + bool StartSessionInternal() { + using ::std::chrono::duration_cast; + using ::std::chrono::microseconds; + using ::std::chrono::steady_clock; - ::std::chrono::steady_clock::time_point start_time = ::std::chrono::steady_clock::now(); - auto session_start_end_time = start_time + session_start_timeout_; + steady_clock::time_point start_time = steady_clock::now(); + ICHECK_EQ(kTvmErrorNoError, session_.Initialize()); + ICHECK_EQ(kTvmErrorNoError, session_.StartSession()); - ::std::chrono::steady_clock::time_point end_time; + if (session_start_timeout_ == microseconds::zero() && + session_start_retry_timeout_ == microseconds::zero()) { + ICHECK(ReceiveUntil([this]() -> bool { return session_.IsEstablished(); }, nullptr)) + << "ReceiveUntil indicated timeout expired, but no timeout set!"; + ICHECK(session_.IsEstablished()) << "Session not established, but should be"; + return true; + } + + auto session_start_end_time = start_time + session_start_timeout_; + steady_clock::time_point end_time; if (session_start_retry_timeout_ != ::std::chrono::microseconds::zero()) { end_time = start_time + session_start_retry_timeout_; } else { end_time = session_start_end_time; } + while (!session_.IsEstablished()) { + microseconds time_remaining = + ::std::max(microseconds{0}, duration_cast(end_time - steady_clock::now())); + if (ReceiveUntil([this]() -> bool { return session_.IsEstablished(); }, &time_remaining)) { + break; + } + + if (session_start_timeout_ != microseconds::zero() && end_time >= session_start_end_time) { + return false; + } + end_time += session_start_retry_timeout_; + ICHECK_EQ(kTvmErrorNoError, session_.Initialize()); ICHECK_EQ(kTvmErrorNoError, session_.StartSession()); + } - ::std::chrono::microseconds time_remaining = ::std::max( - 
::std::chrono::microseconds{0}, ::std::chrono::duration_cast<::std::chrono::microseconds>( - end_time - ::std::chrono::steady_clock::now())); + return true; + } - if (!ReceiveUntil([this]() -> bool { return session_.IsEstablished(); }, time_remaining)) { - if (session_start_timeout_ != ::std::chrono::microseconds::zero() && - end_time >= session_start_end_time) { - break; - } - end_time += session_start_retry_timeout_; - } - } + public: + bool StartSession() { + ICHECK(state_ == State::kReset) + << "MicroSession: state_: expected kReset, got " << uint8_t(state_); - if (session_.IsEstablished()) { + bool to_return = StartSessionInternal(); + if (to_return) { write_stream_.SetWriteTimeout(session_established_timeout_); } - return session_.IsEstablished(); + return to_return; } size_t Send(const void* data, size_t size) override { @@ -198,9 +241,12 @@ class MicroTransportChannel : public RPCChannel { } did_receive_message_ = false; - if (!ReceiveUntil([this]() -> bool { return did_receive_message_; }, - session_established_timeout_)) { - if (session_established_timeout_ != ::std::chrono::microseconds::zero()) { + if (session_established_timeout_ == ::std::chrono::microseconds::zero()) { + ICHECK(ReceiveUntil([this]() -> bool { return did_receive_message_; }, nullptr)) + << "ReceiveUntil timeout expired, but no timeout configured!"; + } else { + if (!ReceiveUntil([this]() -> bool { return did_receive_message_; }, + &session_established_timeout_)) { std::stringstream ss; ss << "MicroSessionTimeoutError: failed to read reply message after timeout " << session_established_timeout_.count() / 1e6 << "s"; @@ -223,6 +269,37 @@ class MicroTransportChannel : public RPCChannel { } private: + /*! + * \brief Consume the entire received payload, unless the pf condition is met halfway through. + * + * This function expects pending_chunk_ to contain a chunk of unprocessed packet data. It + * repeatedly writes the chunk to the Unframer until either a) pf() returns True or b) no more + * data remains to be written. + * + * \param pf A PackedFunc which returns true when ReceiveUntil should return. + * \returns true if pf() returned true during processing; false otherwise. + */ + bool ConsumeReceivedPayload(TypedPackedFunc pf) { + while (pending_chunk_.size() > 0) { + size_t bytes_consumed = 0; + int unframer_error = unframer_.Write((const uint8_t*)pending_chunk_.data(), + pending_chunk_.size(), &bytes_consumed); + + ICHECK(bytes_consumed <= pending_chunk_.size()) + << "consumed " << bytes_consumed << " want <= " << pending_chunk_.size(); + pending_chunk_ = pending_chunk_.substr(bytes_consumed); + if (unframer_error < 0) { + LOG(ERROR) << "unframer got error code: " << unframer_error; + } else { + if (pf()) { + return true; + } + } + } + + return false; + } + static void HandleMessageReceivedCb(void* context, MessageType message_type, FrameBuffer* buf) { static_cast(context)->HandleMessageReceived(message_type, buf); } diff --git a/tests/python/unittest/test_micro_transport.py b/tests/python/unittest/test_micro_transport.py new file mode 100644 index 000000000000..b0f99681af2e --- /dev/null +++ b/tests/python/unittest/test_micro_transport.py @@ -0,0 +1,187 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Tests for common micro transports.""" + +import logging +import sys +import unittest + +import pytest + +import tvm.testing + + +@tvm.testing.requires_micro +class TransportLoggerTests(unittest.TestCase): + import tvm.micro + + class TestTransport(tvm.micro.transport.Transport): + def __init__(self): + self.exc = None + self.to_return = None + + def _raise_or_return(self): + if self.exc is not None: + to_raise = self.exc + self.exc = None + raise to_raise + elif self.to_return is not None: + to_return = self.to_return + self.to_return = None + return to_return + else: + assert False, "should not get here" + + def open(self): + pass + + def close(self): + pass + + def timeouts(self): + raise NotImplementedError() + + def read(self, n, timeout_sec): + return self._raise_or_return() + + def write(self, data, timeout_sec): + return self._raise_or_return() + + def test_transport_logger(self): + """Tests the TransportLogger class.""" + + logger = logging.getLogger("transport_logger_test") + with self.assertLogs(logger) as test_log: + transport = self.TestTransport() + transport_logger = tvm.micro.transport.TransportLogger("foo", transport, logger=logger) + + transport_logger.open() + assert test_log.records[-1].getMessage() == "foo: opening transport" + + ########### read() tests ########## + + # Normal log, single-line data returned. + transport.to_return = b"data" + transport_logger.read(23, 3.0) + assert test_log.records[-1].getMessage() == ( + "foo: read { 3.00s} 23 B -> [ 4 B]: 64 61 74 61" + " data" + ) + + # Normal log, multi-line data returned. + transport.to_return = b"data" * 6 + transport_logger.read(23, 3.0) + assert test_log.records[-1].getMessage() == ( + "foo: read { 3.00s} 23 B -> [ 24 B]:\n" + "0000 64 61 74 61 64 61 74 61 64 61 74 61 64 61 74 61 datadatadatadata\n" + "0010 64 61 74 61 64 61 74 61 datadata" + ) + + # Lack of timeout prints. + transport.to_return = b"data" + transport_logger.read(15, None) + assert test_log.records[-1].getMessage() == ( + "foo: read { None } 15 B -> [ 4 B]: 64 61 74 61" + " data" + ) + + # IoTimeoutError includes the timeout value. + transport.exc = tvm.micro.transport.IoTimeoutError() + with self.assertRaises(tvm.micro.transport.IoTimeoutError): + transport_logger.read(23, 0.0) + + assert test_log.records[-1].getMessage() == ( + "foo: read { 0.00s} 23 B -> [IoTimeoutError 0.00s]" + ) + + # Other exceptions are logged by name. + transport.exc = tvm.micro.transport.TransportClosedError() + with self.assertRaises(tvm.micro.transport.TransportClosedError): + transport_logger.read(8, 0.0) + + assert test_log.records[-1].getMessage() == ( + "foo: read { 0.00s} 8 B -> [err: TransportClosedError]" + ) + + # KeyboardInterrupt produces no log record. 
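# The expected strings in this test follow a hex-plus-ASCII dump layout. A
# rough model of how one such payload rendering can be produced (spacing is
# approximate, not the logger's exact column format):
#
#     data = b"data"
#     hex_part = " ".join("%02x" % b for b in data)  # -> "64 61 74 61"
#     ascii_part = "".join(chr(b) if 32 <= b < 127 else "." for b in data)
#     line = hex_part + "  " + ascii_part  # -> "64 61 74 61  data"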
+ before_len = len(test_log.records) + transport.exc = KeyboardInterrupt() + with self.assertRaises(KeyboardInterrupt): + transport_logger.read(8, 0.0) + + assert len(test_log.records) == before_len + + ########### write() tests ########## + + # Normal log, single-line data written. + transport.to_return = 3 + transport_logger.write(b"data", 3.0) + assert test_log.records[-1].getMessage() == ( + "foo: write { 3.00s} <- [ 3 B]: 64 61 74 " + " dat" + ) + + # Normal log, multi-line data written. + transport.to_return = 20 + transport_logger.write(b"data" * 6, 3.0) + assert test_log.records[-1].getMessage() == ( + "foo: write { 3.00s} <- [ 20 B]:\n" + "0000 64 61 74 61 64 61 74 61 64 61 74 61 64 61 74 61 datadatadatadata\n" + "0010 64 61 74 61 data" + ) + + # Lack of timeout prints. + transport.to_return = 3 + transport_logger.write(b"data", None) + assert test_log.records[-1].getMessage() == ( + "foo: write { None } <- [ 3 B]: 64 61 74 " + " dat" + ) + + # IoTimeoutError includes the timeout value. + transport.exc = tvm.micro.transport.IoTimeoutError() + with self.assertRaises(tvm.micro.transport.IoTimeoutError): + transport_logger.write(b"data", 0.0) + + assert test_log.records[-1].getMessage() == ( + "foo: write { 0.00s} <- [ 4 B]: [IoTimeoutError 0.00s]" + ) + + # Other exceptions are logged by name. + transport.exc = tvm.micro.transport.TransportClosedError() + with self.assertRaises(tvm.micro.transport.TransportClosedError): + transport_logger.write(b"data", 0.0) + + assert test_log.records[-1].getMessage() == ( + "foo: write { 0.00s} <- [ 4 B]: [err: TransportClosedError]" + ) + + # KeyboardInterrupt produces no log record. + before_len = len(test_log.records) + transport.exc = KeyboardInterrupt() + with self.assertRaises(KeyboardInterrupt): + transport_logger.write(b"data", 0.0) + + assert len(test_log.records) == before_len + + transport_logger.close() + assert test_log.records[-1].getMessage() == "foo: closing transport" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) From 4e238e847564a3867a3b62cf8df23e54fe1d4831 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 19 Nov 2020 13:49:44 -0800 Subject: [PATCH 192/258] fix tvm.relay.build() docs (#6940) --- python/tvm/relay/build_module.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index cba97c43b25a..5dc6f81b97a2 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -201,16 +201,16 @@ def get_params(self): def build(mod, target=None, target_host=None, params=None, mod_name="default"): - """Helper function that builds a Relay function to run on TVM graph - runtime. + # fmt: off + # pylint: disable=line-too-long + """Helper function that builds a Relay function to run on TVM graph runtime. Parameters ---------- mod : :py:class:`~tvm.IRModule` The IR module to build. Using relay.Function is deprecated. - target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context - name) to str/tvm.target.Target, optional + target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context name) to str/tvm.target.Target, optional For heterogeneous compilation, it is a dictionary indicating context to target mapping. For homogeneous compilation, it is a build target. @@ -241,6 +241,8 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): params : dict The parameters of the final graph. 
""" + # pylint: enable=line-too-long + # fmt: on if not isinstance(mod, (IRModule, _function.Function)): raise ValueError("Type of input parameter mod must be tvm.IRModule") From 5ec48f5142d7ae085c963480de75a671b13a1e8a Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 20 Nov 2020 01:31:52 +0000 Subject: [PATCH 193/258] Bug-fix] Fix tir allocation with multiple lanes (#6941) * Bug-fix] Fix tir allocation with multiple lanes This PR stemmed from https://github.com/apache/incubator-tvm/pull/6907 and it is fixing a small error in the getter and setter of a buffer for the case where `t.lanes > 1`. I also added a test to stress the issue. * Address dtyped vs non-dtyped constant cases --- python/tvm/tir/ir_builder.py | 6 ++++-- tests/python/unittest/test_tir_transform_narrow_datatype.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/tvm/tir/ir_builder.py b/python/tvm/tir/ir_builder.py index 77fe79b327b6..75c5c2921ff4 100644 --- a/python/tvm/tir/ir_builder.py +++ b/python/tvm/tir/ir_builder.py @@ -103,7 +103,8 @@ def __getitem__(self, index): index = self._linear_index(index) if t.lanes > 1: base = index * t.lanes - index = _expr.Ramp(base, const(1, base.dtype), t.lanes) + stride = 1 if (not hasattr(base, "dtype")) else const(1, base.dtype) + index = _expr.Ramp(base, stride, t.lanes) return _expr.Load(self._content_type, self._buffer_var, index) def __setitem__(self, index, value): @@ -116,7 +117,8 @@ def __setitem__(self, index, value): t = DataType(self._content_type) if t.lanes > 1: base = index * t.lanes - index = _expr.Ramp(base, const(1, base.dtype), t.lanes) + stride = 1 if (not hasattr(base, "dtype")) else const(1, base.dtype) + index = _expr.Ramp(base, stride, t.lanes) self._builder.emit(_stmt.Store(self._buffer_var, value, index)) diff --git a/tests/python/unittest/test_tir_transform_narrow_datatype.py b/tests/python/unittest/test_tir_transform_narrow_datatype.py index b1a9eae7893a..cb8968cfc880 100644 --- a/tests/python/unittest/test_tir_transform_narrow_datatype.py +++ b/tests/python/unittest/test_tir_transform_narrow_datatype.py @@ -126,9 +126,10 @@ def check(m, lanes, target_bits, target_dtype): B = ib.buffer_ptr(Bb) with ib.for_range(0, m, name="i", dtype=m.dtype) as i: B[i] = A[i] + 1 + A[0] = B[1] stmt = ib.get() stmt = lower_stmt([Ab, Bb], stmt, target_bits) - assert stmt.loop_var.dtype == target_dtype + assert stmt.seq[0].loop_var.dtype == target_dtype # i32 -> i32 check(const(2 ** 10, dtype="int32"), 2, target_bits=32, target_dtype="int32") From 1e7c114bcd7cbeb773309e82c42ccbe34bf9a399 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 20 Nov 2020 14:49:39 -0800 Subject: [PATCH 194/258] [AutoScheduler] Register workload when deserializing tasks (#6927) * [AutoScheduler] Register workload when deserializing tasks * fix name * format * merge * fix test * more checks --- python/tvm/auto_scheduler/compute_dag.py | 2 +- .../tvm/auto_scheduler/relay_integration.py | 12 ++++++-- python/tvm/auto_scheduler/search_task.py | 16 +++++++++++ .../tvm/auto_scheduler/workload_registry.py | 28 ++++++++----------- .../test_auto_scheduler_compute_dag.py | 5 +++- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py index 93467e27d0e7..3427709d819a 100755 --- a/python/tvm/auto_scheduler/compute_dag.py +++ b/python/tvm/auto_scheduler/compute_dag.py @@ -61,7 +61,7 @@ def __init__(self, compute_or_sche): if isinstance(compute_or_sche, str): compute = 
workload_key_to_tensors(compute_or_sche) sche = None - elif isinstance(compute_or_sche, list): + elif isinstance(compute_or_sche, (list, tvm.ir.container.Array)): for item in compute_or_sche: if not isinstance(item, tvm.te.Tensor): raise ValueError( diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 283d8bf7db45..6864bcce66e3 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -22,6 +22,7 @@ 2. Provide auto-scheduling for all TOPI compute functions """ +import logging import threading import tvm @@ -32,6 +33,8 @@ from .search_task import SearchTask from .workload_registry import register_workload_tensors +logger = logging.getLogger("auto_scheduler") + def call_all_topi_funcs(mod, params, target): """Call all TOPI compute to extract auto_scheduler tasks in a Relay program""" @@ -218,16 +221,19 @@ def auto_schedule_topi(outs, has_complex_op): from tvm import relay io_tensors, has_layout_free = traverse_to_get_io_tensors(outs) - key = register_workload_tensors(io_tensors) - if key is None: # skip this compute if failed to register the workload + try: + dag = ComputeDAG(io_tensors) + except tvm.error.TVMError as err: + logger.info("Failed to create a ComputeDAG for auto_scheduler: %s", str(err)) return None + key = register_workload_tensors(dag.hash_key(), io_tensors) + # only enable layout rewrite for cpu backend enable_layout_rewrite = "cpu" in tvm.target.Target.current().keys env = TracingEnvironment.current if env is None: # in the final build mode - dag = ComputeDAG(io_tensors) state = DispatchContext.current.query(tvm.target.Target.current(), key, has_complex_op, dag) if state is None: return None diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py index 7c5021b3f9b7..f2dadccbf891 100644 --- a/python/tvm/auto_scheduler/search_task.py +++ b/python/tvm/auto_scheduler/search_task.py @@ -17,10 +17,13 @@ """ The definiton of SearchTask """ +import json + import tvm._ffi from tvm.runtime import Object from . import _ffi_api +from .workload_registry import register_workload_tensors @tvm._ffi.register_object("auto_scheduler.SearchTask") @@ -63,6 +66,19 @@ def __getstate__(self): def __setstate__(self, state): self.dag = state["dag"] self.workload_key = state["workload_key"] + + # Register the workload if needed + try: + workload = json.loads(self.workload_key) + except Exception: # pylint: disable=broad-except + raise RuntimeError("Invalid workload key %s" % self.workload_key) + + # The workload from a compute DAG does not have arguments and is not registered + # by default so we register it here. If the workload has already been registered, + # the later registration overrides the prvious one. + if len(workload) == 1: + register_workload_tensors(workload[0], self.dag.tensors) + self.target = state["target"] self.target_host = state["target_host"] self.hardware_params = state["hardware_params"] diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py index 6a4809b1796c..9a7c15c877aa 100644 --- a/python/tvm/auto_scheduler/workload_registry.py +++ b/python/tvm/auto_scheduler/workload_registry.py @@ -64,7 +64,7 @@ def register_workload(func_name, f=None, override=False): f : Optional[Function] The generation function to be registered. override : boolean = False - Whether override existing entry. + Whether to override existing entry. 
Examples -------- @@ -98,30 +98,26 @@ def register(myf): return register -def register_workload_tensors(tensors): - """Register a workload by provding input/output tensors +def register_workload_tensors(func_name, tensors, override=True): + """Register a workload by provding input/output tensors. Since this function is used + when extracting/deserializing tasks, it expects duplicated registrations by default. Parameters ---------- + func_name: str + The function name or the hash key of the compute DAG. tensors: List[Tensor] The input/output tensors of a compute DAG + override : boolean = True + Whether to override existing entry. Returns ------- - key: Optional[str] - The workload key, or None if failed to create a compute DAG. + key: str + The serialized JSON string as the workload key. """ - # pylint: disable=import-outside-toplevel - from .compute_dag import ComputeDAG - - try: - key = ComputeDAG(tensors).hash_key() - except tvm.error.TVMError as err: - logger.info("Failed to create a ComputeDAG for auto_scheduler: %s", str(err)) - return None - - WORKLOAD_FUNC_REGISTRY[key] = tensors - return json.dumps((key,)) + register_workload(func_name, override=override)(tensors) + return json.dumps((func_name,)) def make_workload_key(func, args): diff --git a/tests/python/unittest/test_auto_scheduler_compute_dag.py b/tests/python/unittest/test_auto_scheduler_compute_dag.py index e7774753796c..caf3c9d888b6 100644 --- a/tests/python/unittest/test_auto_scheduler_compute_dag.py +++ b/tests/python/unittest/test_auto_scheduler_compute_dag.py @@ -16,6 +16,7 @@ # under the License. """Test ComputeDAG (replay, infer bound)""" +import json import pickle import tvm @@ -120,11 +121,13 @@ def test_stage_order(): # Serialize and deserialize the search task. task = auto_scheduler.SearchTask( dag, - "test1", + json.dumps(("test-key",)), tvm.target.Target("llvm"), hardware_params=auto_scheduler.HardwareParams(100000, 16, 64), ) + task2 = pickle.loads(pickle.dumps(task)) + assert "test-key" in auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY assert str(task.dag.get_init_state()) == str(task2.dag.get_init_state()) assert len(task.dag.get_init_state().stage_ops) == len(task2.dag.get_init_state().stage_ops) assert task.workload_key == task2.workload_key From 2e59f2441f9c0fcf73576067bd228106a48949fb Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 20 Nov 2020 20:12:04 -0500 Subject: [PATCH 195/258] [DOCS] Improve windows build instruction via conda (#6944) --- cmake/config.cmake | 9 +++--- conda/build-environment.yaml | 4 ++- docs/install/from_source.rst | 57 ++++++++++++++++++++++++++++-------- 3 files changed, 52 insertions(+), 18 deletions(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index 6a3ace2c9283..8ed06b26de5e 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -113,15 +113,16 @@ set(USE_MICRO_STANDALONE_RUNTIME OFF) # # Possible values: # - ON: enable llvm with cmake's find search -# - OFF: disable llvm +# - OFF: disable llvm, note this will disable CPU codegen +# which is needed for most cases # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. 
-set(USE_LLVM OFF) +set(USE_LLVM ON) #--------------------------------------------- # Contrib libraries #--------------------------------------------- # Whether to build with BYODT software emulated posit custom datatype -# +# # Possible values: # - ON: enable BYODT posit, requires setting UNIVERSAL_PATH # - OFF: disable BYODT posit @@ -234,7 +235,7 @@ set(USE_TENSORRT_RUNTIME OFF) # Whether use VITIS-AI codegen set(USE_VITIS_AI OFF) - + # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) diff --git a/conda/build-environment.yaml b/conda/build-environment.yaml index 600933fc18b3..31b39bfafcd0 100644 --- a/conda/build-environment.yaml +++ b/conda/build-environment.yaml @@ -15,13 +15,15 @@ # specific language governing permissions and limitations # under the License. -# Build environment. +# Build environment that can be used to build tvm. name: tvm-build +# The conda channels to lookup the dependencies channels: - anaconda - conda-forge +# The packages to install to the environment dependencies: - conda-build - git diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 2bb6e551b1a0..f329e9f7e6b9 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -90,7 +90,7 @@ The configuration of TVM can be modified by `config.cmake`. you want to build for (OpenCL, RCOM, METAL, VULKAN, ...). - To help with debugging, ensure the embedded graph runtime and debugging functions are enabled with ``set(USE_GRAPH_RUNTIME ON)`` and ``set(USE_GRAPH_RUNTIME_DEBUG ON)`` -- TVM optionally depends on LLVM. LLVM is required for CPU codegen that needs LLVM. +- TVM requires LLVM for for CPU codegen. We highly recommend you to build with the LLVM support on. - LLVM 4.0 or higher is needed for build with LLVM. Note that version of LLVM from default apt may lower than 4.0. - Since LLVM takes long time to build from source, you can download pre-built version of LLVM from @@ -102,7 +102,7 @@ The configuration of TVM can be modified by `config.cmake`. - You can also use `LLVM Nightly Ubuntu Build `_ - Note that apt-package append ``llvm-config`` with version number. - For example, set ``set(LLVM_CONFIG llvm-config-4.0)`` if you installed 4.0 package + For example, set ``set(USE_LLVM llvm-config-10)`` if you installed LLVM 10 package - We can then build tvm and related libraries. @@ -122,27 +122,58 @@ The configuration of TVM can be modified by `config.cmake`. If everything goes well, we can go to :ref:`python-package-installation` +.. _build-with-conda: + +Building with a Conda Environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Conda is a very handy way to the necessary obtain dependencies needed for running TVM. +First, follow the `conda's installation guide `_ +to install miniconda or anaconda if you do not yet have conda in your system. Run the following command in a conda environment: + +.. code:: bash + + # Create a conda environment with the dependencies specified by the yaml + conda env create --file conda/build-environment.yaml + # Activate the created environment + conda activate tvm-build + +The above command will install all necessary build dependencies such as cmake and LLVM. You can then run the standard build process in the last section. + +If you want to use the compiled binary outside the conda environment, +you can set LLVM to static linking mode ``set(USE_LLVM "llvm-config --link-static")``. 
+In this way, the resulting library won't depend on the dynamic LLVM libraries in the conda environment. + +The above instructions show how to use conda to provide the necessary build dependencies to build libtvm. +If you are already using conda as your package manager and wish to directly build and install tvm as a conda package, you can follow the instructions below: + +.. code:: bash + + conda build --output-folder=conda/pkg conda/recipe + # Run conda/build_cuda.sh to build with cuda enabled + conda install tvm -c ./conda/pkg + Building on Windows ~~~~~~~~~~~~~~~~~~~ - -TVM support build via MSVC using cmake. The minimum required VS version is **Visual Studio Community 2015 Update 3**. -In order to generate the VS solution file using cmake, make sure you have a recent version of cmake added to your path and then from the TVM directory: +TVM support build via MSVC using cmake. You will need to ontain a visual studio compiler. +The minimum required VS version is **Visual Studio Community 2015 Update 3**. +We recommend following :ref:`build-with-conda` to obtain necessary dependencies and +get an activated tvm-build environment. Then you can run the following command to build .. code:: bash - mkdir build - cd build - cmake -G "Visual Studio 14 2015 Win64" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" .. + mkdir build + cd build + cmake -A x64 -Thost=x64 .. + cd .. -Starting with Visual Studio 2019 the architecture is specified differently so use this command +The above command generates the solution file under the build directory. +You can then run the following command to build .. code:: bash - cmake -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" .. + cmake --build build --config Release -- /m -This will generate the VS project using the MSVC 64 bit generator. -Open the .sln file in the build directory and build with Visual Studio. -In order to build with LLVM in windows, you will need to build LLVM from source. Building ROCm support ~~~~~~~~~~~~~~~~~~~~~ From 5a329852085ce79c8f3364581e174bd21e3e98c0 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 22 Nov 2020 19:56:10 -0800 Subject: [PATCH 196/258] [Relay] Add DefuseOps pass (#6946) Co-authored-by: minminsun Co-authored-by: minminsun --- python/tvm/relay/transform/transform.py | 12 +++ src/relay/transforms/defuse_ops.cc | 88 ++++++++++++++++++++++ tests/python/relay/test_pass_defuse_ops.py | 68 +++++++++++++++++ tests/python/relay/test_pass_fuse_ops.py | 2 - 4 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 src/relay/transforms/defuse_ops.cc create mode 100644 tests/python/relay/test_pass_defuse_ops.py diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index 4907a0bf2bd4..33a46cc6e6af 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -268,6 +268,18 @@ def FuseOps(fuse_opt_level=-1): return _ffi_api.FuseOps(fuse_opt_level) +def DefuseOps(): + """The inverse operation of FuseOps. It transforms a fused program returned by FuseOps into the + program before FuseOps. (i.e., x == DefuseOps(FuseOps(x))) + + Returns + ------- + ret : tvm.transform.Pass + The registered pass for operator defusion. + """ + return _ffi_api.DefuseOps() + + def CombineParallelConv2D(min_num_branches=3): """Combine multiple conv2d operators into one. 
diff --git a/src/relay/transforms/defuse_ops.cc b/src/relay/transforms/defuse_ops.cc
new file mode 100644
index 000000000000..6abf4c31d359
--- /dev/null
+++ b/src/relay/transforms/defuse_ops.cc
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *
+ * \file src/relay/transforms/defuse_ops.cc
+ * \brief This is the inverse operation of the fusion pass. It transforms a fused
+ * program returned by relay::transform::FuseOps into the program before FuseOps.
+ * (i.e., x == DefuseOps(FuseOps(x)))
+ */
+
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/runtime/registry.h>
+
+#include <string>
+#include <unordered_map>
+
+#include "pattern_utils.h"
+
+namespace tvm {
+namespace relay {
+
+class DefuseOpsMutator : public ExprMutator {
+ public:
+  class FuncBodyMutator : public ExprMutator {
+   public:
+    explicit FuncBodyMutator(const Array<Expr>& args) : ExprMutator() { args_ = args; }
+
+    Expr VisitExpr_(const VarNode* n) {
+      const std::string& name = n->name_hint();
+      ICHECK(!name.empty() && (name[0] == 'p'));
+      std::string id_str = name.substr(1);
+      int id = std::stoi(id_str);
+      ICHECK(id >= 0 && size_t(id) < args_.size());
+      return args_[id];
+    }
+
+   private:
+    Array<Expr> args_;
+  };
+
+  Expr VisitExpr_(const CallNode* n) {
+    auto new_n = ExprMutator::VisitExpr_(n);
+
+    if (const auto* call = new_n.as<CallNode>()) {
+      if (const auto* func = call->op.as<FunctionNode>()) {
+        if (func->body->IsInstance<CallNode>()) {
+          return FuncBodyMutator(call->args).Mutate(func->body);
+        }
+      }
+    }
+    return new_n;
+  }
+};
+
+Expr DefuseOps(const Expr& expr) { return DefuseOpsMutator().Mutate(expr); }
+
+namespace transform {
+
+Pass DefuseOps() {
+  runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
+      [=](Function f, IRModule m, PassContext pc) { return Downcast<Function>(DefuseOps(f)); };
+  return CreateFunctionPass(pass_func, 3, "DefuseOps", {"InferType"});
+}
+
+TVM_REGISTER_GLOBAL("relay._transform.DefuseOps").set_body_typed(DefuseOps);
+
+}  // namespace transform
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_pass_defuse_ops.py b/tests/python/relay/test_pass_defuse_ops.py
new file mode 100644
index 000000000000..2312b2d9ec47
--- /dev/null
+++ b/tests/python/relay/test_pass_defuse_ops.py
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +from tvm import relay +from tvm.relay import transform +from tvm.relay.testing import run_opt_pass + + +def test_defuse_simple(): + """Simple testcase.""" + + def before(): + x = relay.var("x", shape=(10, 20)) + y = relay.add(x, relay.const(1, "float32")) + z = relay.exp(y) + w = relay.squeeze(z) + return relay.Function([x], w) + + x = before() + x = run_opt_pass(x, transform.InferType()) + fused = run_opt_pass(x, transform.FuseOps()) + defused = run_opt_pass(fused, transform.DefuseOps()) + + assert tvm.ir.structural_equal(x, defused) + + +def test_inception_like(): + def conv(data): + y = relay.nn.conv2d(data, relay.var("w"), kernel_size=(3, 3), padding=(1, 1), channels=16) + return relay.nn.relu(data=y) + + def inception_like(data): + c0 = conv(data) + c1 = conv(data) + return relay.concatenate((c0, c1), axis=1) + + def before(dshape): + x = relay.var("x", shape=dshape) + in1 = inception_like(x) + in2 = inception_like(in1) + return relay.Function(relay.analysis.free_vars(in2), in2) + + dshape = (1, 16, 64, 64) + x = before(dshape) + x = run_opt_pass(x, transform.InferType()) + fused = run_opt_pass(x, transform.FuseOps()) + defused = run_opt_pass(fused, transform.DefuseOps()) + + assert tvm.ir.structural_equal(x, defused) + + +if __name__ == "__main__": + test_defuse_simple() + test_inception_like() diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index ff282df7c832..a3146de55d5a 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. import tvm -from tvm import te from tvm import relay from tvm.relay import transform from tvm.relay.testing import run_opt_pass @@ -44,7 +43,6 @@ def expected(): return relay.Function([x], y) z = before() - zz = run_opt_pass(z, transform.FuseOps(fuse_opt_level=2)) zz = run_opt_pass(z, transform.FuseOps()) after = run_opt_pass(expected(), transform.InferType()) assert tvm.ir.structural_equal(zz, after) From b302b76144dead7214db8ae8940a8f1b775b641d Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 23 Nov 2020 05:58:53 -0800 Subject: [PATCH 197/258] =?UTF-8?q?[=C2=B5TVM]=20Remove=20binutils=20modul?= =?UTF-8?q?e,=20no=20longer=20needed=20after=20microTVM=20refactor.=20(#69?= =?UTF-8?q?47)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/tvm/contrib/binutils.py | 320 -------------------------- python/tvm/micro/compiler.py | 32 ++- tests/python/contrib/test_binutils.py | 167 -------------- 3 files changed, 27 insertions(+), 492 deletions(-) delete mode 100644 python/tvm/contrib/binutils.py delete mode 100644 tests/python/contrib/test_binutils.py diff --git a/python/tvm/contrib/binutils.py b/python/tvm/contrib/binutils.py deleted file mode 100644 index 646362a5587f..000000000000 --- a/python/tvm/contrib/binutils.py +++ /dev/null @@ -1,320 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Utilities for binary file manipulation""" -import os -import subprocess -import tvm._ffi -from . import utils - -# TODO does this file still belong in `contrib`. is it too µTVM-specific? - -# TODO shouldn't need so many `ALIGN` directives -RELOCATION_LD_SCRIPT_TEMPLATE = """ -/* linker symbol for use in UTVMInit */ -_utvm_stack_pointer_init = 0x{stack_pointer_init:x}; - -SECTIONS -{{ - . = 0x{text_start:x}; - . = ALIGN({word_size}); - .text : - {{ - . = ALIGN({word_size}); - KEEP(*(.text)) - KEEP(*(.text*)) - . = ALIGN({word_size}); - }} - - . = 0x{rodata_start:x}; - . = ALIGN({word_size}); - .rodata : - {{ - . = ALIGN({word_size}); - KEEP(*(.rodata)) - KEEP(*(.rodata*)) - . = ALIGN({word_size}); - }} - - . = 0x{data_start:x}; - . = ALIGN({word_size}); - .data : - {{ - . = ALIGN({word_size}); - KEEP(*(.data)) - KEEP(*(.data*)) - . = ALIGN({word_size}); - }} - - . = 0x{bss_start:x}; - . = ALIGN({word_size}); - .bss : - {{ - . = ALIGN({word_size}); - KEEP(*(.bss)) - KEEP(*(.bss*)) - . = ALIGN({word_size}); - }} -}} -""" - - -def run_cmd(cmd): - """Runs `cmd` in a subprocess and awaits its completion. - - Parameters - ---------- - cmd : List[str] - list of command-line arguments - - Returns - ------- - output : str - resulting stdout capture from the subprocess - """ - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - (output, _) = proc.communicate() - output = output.decode("utf-8") - if proc.returncode != 0: - cmd_str = " ".join(cmd) - msg = f'error while running command "{cmd_str}":\n{output}' - raise RuntimeError(msg) - return output - - -@tvm._ffi.register_func("tvm_callback_get_section_size") -def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): - """Finds size of the section in the binary. - Assumes `size` shell command exists (typically works only on Linux machines) - - Parameters - ---------- - binary_path : str - path of the binary file - - section_name : str - name of section - - toolchain_prefix : str - prefix for binary names in target compiler toolchain - - Returns - ------- - size : integer - size of the section in bytes - """ - if not os.path.isfile(binary_path): - raise RuntimeError('no such file "{}"'.format(binary_path)) - # We use the "-A" flag here to get the ".rodata" section's size, which is - # not included by default. - size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path]) - - # TODO(weberlo): Refactor this method and `*relocate_binary` so they are - # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss". - section_mapping = { - ".text": [".text"], - ".rodata": [".rodata"], - ".data": [".data", ".sdata"], - ".bss": [".bss", ".sbss"], - } - sections_to_sum = section_mapping["." + section_name] - section_size = 0 - # Skip the first two header lines in the `size` output. 
- for line in size_output.split("\n")[2:]: - tokens = list(filter(lambda s: len(s) != 0, line.split(" "))) - if len(tokens) != 3: - continue - entry_name = tokens[0] - entry_size = int(tokens[1]) - for section in sections_to_sum: - if entry_name.startswith(section): - section_size += entry_size - break - - # NOTE: in the past, section_size has been wrong on x86. it may be - # inconsistent. TODO: maybe stop relying on `*size` to give us the size and - # instead read the section with `*objcopy` and count the bytes. - # NOTE(areusch): I think the problem is due to alignment ops in the linker. - # Since this is going away in the impending switch to on-device runtime, - # add a constant to hopefully absorb these relocations. - if section_size > 0: - section_size += 64 - - return section_size - - -@tvm._ffi.register_func("tvm_callback_relocate_binary") -def tvm_callback_relocate_binary( - binary_path, - word_size, - text_start, - rodata_start, - data_start, - bss_start, - stack_end, - toolchain_prefix, -): - """Relocates sections in the binary to new addresses - - Parameters - ---------- - binary_path : str - path of the binary file - - word_size : int - word size on the target machine - - text_start : int - text section address - - rodata_start : int - rodata section address - - data_start : int - data section address - - bss_start : int - bss section address - - stack_end : int - stack section end address - - toolchain_prefix : str - prefix for binary names in target compiler toolchain - - Returns - ------- - rel_bin : bytearray - the relocated binary - """ - assert text_start < rodata_start < data_start < bss_start < stack_end - stack_pointer_init = stack_end - word_size - ld_script_contents = "" - # TODO(weberlo): There should be a better way to configure this for different archs. - # TODO is this line even necessary? 
- if "riscv" in toolchain_prefix: - ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n' - ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format( - word_size=word_size, - text_start=text_start, - rodata_start=rodata_start, - data_start=data_start, - bss_start=bss_start, - stack_pointer_init=stack_pointer_init, - ) - - tmp_dir = utils.tempdir() - rel_obj_path = tmp_dir.relpath("relocated.obj") - rel_ld_script_path = tmp_dir.relpath("relocate.lds") - with open(rel_ld_script_path, "w") as f: - f.write(ld_script_contents) - run_cmd( - ["{}ld".format(toolchain_prefix), binary_path, "-T", rel_ld_script_path, "-o", rel_obj_path] - ) - - with open(rel_obj_path, "rb") as f: - rel_bin = bytearray(f.read()) - - gdb_init_dir = os.environ.get("MICRO_GDB_INIT_DIR") - if gdb_init_dir is not None: - gdb_init_path = f"{gdb_init_dir}/.gdbinit" - with open(gdb_init_path, "r") as f: - gdbinit_contents = f.read().split("\n") - new_contents = [] - for line in gdbinit_contents: - new_contents.append(line) - if line.startswith("target"): - new_contents.append(f"add-symbol-file {rel_obj_path}") - with open(gdb_init_path, "w") as f: - f.write("\n".join(new_contents)) - - return rel_bin - - -@tvm._ffi.register_func("tvm_callback_read_binary_section") -def tvm_callback_read_binary_section(binary, section, toolchain_prefix): - """Returns the contents of the specified section in the binary byte array - - Parameters - ---------- - binary : bytearray - contents of the binary - - section : str - type of section - - toolchain_prefix : str - prefix for binary names in target compiler toolchain - - Returns - ------- - section_bin : bytearray - contents of the read section - """ - tmp_dir = utils.tempdir() - tmp_bin = tmp_dir.relpath("temp.bin") - tmp_section = tmp_dir.relpath("tmp_section.bin") - with open(tmp_bin, "wb") as out_file: - out_file.write(bytes(binary)) - run_cmd( - [ - "{}objcopy".format(toolchain_prefix), - "--dump-section", - ".{}={}".format(section, tmp_section), - tmp_bin, - ] - ) - if os.path.isfile(tmp_section): - # Get section content if it exists. - with open(tmp_section, "rb") as f: - section_bin = bytearray(f.read()) - else: - # Return empty bytearray if the section does not exist. - section_bin = bytearray("", "utf-8") - return section_bin - - -@tvm._ffi.register_func("tvm_callback_get_symbol_map") -def tvm_callback_get_symbol_map(binary, toolchain_prefix): - """Obtains a map of symbols to addresses in the passed binary - - Parameters - ---------- - binary : bytearray - contents of the binary - - toolchain_prefix : str - prefix for binary names in target compiler toolchain - - Returns - ------- - map_str : str - map of defined symbols to addresses, encoded as a series of - alternating newline-separated keys and values - """ - tmp_dir = utils.tempdir() - tmp_obj = tmp_dir.relpath("tmp_obj.bin") - with open(tmp_obj, "wb") as out_file: - out_file.write(bytes(binary)) - nm_output = run_cmd(["{}nm".format(toolchain_prefix), "-C", "--defined-only", tmp_obj]) - nm_output = nm_output.splitlines() - map_str = "" - for line in nm_output: - line = line.split() - map_str += line[2] + "\n" - map_str += line[0] + "\n" - return map_str diff --git a/python/tvm/micro/compiler.py b/python/tvm/micro/compiler.py index 069f600a823e..a265f2a3f7ac 100644 --- a/python/tvm/micro/compiler.py +++ b/python/tvm/micro/compiler.py @@ -21,8 +21,8 @@ import glob import os import re +import subprocess -from tvm.contrib import binutils import tvm.target from . import build from . import class_factory @@ -30,6 +30,28 @@ from . 
import transport


+def run_cmd(cmd):
+    """Runs `cmd` in a subprocess and awaits its completion.
+
+    Parameters
+    ----------
+    cmd : List[str]
+        list of command-line arguments
+
+    Returns
+    -------
+    output : str
+        resulting stdout capture from the subprocess
+    """
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    (output, _) = proc.communicate()
+    output = output.decode("utf-8")
+    if proc.returncode != 0:
+        cmd_str = " ".join(cmd)
+        msg = f'error while running command "{cmd_str}":\n{output}'
+        raise RuntimeError(msg)
+    return output
+
+
 class DetectTargetError(Exception):
     """Raised when no target comment was detected in the sources given."""

@@ -232,13 +254,13 @@ def library(self, output, sources, options=None):
             output_filename = f"{src_base}.o"
             output_abspath = os.path.join(output, output_filename)

-            binutils.run_cmd(args + ["-c", "-o", output_abspath, src])
+            run_cmd(args + ["-c", "-o", output_abspath, src])
             outputs.append(output_abspath)

         output_filename = f"{os.path.basename(output)}.a"
         output_abspath = os.path.join(output, output_filename)
-        binutils.run_cmd([prefix + "ar", "-r", output_abspath] + outputs)
-        binutils.run_cmd([prefix + "ranlib", output_abspath])
+        run_cmd([prefix + "ar", "-r", output_abspath] + outputs)
+        run_cmd([prefix + "ranlib", output_abspath])

         return tvm.micro.MicroLibrary(output, [output_filename])

@@ -273,7 +295,7 @@ def binary(self, output, objects, options=None, link_main=True, main_options=Non
         for lib_name in obj.library_files:
             args.append(obj.abspath(lib_name))

-        binutils.run_cmd(args)
+        run_cmd(args)
         return tvm.micro.MicroBinary(output, output_filename, [])

     @property
diff --git a/tests/python/contrib/test_binutils.py b/tests/python/contrib/test_binutils.py
deleted file mode 100644
index f0aa2d157aed..000000000000
--- a/tests/python/contrib/test_binutils.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. 
- -Specifically, we test the following capabilities: - - querying the size of a binary section - - relocating sections within a binary to new addresses - - reading the contents of a binary section - - querying the address of a symbol in the binary -""" - -import tvm -from tvm import te -import subprocess -from tvm.contrib import utils -from tvm.contrib import cc -from tvm.contrib.binutils import * - -TOOLCHAIN_PREFIX = "" - - -def make_binary(): - prog = "int a = 7; \ - int main() { \ - int b = 5; \ - return 0; \ - }" - tmp_dir = utils.tempdir() - tmp_source = tmp_dir.relpath("source.c") - tmp_obj = tmp_dir.relpath("obj.obj") - with open(tmp_source, "w") as f: - f.write(prog) - cc.create_executable(tmp_obj, tmp_source, [], cc="{}gcc".format(TOOLCHAIN_PREFIX)) - prog_bin = bytearray(open(tmp_obj, "rb").read()) - return prog_bin - - -def test_tvm_callback_get_section_size(binary=None): - if binary is None: - binary = make_binary() - tmp_dir = utils.tempdir() - tmp_bin = tmp_dir.relpath("obj.bin") - with open(tmp_bin, "wb") as f: - f.write(binary) - - def verify(): - print( - "Text section size: %d" - % tvm_callback_get_section_size(tmp_bin, "text", TOOLCHAIN_PREFIX) - ) - print( - "Data section size: %d" - % tvm_callback_get_section_size(tmp_bin, "data", TOOLCHAIN_PREFIX) - ) - print( - "Bss section size: %d" % tvm_callback_get_section_size(tmp_bin, "bss", TOOLCHAIN_PREFIX) - ) - print() - - verify() - - -def test_tvm_callback_relocate_binary(): - binary = make_binary() - tmp_dir = utils.tempdir() - tmp_bin = tmp_dir.relpath("obj.bin") - with open(tmp_bin, "wb") as f: - f.write(binary) - - def verify(): - word_size = 8 - text_loc = 0x0 - rodata_loc = 0x10000 - data_loc = 0x20000 - bss_loc = 0x30000 - stack_end = 0x50000 - rel_bin = tvm_callback_relocate_binary( - tmp_bin, word_size, text_loc, rodata_loc, data_loc, bss_loc, stack_end, TOOLCHAIN_PREFIX - ) - print("Relocated binary section sizes") - test_tvm_callback_get_section_size(binary=rel_bin) - relf = tmp_dir.relpath("rel.bin") - with open(relf, "wb") as f: - f.write(rel_bin) - nm_proc = subprocess.Popen( - ["nm", "-C", "--defined-only", relf], stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) - (out, _) = nm_proc.communicate() - symbol_entries = out.decode("utf-8").split("\n") - for entry in symbol_entries: - if len(entry) == 0: - continue - sym_loc, section, sym_name = entry.split(" ") - sym_loc = int(sym_loc, 16) - if section == "T": # text - assert sym_loc >= text_loc and sym_loc < data_loc - elif section == "D": # data - assert sym_loc >= data_loc and sym_loc < bss_loc - elif section == "B": # bss - assert sym_loc >= bss_loc - - verify() - - -def test_tvm_callback_read_binary_section(): - binary = make_binary() - - def verify(): - text_bin = tvm_callback_read_binary_section(binary, "text", TOOLCHAIN_PREFIX) - data_bin = tvm_callback_read_binary_section(binary, "data", TOOLCHAIN_PREFIX) - bss_bin = tvm_callback_read_binary_section(binary, "bss", TOOLCHAIN_PREFIX) - print("Read text section part of binary? %r" % (text_bin in binary)) - print("Read data section part of binary? %r" % (data_bin in binary)) - print("Read bss section part of binary? 
%r" % (bss_bin in binary)) - print() - - verify() - - -def test_tvm_callback_get_symbol_map(): - binary = make_binary() - tmp_dir = utils.tempdir() - tmp_bin = tmp_dir.relpath("obj.bin") - with open(tmp_bin, "wb") as f: - f.write(binary) - - def verify(): - word_size = 8 - text_loc = 0x0 - rodata_loc = 0x10000 - data_loc = 0x20000 - bss_loc = 0x30000 - stack_end = 0x50000 - rel_bin = tvm_callback_relocate_binary( - tmp_bin, word_size, text_loc, rodata_loc, data_loc, bss_loc, stack_end, TOOLCHAIN_PREFIX - ) - symbol_map = tvm_callback_get_symbol_map(rel_bin, TOOLCHAIN_PREFIX) - symbols = set() - for i, line in enumerate(symbol_map.split("\n")): - # Every other line is the value the symbol maps to. - if i % 2 == 0: - symbols.add(line) - assert "a" in symbols - assert "main" in symbols - - verify() - - -if __name__ == "__main__": - test_tvm_callback_get_section_size() - test_tvm_callback_relocate_binary() - test_tvm_callback_read_binary_section() - test_tvm_callback_get_symbol_map() From c9a3efb11ed3670fd4c729521008d80e0ae0ea75 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 23 Nov 2020 20:15:32 +0000 Subject: [PATCH 198/258] AArch64 base algorithm refactoring in LLVM (#6907) * AArch64 base algorithm refactoring in LLVM - I refactored the assembly in arm_cpu/tensor_intrin.py to use LLVM+TIR - Removed the `interleave` boolean parameter in the intrinsic to switch among two different interleaving modes. LLVM will now take care of interleaving the instructions - Applied the changes accordingly to conv2d_gemm.py to call the right instrinsic Note: I found LLVM very sensible to the choice of the `-mcpu`. So, in order to preserve performance, it is important to specify the right `-mcpu` when creating the LLVM target * Fix linting * Fix linting -2 * Fixing comments * Address review comments * Fix spaces around ':' in docstrings --- python/tvm/topi/arm_cpu/conv2d_gemm.py | 15 +- python/tvm/topi/arm_cpu/tensor_intrin.py | 746 +++++++++++------------ 2 files changed, 348 insertions(+), 413 deletions(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 6a5cb2ae890e..85c03997a98d 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -24,8 +24,7 @@ from ..utils import get_const_tuple, get_const_int from ..nn.utils import get_pad_tuple from .tensor_intrin import ( - gemm_quantized, - gemm_quantized_impl, + gemm_4x4_int8_int8_int32, gemm_acc_4x4_int8_int8_int32, gemm_acc_nx16_int8_int8_int32, gemm_acc_2x2_int8_int8_int32, @@ -51,11 +50,8 @@ def configure_knobs(cfg, M, K): if not is_dotprod_available(): cfg.define_knob("gemm_quantized_unroll", [True, False]) - cfg.define_knob("gemm_quantized_interleave", [True, False]) - if cfg.is_fallback: cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) - cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) # Compute function @@ -361,14 +357,9 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): elif is_aarch64_arm(): s[C_interleaved].reorder(yi, xi) K = A_interleaved_input.shape[2] + assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported" unroll = cfg["gemm_quantized_unroll"].val - interleave = cfg["gemm_quantized_interleave"].val - gemm = gemm_quantized(M, N, K, unroll, interleave, in_type, out_type) - s[C_interleaved].pragma( - b_outer_gemm_fused, - "import_llvm", - gemm_quantized_impl(M, N, K, unroll, interleave, in_type), - ) + gemm = gemm_4x4_int8_int8_int32(M, N, K, unroll, in_type) s[C_interleaved].tensorize(yi, 
gemm) # Output transform diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 8ccbe0c41298..4055d7b05c24 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -19,392 +19,42 @@ import tvm from tvm import te -from tvm.contrib import utils, clang - - -def gemm_quantized_4_4_batched(): - return """ - // First half - // Higher part of a0 * {b0,b1,b2,b3} - "umull v8.8h, v0.8b, v4.8b\\n" - "umull v9.8h, v0.8b, v5.8b\\n" - "umull v10.8h, v0.8b, v6.8b\\n" - "umull v11.8h, v0.8b, v7.8b\\n" - - // Higher part of a1 * {b0,b1,b2,b3} - "umull v12.8h, v1.8b, v4.8b\\n" - "umull v13.8h, v1.8b, v5.8b\\n" - "umull v14.8h, v1.8b, v6.8b\\n" - "umull v15.8h, v1.8b, v7.8b\\n" - - // Accumulate - "uadalp v16.4s, v8.8h\\n" - "uadalp v17.4s, v9.8h\\n" - "uadalp v18.4s, v10.8h\\n" - "uadalp v19.4s, v11.8h\\n" - "uadalp v20.4s, v12.8h\\n" - "uadalp v21.4s, v13.8h\\n" - "uadalp v22.4s, v14.8h\\n" - "uadalp v23.4s, v15.8h\\n" - - // Lower part of a0 * {b0,b1,b2,b3} - "umull2 v8.8h, v0.16b, v4.16b\\n" - "umull2 v9.8h, v0.16b, v5.16b\\n" - "umull2 v10.8h, v0.16b, v6.16b\\n" - "umull2 v11.8h, v0.16b, v7.16b\\n" - - // Lower part of a1 * {b0,b1,b2,b3} - "umull2 v12.8h, v1.16b, v4.16b\\n" - "umull2 v13.8h, v1.16b, v5.16b\\n" - "umull2 v14.8h, v1.16b, v6.16b\\n" - "umull2 v15.8h, v1.16b, v7.16b\\n" - - // Accumulate again - "uadalp v16.4s, v8.8h\\n" - "uadalp v17.4s, v9.8h\\n" - "uadalp v18.4s, v10.8h\\n" - "uadalp v19.4s, v11.8h\\n" - "uadalp v20.4s, v12.8h\\n" - "uadalp v21.4s, v13.8h\\n" - "uadalp v22.4s, v14.8h\\n" - "uadalp v23.4s, v15.8h\\n" - - // Second half - // Lower part of a2 * {b0,b1,b2,b3} - "umull v8.8h, v2.8b, v4.8b\\n" - "umull v9.8h, v2.8b, v5.8b\\n" - "umull v10.8h, v2.8b, v6.8b\\n" - "umull v11.8h, v2.8b, v7.8b\\n" - - // Lower part of a3 * {b0,b1,b2,b3} - "umull v12.8h, v3.8b, v4.8b\\n" - "umull v13.8h, v3.8b, v5.8b\\n" - "umull v14.8h, v3.8b, v6.8b\\n" - "umull v15.8h, v3.8b, v7.8b\\n" - - // Accumulate - "uadalp v24.4s, v8.8h\\n" - "uadalp v25.4s, v9.8h\\n" - "uadalp v26.4s, v10.8h\\n" - "uadalp v27.4s, v11.8h\\n" - "uadalp v28.4s, v12.8h\\n" - "uadalp v29.4s, v13.8h\\n" - "uadalp v30.4s, v14.8h\\n" - "uadalp v31.4s, v15.8h\\n" - - // Higher part of a2 * {b0,b1,b2,b3} - "umull2 v8.8h, v2.16b, v4.16b\\n" - "umull2 v9.8h, v2.16b, v5.16b\\n" - "umull2 v10.8h, v2.16b, v6.16b\\n" - "umull2 v11.8h, v2.16b, v7.16b\\n" - - // Higher part of a3 * {b0,b1,b2,b3} - "umull2 v12.8h, v3.16b, v4.16b\\n" - "umull2 v13.8h, v3.16b, v5.16b\\n" - "umull2 v14.8h, v3.16b, v6.16b\\n" - "umull2 v15.8h, v3.16b, v7.16b\\n" - - // Accumulate again - "uadalp v24.4s, v8.8h\\n" - "uadalp v25.4s, v9.8h\\n" - "uadalp v26.4s, v10.8h\\n" - "uadalp v27.4s, v11.8h\\n" - "uadalp v28.4s, v12.8h\\n" - "uadalp v29.4s, v13.8h\\n" - "uadalp v30.4s, v14.8h\\n" - "uadalp v31.4s, v15.8h\\n" - """ - - -def gemm_quantized_4_4_interleaved(): - return """ - // First half - // Higher part of a0 * {b0,b1,b2,b3} and accumulate - "umull v8.8h, v0.8b, v4.8b\\n" - "uadalp v16.4s, v8.8h\\n" - "umull v9.8h, v0.8b, v5.8b\\n" - "uadalp v17.4s, v9.8h\\n" - "umull v10.8h, v0.8b, v6.8b\\n" - "uadalp v18.4s, v10.8h\\n" - "umull v11.8h, v0.8b, v7.8b\\n" - "uadalp v19.4s, v11.8h\\n" - - // Higher part of a1 * {b0,b1,b2,b3} and accumulate - "umull v12.8h, v1.8b, v4.8b\\n" - "uadalp v20.4s, v12.8h\\n" - "umull v13.8h, v1.8b, v5.8b\\n" - "uadalp v21.4s, v13.8h\\n" - "umull v14.8h, v1.8b, v6.8b\\n" - "uadalp v22.4s, v14.8h\\n" - "umull v15.8h, v1.8b, v7.8b\\n" - "uadalp 
v23.4s, v15.8h\\n" - - // Lower part of a0 * {b0,b1,b2,b3} and accumulate - "umull2 v8.8h, v0.16b, v4.16b\\n" - "uadalp v16.4s, v8.8h\\n" - "umull2 v9.8h, v0.16b, v5.16b\\n" - "uadalp v17.4s, v9.8h\\n" - "umull2 v10.8h, v0.16b, v6.16b\\n" - "uadalp v18.4s, v10.8h\\n" - "umull2 v11.8h, v0.16b, v7.16b\\n" - "uadalp v19.4s, v11.8h\\n" - - // Lower part of a1 * {b0,b1,b2,b3} and accumulate - "umull2 v12.8h, v1.16b, v4.16b\\n" - "uadalp v20.4s, v12.8h\\n" - "umull2 v13.8h, v1.16b, v5.16b\\n" - "uadalp v21.4s, v13.8h\\n" - "umull2 v14.8h, v1.16b, v6.16b\\n" - "uadalp v22.4s, v14.8h\\n" - "umull2 v15.8h, v1.16b, v7.16b\\n" - "uadalp v23.4s, v15.8h\\n" - - // Second half - // Higher part of a2 * {b0,b1,b2,b3} and accumulate - "umull v8.8h, v2.8b, v4.8b\\n" - "uadalp v24.4s, v8.8h\\n" - "umull v9.8h, v2.8b, v5.8b\\n" - "uadalp v25.4s, v9.8h\\n" - "umull v10.8h, v2.8b, v6.8b\\n" - "uadalp v26.4s, v10.8h\\n" - "umull v11.8h, v2.8b, v7.8b\\n" - "uadalp v27.4s, v11.8h\\n" - - // Higher part of a3 * {b0,b1,b2,b3} and accumulate - "umull v12.8h, v3.8b, v4.8b\\n" - "uadalp v28.4s, v12.8h\\n" - "umull v13.8h, v3.8b, v5.8b\\n" - "uadalp v29.4s, v13.8h\\n" - "umull v14.8h, v3.8b, v6.8b\\n" - "uadalp v30.4s, v14.8h\\n" - "umull v15.8h, v3.8b, v7.8b\\n" - "uadalp v31.4s, v15.8h\\n" - - // Lower part of a2 * {b0,b1,b2,b3} and accumulate - "umull2 v8.8h, v2.16b, v4.16b\\n" - "uadalp v24.4s, v8.8h\\n" - "umull2 v9.8h, v2.16b, v5.16b\\n" - "uadalp v25.4s, v9.8h\\n" - "umull2 v10.8h, v2.16b, v6.16b\\n" - "uadalp v26.4s, v10.8h\\n" - "umull2 v11.8h, v2.16b, v7.16b\\n" - "uadalp v27.4s, v11.8h\\n" - - // Lower part of a3 * {b0,b1,b2,b3} and accumulate - "umull2 v12.8h, v3.16b, v4.16b\\n" - "uadalp v28.4s, v12.8h\\n" - "umull2 v13.8h, v3.16b, v5.16b\\n" - "uadalp v29.4s, v13.8h\\n" - "umull2 v14.8h, v3.16b, v6.16b\\n" - "uadalp v30.4s, v14.8h\\n" - "umull2 v15.8h, v3.16b, v7.16b\\n" - "uadalp v31.4s, v15.8h\\n" - """ - - -def gemm_quantized_impl(M, N, K, unroll, interleave, data_type="uint8"): - """Assembly implementation of a blocked gemv. Given - a block a of shape (4, k) and a block b' of shape (4, k) - produces the output block c = a*b of shape (4,4)""" - stepA = min(4, M) - stepB = min(4, N) - assert data_type in ["uint8", "int8"], "Only uint8/int8 supported for this implementation" - signature = """extern "C" int gemm_quantized_{0}_{0}_int32_{1}_{2}""".format( - data_type, stepA, stepB - ) - if unroll: - signature += "_" + str(K) - - if interleave: - signature += "_interleaved" - - signature += """(int *c_buffer, - unsigned char *a_buffer, - unsigned char *b_buffer, - int K, int m, int n)""" - - cc_code = signature - cc_code += """ - { - unsigned char * a_ptr = a_buffer; - unsigned char * b_ptr = b_buffer; - int * c_ptr = c_buffer; - - int k = K / 16; - - __asm__ __volatile__ ( - "movi v16.4s, #0\\n" - "movi v17.4s, #0\\n" - "movi v18.4s, #0\\n" - "movi v19.4s, #0\\n" - "movi v20.4s, #0\\n" - "movi v21.4s, #0\\n" - "movi v22.4s, #0\\n" - "movi v23.4s, #0\\n" - "movi v24.4s, #0\\n" - "movi v25.4s, #0\\n" - "movi v26.4s, #0\\n" - "movi v27.4s, #0\\n" - "movi v28.4s, #0\\n" - "movi v29.4s, #0\\n" - "movi v30.4s, #0\\n" - "movi v31.4s, #0\\n" - "1:" +def gemm_4x4_int8_int8_int32(M, N, K, unroll, in_type): """ + Int8 4x4 matrix multiplication and accumulation using a sequence of + umull -> uadalp -> umull2 -> uadalp instructions. This function + takes two arrays of int8 data type A[4][K] and B[4][K], and produces + a 4x4 matrix which is equal to A*B'. 
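Editor's aside: the docstring above fully determines the semantics of the new intrinsic, so a short NumPy sketch of the same contract may help readers skimming the diff. The value of K and the random inputs below are illustrative assumptions (any K that is a multiple of 16 works); this is not part of the patch.

.. code:: python

    import numpy as np

    K = 32  # assumption for the sketch; the intrinsic requires K % 16 == 0
    A = np.random.randint(0, 127, size=(4, K)).astype("int8")
    B = np.random.randint(0, 127, size=(4, K)).astype("int8")

    # C = A * B' with 32-bit accumulation: C[i][j] = sum over k of A[i][k] * B[j][k]
    C = A.astype("int32") @ B.astype("int32").T
    assert C.shape == (4, 4)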
- main_loop = ' "ldr q0, [%[a_ptr]]\\n" ' - - if M > 1: - main_loop += ' "ldr q1, [%[a_ptr], #16]\\n" ' - else: - main_loop += ' "movi v1.4s, #0\\n" ' - - if M > 2: - main_loop += ' "ldr q2, [%[a_ptr], #32]\\n" ' - else: - main_loop += ' "movi v2.4s, #0\\n" ' - - if M > 3: - main_loop += ' "ldr q3, [%[a_ptr], #48]\\n" ' - else: - main_loop += ' "movi v3.4s, #0\\n" ' - - main_loop += ' "ldr q4, [%[b_ptr]]\\n" ' - - if N > 1: - main_loop += ' "ldr q5, [%[b_ptr], #16]\\n" ' - - if N > 2: - main_loop += ' "ldr q6, [%[b_ptr], #32]\\n" ' - - if N > 3: - main_loop += ' "ldr q7, [%[b_ptr], #48]\\n" ' - - # Main computation can interleave multiply/accumulate instructions - # or schedule them in batches (first all multiplies then all accumulates) - if interleave: - main_loop += gemm_quantized_4_4_interleaved() - else: - main_loop += gemm_quantized_4_4_batched() + The pseudo code is as follows. - blockA = min(64, M * 16) - blockB = min(64, N * 16) - main_loop += """// Increment pointers - "add %[a_ptr], %[a_ptr], #{0}\\n" - "add %[b_ptr], %[b_ptr], #{1}\\n" """.format( - blockA, blockB - ) + .. code-block:: c - if unroll: - k = int(K // 16) - for l in range(0, k): - cc_code += main_loop - else: - cc_code += main_loop - cc_code += """ - "subs %w[k], %w[k], #1\\n" - "cbnz %w[k], 1b\\n" - """ - cc_code += """ - // Final additions - - // v16 contains the four partial sums of a[0, 0:K].*b[0,0:K], let's call them (a,b,c,d) - // v17 contains the four partial sums of a[0, 0:K].*b[1,0:K], let's call them (e,f,g,h) - // v18 contains the four partial sums of a[0, 0:K].*b[2,0:K], let's call them (i,j,k,l) - // v19 contains the four partial sums of a[0, 0:K].*b[3,0:K], let's call them (m,n,o,p) - "addp v16.4s, v16.4s, v17.4s\\n" // v16 = (a+b, c+d, e+f, g+h) - "addp v17.4s, v18.4s, v19.4s\\n" // v17 = (i+j, k+l, m+n, o+p) - "addp v16.4s, v16.4s, v17.4s\\n" // v16 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) - - // v20 contains the four partial sums of a[1, 0:K].*b[0,0:K], let's call them (a,b,c,d) - // v21 contains the four partial sums of a[1, 0:K].*b[1,0:K], let's call them (e,f,g,h) - // v22 contains the four partial sums of a[1, 0:K].*b[2,0:K], let's call them (i,j,k,l) - // v23 contains the four partial sums of a[1, 0:K].*b[3,0:K], let's call them (m,n,o,p) - "addp v20.4s, v20.4s, v21.4s\\n" // v20 = (a+b, c+d, e+f, g+h) - "addp v21.4s, v22.4s, v23.4s\\n" // v21 = (i+j, k+l, m+n, o+p) - "addp v20.4s, v20.4s, v21.4s\\n" // v20 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) - - // v24 contains the four partial sums of a[2, 0:K].*b[0,0:K], let's call them (a,b,c,d) - // v25 contains the four partial sums of a[2, 0:K].*b[1,0:K], let's call them (e,f,g,h) - // v26 contains the four partial sums of a[2, 0:K].*b[2,0:K], let's call them (i,j,k,l) - // v27 contains the four partial sums of a[2, 0:K].*b[3,0:K], let's call them (m,n,o,p) - "addp v24.4s, v24.4s, v25.4s\\n" // v24 = (a+b, c+d, e+f, g+h) - "addp v25.4s, v26.4s, v27.4s\\n" // v25 = (i+j, k+l, m+n, o+p) - "addp v24.4s, v24.4s, v25.4s\\n" // v24 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) - - // v28 contains the four partial sums of a[3, 0:K].*b[0,0:K], let's call them (a,b,c,d) - // v29 contains the four partial sums of a[3, 0:K].*b[1,0:K], let's call them (e,f,g,h) - // v30 contains the four partial sums of a[3, 0:K].*b[2,0:K], let's call them (i,j,k,l) - // v31 contains the four partial sums of a[3, 0:K].*b[3,0:K], let's call them (m,n,o,p) - "addp v28.4s, v28.4s, v29.4s\\n" // v28 = (a+b, c+d, e+f, g+h) - "addp v29.4s, v30.4s, v31.4s\\n" // v29 = (i+j, k+l, m+n, o+p) - 
"addp v28.4s, v28.4s, v29.4s\\n" // v28 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) - - "str q16, [%[c_ptr]]\\n" - """ - - stepC = min(4, N) - if M > 1: - cc_code += ' "str q20, [%[c_ptr], #{0}]\\n" '.format(stepC * 4) - - if M > 2: - cc_code += ' "str q24, [%[c_ptr], #{0}]\\n" '.format(stepC * 8) - - if M > 3: - cc_code += ' "str q28, [%[c_ptr], #{0}]\\n" '.format(stepC * 12) - - cc_code += """ - : [c_ptr] "+r" (c_ptr), [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [k] "+r" (k) - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31" - ); - return 0; + void gemm_4x4_int8_int8_int32(int8 A[4][K], int8 B[4][K], int32 C[4][4]){ + for (int i = 0; i < 4; i++){ + for (int j = 0; j < 4; j++){ + for (int k = 0; k < K; k++){ + C[i][j] += A[i][k] * B[j][k] + } + } } - """ - if data_type == "int8": - cc_code = cc_code.replace("unsigned char", "char") - cc_code = cc_code.replace("umull", "smull") - cc_code = cc_code.replace("uadalp", "sadalp") - - temp = utils.tempdir() - ll_path = temp.relpath("temp.ll") - # Create LLVM ir from c source code - ll_code = clang.create_llvm( - cc_code, options=["--target=aarch64-linux-gnu -mattr=+neon"], output=ll_path - ) - return ll_code - - -def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): - """ - Use integer ARM v8 instructions in order to produce a block c of 4x4 elements - given two 4xK blocks a and b' (where b' is a Kx4 block transposed). The final - result is c = a*b (where '*' indicates the matrix product) - - Every row of the matrix c is obtained (for uint8) by a sequence of - - umull -> uadalp -> umull2 -> uadalp - - The block size is constrained by the number of registers available in arvm8. This - function returns a TensorIntrin that can be used to tensorize - a schedule. + Notes: + * The tiling strategy is picked to maximize register usage. 
Parameters
     ----------
-    M: int
+    M : int
         rows of the matrix A
-    N: int
+    N : int
         columns of the matrix B
-    K: int
+    K : int
         columns of matrix A
-    in_type: str, {'uint8', 'int8'}
-    out_type: str, {'uint32', 'int32'}
+    unroll : bool
+        Unroll the loop accumulation if True
+    in_type : str, {'uint8', 'int8'}

     Returns
     -------
@@ -414,7 +64,7 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type):
     assert in_type in ["uint8", "int8"]
     A = te.placeholder((K // 16, te.var("m"), 16), dtype=in_type, name="A")
     B = te.placeholder((K // 16, te.var("n"), 16), dtype=in_type, name="B")
-
+    dtype_vec = in_type + "x16"
     idxm = tvm.tir.indexmod
     k = te.reduce_axis((0, K), "k")
@@ -447,28 +97,322 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type):
         C.shape, dtype="int32", name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1]
     )

+    # Intrinsics used in the following algorithm
+    umull_intrin = "llvm.aarch64.neon.umull" if in_type == "uint8" else "llvm.aarch64.neon.smull"
+    uaddlp_intrin = "llvm.aarch64.neon.uaddlp" if in_type == "uint8" else "llvm.aarch64.neon.saddlp"
+    addp_intrin = "llvm.aarch64.neon.addp"
+
+    def uadalp(a, b):
+        """Add pair and accumulate
+
+        Parameters:
+        ----------
+        a: int32x4 vector
+        b: int16x8 vector
+
+        Returns:
+        --------
+        return a int32x4 vector
+
+        Pseudocode:
+        ----------
+        a += (b0+b1, b2+b3, b4+b5, b6+b7)
+        """
+
+        return a + tvm.tir.call_llvm_pure_intrin(
+            "int32x4", uaddlp_intrin, tvm.tir.const(1, "uint32"), b
+        )
+
+    def umull(a, b):
+        """Multiply long (higher part)
+
+        Parameters:
+        ----------
+        a: int8x16 vector
+        b: int8x16 vector
+
+        Returns:
+        --------
+        return a int16x8 vector
+
+        Pseudocode:
+        ----------
+        c = (a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7)
+        """
+        a_high = tvm.tir.call_intrin("int8x8", "tir.vectorhigh", a)
+        b_high = tvm.tir.call_intrin("int8x8", "tir.vectorhigh", b)
+        c = tvm.tir.call_llvm_pure_intrin(
+            "int16x8", umull_intrin, tvm.tir.const(2, "uint32"), a_high, b_high
+        )
+        return c
+
+    def umull2(a, b):
+        """Multiply long (lower part)
+
+        Parameters:
+        ----------
+        a: int8x16 vector
+        b: int8x16 vector
+
+        Returns:
+        --------
+        return a int16x8 vector
+
+        Pseudocode:
+        ----------
+        c = (a8*b8, a9*b9, a10*b10, a11*b11, a12*b12, a13*b13, a14*b14, a15*b15)
+        """
+        a_low = tvm.tir.call_intrin("int8x8", "tir.vectorlow", a)
+        b_low = tvm.tir.call_intrin("int8x8", "tir.vectorlow", b)
+        c = tvm.tir.call_llvm_pure_intrin(
+            "int16x8", umull_intrin, tvm.tir.const(2, "uint32"), a_low, b_low
+        )
+        return c
+
+    def addp(a, b):
+        """Add two vectors in pairs
+
+        Parameters:
+        ----------
+        a: int32x4 vector
+        b: int32x4 vector
+
+        Returns:
+        --------
+        return a int32x4 vector
+
+        Pseudocode:
+        ----------
+        c = (a0+a1, a2+a3, b0+b1, b2+b3)
+        """
+        return tvm.tir.call_llvm_pure_intrin(
+            "int32x4", addp_intrin, tvm.tir.const(2, "uint32"), a, b
+        )
+
+    def accumulation_loop(M, N, ins, acc, tile_idx):
+        """Internal tile accumulation. This function
+        takes two arrays of int8 data type A[tile_idx][4][16] and B[tile_idx][4][16], produces
+        a 4x4 matrix which is equal to A*B' and accumulates into C[4][4]
+
+        The pseudo code is as follows.
+
+        .. code-block:: c
+
+            void gemm_4x4_int8_int8_int32(int8 A[tile_idx][4][K],
+                                          int8 B[tile_idx][4][K],
+                                          int32 C[4][4]){
+                for (int i = 0; i < 4; i++){
+                    for (int j = 0; j < 4; j++){
+                        for (int k = 0; k < 16; k++){
+                            C[i][j] += A[tile_idx][i][k] * B[tile_idx][j][k]
+                        }
+                    }
+                }
+            }
+
+        Notes:
+            * The tiling strategy is picked to maximize register usage.
+
+        Parameters:
+        ----------
+        M : int
+            Number of total rows of the output matrix
+        N : int
+            Number of total columns of the output matrix
+        ins : list of tvm.tir.buffer
+            Input buffers
+        acc : tvm.tir.ir_builder.BufferVar
+            Bank of register accumulators
+        tile_idx : int
+            Index of a sub-tile of A and B in A[tile_idx][:][:] and B[tile_idx][:][:].
+            Please note that 0 <= tile_idx < K//16
+
+        """
+        a0 = ins[0].vload([tile_idx, 0, 0], dtype_vec)
+        a1 = tvm.tir.const(0, "int8x16")
+        if M > 1:
+            a1 = ins[0].vload([tile_idx, 1, 0], dtype_vec)
+        a2 = tvm.tir.const(0, "int8x16")
+        if M > 2:
+            a2 = ins[0].vload([tile_idx, 2, 0], dtype_vec)
+        a3 = tvm.tir.const(0, "int8x16")
+        if M > 3:
+            a3 = ins[0].vload([tile_idx, 3, 0], dtype_vec)
+
+        b0 = ins[1].vload([tile_idx, 0, 0], dtype_vec)
+        b1 = tvm.tir.const(0, "int8x16")
+        if N > 1:
+            b1 = ins[1].vload([tile_idx, 1, 0], dtype_vec)
+        b2 = tvm.tir.const(0, "int8x16")
+        if N > 2:
+            b2 = ins[1].vload([tile_idx, 2, 0], dtype_vec)
+        b3 = tvm.tir.const(0, "int8x16")
+        if N > 3:
+            b3 = ins[1].vload([tile_idx, 3, 0], dtype_vec)
+
+        # First half
+        # Lower part of a0 * {b0,b1,b2,b3}
+        d00 = umull(a0, b0)
+        d01 = umull(a0, b1)
+        d02 = umull(a0, b2)
+        d03 = umull(a0, b3)
+
+        # Lower part of a1 * {b0,b1,b2,b3}
+        d10 = umull(a1, b0)
+        d11 = umull(a1, b1)
+        d12 = umull(a1, b2)
+        d13 = umull(a1, b3)
+
+        # Accumulate
+        acc[0] = uadalp(acc[0], d00)
+        acc[1] = uadalp(acc[1], d01)
+        acc[2] = uadalp(acc[2], d02)
+        acc[3] = uadalp(acc[3], d03)
+        acc[4] = uadalp(acc[4], d10)
+        acc[5] = uadalp(acc[5], d11)
+        acc[6] = uadalp(acc[6], d12)
+        acc[7] = uadalp(acc[7], d13)
+
+        # Higher part of a0 * {b0,b1,b2,b3}
+        d00 = umull2(a0, b0)
+        d01 = umull2(a0, b1)
+        d02 = umull2(a0, b2)
+        d03 = umull2(a0, b3)
+
+        # Higher part of a1 * {b0,b1,b2,b3}
+        d10 = umull2(a1, b0)
+        d11 = umull2(a1, b1)
+        d12 = umull2(a1, b2)
+        d13 = umull2(a1, b3)
+
+        # Accumulate again
+        acc[0] = uadalp(acc[0], d00)
+        acc[1] = uadalp(acc[1], d01)
+        acc[2] = uadalp(acc[2], d02)
+        acc[3] = uadalp(acc[3], d03)
+        acc[4] = uadalp(acc[4], d10)
+        acc[5] = uadalp(acc[5], d11)
+        acc[6] = uadalp(acc[6], d12)
+        acc[7] = uadalp(acc[7], d13)
+
+        # Second half
+        # Lower part of a2 * {b0,b1,b2,b3}
+        d00 = umull(a2, b0)
+        d01 = umull(a2, b1)
+        d02 = umull(a2, b2)
+        d03 = umull(a2, b3)
+
+        # Lower part of a3 * {b0,b1,b2,b3}
+        d10 = umull(a3, b0)
+        d11 = umull(a3, b1)
+        d12 = umull(a3, b2)
+        d13 = umull(a3, b3)
+
+        # Accumulate
+        acc[8] = uadalp(acc[8], d00)
+        acc[9] = uadalp(acc[9], d01)
+        acc[10] = uadalp(acc[10], d02)
+        acc[11] = uadalp(acc[11], d03)
+        acc[12] = uadalp(acc[12], d10)
+        acc[13] = uadalp(acc[13], d11)
+        acc[14] = uadalp(acc[14], d12)
+        acc[15] = uadalp(acc[15], d13)
+
+        # Higher part of a2 * {b0,b1,b2,b3}
+        d00 = umull2(a2, b0)
+        d01 = umull2(a2, b1)
+        d02 = umull2(a2, b2)
+        d03 = umull2(a2, b3)
+
+        # Higher part of a3 * {b0,b1,b2,b3}
+        d10 = umull2(a3, b0)
+        d11 = umull2(a3, b1)
+        d12 = umull2(a3, b2)
+        d13 = umull2(a3, b3)
+
+        # Accumulate
+        acc[8] = uadalp(acc[8], d00)
+        acc[9] = uadalp(acc[9], d01)
+        acc[10] = uadalp(acc[10], d02)
+        acc[11] = uadalp(acc[11], d03)
+        acc[12] = uadalp(acc[12], d10)
+        acc[13] = uadalp(acc[13], d11)
+        acc[14] = uadalp(acc[14], d12)
+        acc[15] = uadalp(acc[15], d13)
+
     def _intrin_func(ins, outs):
         def _instr():
             ib = tvm.tir.ir_builder.create()
-            aa, bb = ins
-            cc = outs[0]
-            stepA = min(4, M)
-            stepB = min(4, N)
-            intrin_name = "gemm_quantized_{0}_{0}_int32_{1}_{2}".format(in_type, stepA, stepB)
+            # Allocate a local buffer (possibly translates to registers)
+            acc = 
ib.allocate("int32x4", 16, name="accs", scope="local") + m = outs[0].shape[0] + n = outs[0].shape[1] + # Initialization + for i in range(0, 16): + acc[i] = tvm.tir.const(0, "int32x4") + if unroll: - intrin_name += "_" + str(K) - if interleave: - intrin_name += "_interleaved" - ib.emit( - tvm.tir.call_extern( - "int32", - intrin_name, - outs[0].access_ptr("w"), - a_buffer.access_ptr("r"), - b_buffer.access_ptr("r"), - K, - ) - ) + for i in range(0, int(K // 16)): + accumulation_loop(M, N, ins, acc, i) + else: + with ib.for_range(0, K // 16, name="i") as i: + accumulation_loop(M, N, ins, acc, i) + + # Final accumulations + # acc[4*r + c] contains the partial accumulations of element C[r][c] + # + # In particular: + # acc[4*r] contains the partial sums of a[r,0:K].*b[0,0:K] -> (a,b,c,d) + # acc[4*r+1] contains the partial sums of a[r, 0:K].*b[1,0:K] -> (e,f,g,h) + # acc[4*r+2] contains the partial sums of a[r, 0:K].*b[2,0:K] -> (i,j,k,l) + # acc[4*r+3] contains the partial sums of a[r, 0:K].*b[3,0:K] -> (m,n,o,p) + # + # Please note that 0<= r, c < 4 + + acc[0] = addp(acc[0], acc[1]) # (a+b, c+d, e+f, g+h) + acc[1] = addp(acc[2], acc[3]) # (i+j, k+l, m+n, o+p) + acc[0] = addp(acc[0], acc[1]) # (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) + + acc[4] = addp(acc[4], acc[5]) # (a+b, c+d, e+f, g+h) + acc[5] = addp(acc[6], acc[7]) # (i+j, k+l, m+n, o+p) + acc[4] = addp(acc[4], acc[5]) # (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) + + acc[8] = addp(acc[8], acc[9]) # (a+b, c+d, e+f, g+h) + acc[9] = addp(acc[10], acc[11]) # (i+j, k+l, m+n, o+p) + acc[8] = addp(acc[8], acc[9]) # (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) + + acc[12] = addp(acc[12], acc[13]) # (a+b, c+d, e+f, g+h) + acc[13] = addp(acc[14], acc[15]) # (i+j, k+l, m+n, o+p) + acc[12] = addp(acc[12], acc[13]) # (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) + + # Store the result + if N > 3: + out_0 = acc[0] + out_1 = acc[4] + out_2 = acc[8] + out_3 = acc[12] + elif N > 2: + out_0 = tvm.tir.call_intrin("int32x3", "tir.reinterpret", acc[0]) + out_1 = tvm.tir.call_intrin("int32x3", "tir.reinterpret", acc[4]) + out_2 = tvm.tir.call_intrin("int32x3", "tir.reinterpret", acc[8]) + out_3 = tvm.tir.call_intrin("int32x3", "tir.reinterpret", acc[12]) + elif N > 1: + out_0 = tvm.tir.call_intrin("int32x2", "tir.reinterpret", acc[0]) + out_1 = tvm.tir.call_intrin("int32x2", "tir.reinterpret", acc[4]) + out_2 = tvm.tir.call_intrin("int32x2", "tir.reinterpret", acc[8]) + out_3 = tvm.tir.call_intrin("int32x2", "tir.reinterpret", acc[12]) + else: + out_0 = tvm.tir.call_intrin("int32", "tir.reinterpret", acc[0]) + out_1 = tvm.tir.call_intrin("int32", "tir.reinterpret", acc[4]) + out_2 = tvm.tir.call_intrin("int32", "tir.reinterpret", acc[8]) + out_3 = tvm.tir.call_intrin("int32", "tir.reinterpret", acc[12]) + + ib.emit(outs[0].vstore([0, 0], out_0)) + if M > 1: + ib.emit(outs[0].vstore([1, 0], out_1)) + if M > 2: + ib.emit(outs[0].vstore([2, 0], out_2)) + if M > 3: + ib.emit(outs[0].vstore([3, 0], out_3)) return ib.get() # body, reset, update @@ -509,9 +453,9 @@ def dot_int8_int8_int32(int32_lanes, dtype="uint"): Parameters ---------- - int32_lanes: int + int32_lanes : int How many int32/uint32 to produce - dtype: str, optional, {"uint", "int"} + dtype : str, optional, {"uint", "int"} Whether it works on unsigned int or signed int Returns @@ -602,16 +546,16 @@ def select_word(vec, lane, dtype_vec): Parameters ---------- - vec: tvm.tir.Expr + vec : tvm.tir.Expr int8x16 vector expression - lane: int + lane : int vector lane we want to replicate - dtype_vec: str + dtype_vec : str vector data 
type (e.g., int8x16) Returns ---------- - output: tvm.tir.Expr + output : tvm.tir.Expr replicated vector """ # Reinterpret vec_a as 4 int32 words @@ -648,7 +592,7 @@ def gemm_acc_4x4_int8_int8_int32(dtype): Parameters ---------- - dtype: str, {"uint8", "int8"} + dtype : str, {"uint8", "int8"} Whether it works on unsigned int or signed int Returns @@ -779,9 +723,9 @@ def gemm_acc_nx16_int8_int8_int32(dtype, rows): Parameters ---------- - dtype: str, {"uint8", "int8"} + dtype : str, {"uint8", "int8"} Whether it works on unsigned int or signed int - rows: int + rows : int Number of the output rows "n" Returns @@ -990,7 +934,7 @@ def gemm_acc_2x2_int8_int8_int32(dtype): Parameters ---------- - dtype: str, {"uint8", "int8"} + dtype : str, {"uint8", "int8"} Whether it works on unsigned int or signed int Returns From dafcd9c33f52f1c19135af6efe7fa7faab475dcd Mon Sep 17 00:00:00 2001 From: Robert Kimball Date: Mon, 23 Nov 2020 16:41:53 -0800 Subject: [PATCH 199/258] Fix code to work with cmake 3.2 (#6952) --- cmake/modules/ClangFlags.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/modules/ClangFlags.cmake b/cmake/modules/ClangFlags.cmake index 9a3ac05a2a5b..53d0e3631caf 100644 --- a/cmake/modules/ClangFlags.cmake +++ b/cmake/modules/ClangFlags.cmake @@ -21,7 +21,11 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_full_version) string (REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION ${clang_full_version}) message(STATUS "CLANG_VERSION ${CLANG_VERSION}") - if (CLANG_VERSION VERSION_GREATER_EQUAL 10.0) + # cmake 3.2 does not support VERSION_GREATER_EQUAL + set(CLANG_MINIMUM_VERSION 10.0) + if ((CLANG_VERSION VERSION_GREATER ${CLANG_MINIMUM_VERSION}) + OR + (CLANG_VERSION VERSION_EQUAL ${CLANG_MINIMUM_VERSION})) message(STATUS "Setting enhanced clang warning flags") # These warnings are only enabled when clang's -Weverything flag is enabled From 9fcb6ff3550a9c1e03cbead0a96ae294c0906bd1 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 23 Nov 2020 17:16:31 -0800 Subject: [PATCH 200/258] [PatternLang] Remove unnecessary check (#6958) Thanks @mbrookhart --- src/relay/ir/dataflow_matcher.cc | 2 -- tests/python/relay/test_dataflow_pattern.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc index 536e65979ee4..44b87633d208 100644 --- a/src/relay/ir/dataflow_matcher.cc +++ b/src/relay/ir/dataflow_matcher.cc @@ -643,8 +643,6 @@ class PatternGrouper { auto extractor = MatchExtractor(inputs); auto body = extractor.Mutate(expr); - // Verify the pattern still holds - ICHECK(DFPatternMatcher(body).Match(pattern_, body)); group.function = Function(params, body, NullValue(), Array()); group.name = extractor.GetName(); // Check to make sure we aren't overlapping with another group or creating an invalid fusion diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py index 23c0f9366ad9..d4c169bc603e 100644 --- a/tests/python/relay/test_dataflow_pattern.py +++ b/tests/python/relay/test_dataflow_pattern.py @@ -1210,7 +1210,7 @@ def test_partition_overused(): def test_partition_check(): - pattern = is_op("nn.relu")(is_op("nn.conv2d")(wildcard(), wildcard())) + pattern = is_op("nn.relu")(is_op("nn.conv2d")(is_var("input"), wildcard())) def check(pre): return pre.args[0].attrs.data_layout == "NCHW" From 57ba863a783198a7aab69e552aca239b2912f8f8 Mon Sep 
17 00:00:00 2001 From: Cody Yu Date: Mon, 23 Nov 2020 18:17:05 -0800 Subject: [PATCH 201/258] [Bugfix][AutoScheduler] Strictly select impl using plevel (#6956) * [Bugfix][AutoScheduler] Strictly select impl using plevel * lint --- python/tvm/relay/backend/compile_engine.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 28f2ac6d489b..a3108a7f1b41 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -22,6 +22,7 @@ import numpy as np import tvm from tvm import te, autotvm +from tvm.ir.transform import PassContext from tvm.runtime import Object from tvm.support import libinfo from tvm.target import Target @@ -287,7 +288,10 @@ def lower_call(call, inputs, target): env.tracing = False reenable_tracing = True - if not is_dyn: + # check if auto_scheduler is enabled, and use plevel to select the implementation if so + use_auto_scheduler = PassContext.current().config.get("relay.backend.use_auto_scheduler", False) + + if not is_dyn and not use_auto_scheduler: best_impl, outputs = select_implementation(op, call.attrs, inputs, ret_type, target) else: # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes. From 6bee8a0503a66f79d517b5446a9c35ae81acee02 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 23 Nov 2020 18:51:34 -0800 Subject: [PATCH 202/258] [AutoScheduler] Task scheduler callbacks (#6945) * [AutoScheduler] Task scheduler callbacks * docstring * address comments * Delete the explanation of callback in the tutorial * fix Co-authored-by: Lianmin Zheng --- python/tvm/auto_scheduler/task_scheduler.py | 152 +++++++++++++++----- 1 file changed, 120 insertions(+), 32 deletions(-) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index 884741bd08cc..de11fc1b5b11 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -22,7 +22,7 @@ L. Zheng, C. Jia, M. Sun, Z. Wu, C. Yu, et al. "Ansor : Generating High-Performance Tensor Programs for Deep Learning." (OSDI 2020). """ - +import os import time import math import logging @@ -168,6 +168,9 @@ class TaskScheduler: The parameter used for 'gradient' strategy backward_window_size: int = 3 The parameter used for 'gradient' strategy + callbacks: Optional[List[TaskSchedulerCallback]] + The task scheduler callbacks that will be called before and after tuning a task. + If None, then the PrintTableInfo callback will be used. 
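To illustrate the new `callbacks` parameter, a user-defined callback against the `TaskSchedulerCallback` interface (added later in this patch) might look like the following. This is an editor's sketch; the `SimpleLogger` name and the commented-out setup are hypothetical:

```python
from tvm.auto_scheduler.task_scheduler import TaskScheduler, TaskSchedulerCallback

class SimpleLogger(TaskSchedulerCallback):
    """Print a one-line summary after each tuning round."""

    def post_tune(self, task_scheduler, task_id):
        # `ct` is the total measurement counter maintained by the scheduler
        print("tuned task %d, %d measurements so far" % (task_id, task_scheduler.ct))

# Assuming `tasks` and `tune_option` are prepared in the usual way:
# tuner = TaskScheduler(tasks, callbacks=[SimpleLogger()])
# tuner.tune(tune_option)
```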
""" def __init__( @@ -182,6 +185,7 @@ def __init__( beta: float = 2, gamma: float = 0.5, backward_window_size: int = 3, + callbacks=None, ): self.tasks = tasks if objective_func: # use custom objective function @@ -199,6 +203,7 @@ def __init__( self.beta = beta self.gamma = gamma self.backward_window_size = backward_window_size + self.callbacks = callbacks if callbacks is not None else [PrintTableInfo()] assert len(self.tasks) != 0, "No tasks" assert self.strategy in ["round-robin", "gradient"] @@ -374,39 +379,12 @@ def tune(self, tune_option, search_policy="default"): ) break - def _print_table_info(self, next_task_idx): - # table header - _ffi_api.PrintTitle("Task Scheduler") - print("| ID | Latency (ms) | Speed (GFLOPS) | Trials |") - print("-------------------------------------------------") - - # content - for i in range(len(self.tasks)): - id_str = "%d" % i - latency_str = "%.3f" % (1e3 * self.best_costs[i]) if self.best_costs[i] < 1e9 else "-" - speed_str = ( - "%.2f" % (self.tasks[i].compute_dag.flop_ct / self.best_costs[i] / 1e9) - if self.best_costs[i] < 1e9 - else "-" - ) - trials_str = "%d" % (self.task_cts[i] * self.num_measures_per_round) - print("| %4s | %12s | % 14s | %6s |" % (id_str, latency_str, speed_str, trials_str)) - print("-------------------------------------------------") - - # overall info - if all(cost < 1e9 for cost in self.best_costs): - total_latency_str = "%.3f" % (self.cur_score * 1e3) - else: - total_latency_str = "-" - print( - "Estimated total latency: %s ms\tTrials: %d\tUsed time : %.0f s\tNext ID: %d\t" - % (total_latency_str, self.ct, time.time() - self.tic, next_task_idx) - ) - def _tune_task(self, task_idx): """Tune the select task for one round""" - if self.tune_option.verbose >= 1: - self._print_table_info(task_idx) + + # Run pre-tune callbacks + for callback in self.callbacks: + callback.pre_tune(self, task_idx) measure_inputs, measure_results = self.search_policies[task_idx].continue_search_one_round( self.num_measures_per_round, self.measurer @@ -426,6 +404,10 @@ def _tune_task(self, task_idx): self.ct += len(measure_inputs) self.cur_score = self._compute_score(self.best_costs) + # Run post-tune callbacks + for callback in self.callbacks: + callback.post_tune(self, task_idx) + def _compute_score(self, costs): """compute the objective function""" return self.objective_func(costs) @@ -478,3 +460,109 @@ def _restore_status(self, log_file, num_measures_per_round): self.cur_score = self._compute_score(self.best_costs) logger.info("TaskScheduler: Loaded %d measurement records from %s", total_ct + 1, log_file) + + +class TaskSchedulerCallback: + """The base class of task scheduler callback functions. """ + + def pre_tune(self, task_scheduler, task_id): + """The callback before tuning each task. + + Parameters + ---------- + task_scheduler: TaskScheduler + The task scheduler. + task_id: int + The task ID going to be tuned. + """ + # Do nothing by default + + def post_tune(self, task_scheduler, task_id): + """The callback after tuning each task. + + Parameters + ---------- + task_scheduler: TaskScheduler + The task scheduler. + task_id: int + The task ID be tuned. 
+ """ + # Do nothing by default + + +class PrintTableInfo(TaskSchedulerCallback): + """The callback that prints a table of current progress.""" + + def pre_tune(self, task_scheduler, task_id): + if task_scheduler.tune_option.verbose < 1: + return + + _ffi_api.PrintTitle("Task Scheduler") + print("| ID | Latency (ms) | Speed (GFLOPS) | Trials |") + print("-------------------------------------------------") + + # content + for i in range(len(task_scheduler.tasks)): + id_str = "%d" % i + latency_str = ( + "%.3f" % (1e3 * task_scheduler.best_costs[i]) + if task_scheduler.best_costs[i] < 1e9 + else "-" + ) + speed_str = ( + "%.2f" + % (task_scheduler.tasks[i].compute_dag.flop_ct / task_scheduler.best_costs[i] / 1e9) + if task_scheduler.best_costs[i] < 1e9 + else "-" + ) + trials_str = "%d" % (task_scheduler.task_cts[i] * task_scheduler.num_measures_per_round) + print("| %4s | %12s | % 14s | %6s |" % (id_str, latency_str, speed_str, trials_str)) + print("-------------------------------------------------") + + # overall info + if all(cost < 1e9 for cost in task_scheduler.best_costs): + total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3) + else: + total_latency_str = "-" + print( + "Estimated total latency: %s ms\tTrials: %d\tUsed time : %.0f s\tNext ID: %d\t" + % ( + total_latency_str, + task_scheduler.ct, + time.time() - task_scheduler.tic, + task_id, + ) + ) + + +class LogEstimatedLatency(TaskSchedulerCallback): + """Log the estimated latency to the file after tuning a task. + + Parameters + ---------- + log_file: str + The log file path. + """ + + def __init__(self, log_file): + if os.path.exists(log_file):  # Remove existing log + os.remove(log_file) + + self.log_file = log_file + + def post_tune(self, task_scheduler, task_id): + if all(cost < 1e9 for cost in task_scheduler.best_costs): + total_latency_str = "%.3f" % (task_scheduler.cur_score * 1e3) + else: + total_latency_str = "N/A" + + with open(self.log_file, "a") as filep: + filep.write( + "ElapsedTime(s)\t%.0f\tEstimatedLatency(ms)\t%s\tTrials\t%d\n" + % ( + time.time() - task_scheduler.tic, + total_latency_str, + task_scheduler.ct, + ) + ) + filep.flush() From 945e29bad22a2b0303e4034656daf5386dcd055c Mon Sep 17 00:00:00 2001 From: Robert Kimball Date: Mon, 23 Nov 2020 19:49:20 -0800 Subject: [PATCH 203/258] Cleanup comments (#6951) --- src/relay/transforms/merge_compiler_regions.cc | 2 +- src/relay/transforms/partition_graph.cc | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/relay/transforms/merge_compiler_regions.cc b/src/relay/transforms/merge_compiler_regions.cc index c7049bb4ee25..d18c17e63ca1 100644 --- a/src/relay/transforms/merge_compiler_regions.cc +++ b/src/relay/transforms/merge_compiler_regions.cc @@ -17,7 +17,7 @@ * under the License. */ -/* +/*! * \file src/relay/transforms/merge_compiler_regions.cc * * \brief After operators have been annotated with the targets that support diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index 75bc46387cc6..7508d4437c18 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -17,7 +17,7 @@ * under the License. */ -/* +/*! * \file src/relay/transforms/partition_graph.cc * * \brief Partition an input function into multiple functions based @@ -81,19 +81,19 @@ struct RegionFuncMetadata { * a compiler attribute so that it will be handled by any compilers that are not * in the TVM stack. 
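The annotation scheme this comment describes can be exercised directly from Python; a minimal sketch follows (editor's illustration only; "mycompiler" is a placeholder target name):

```python
import tvm
from tvm import relay
from tvm.relay.op.annotation import compiler_begin, compiler_end

x = relay.var("x", shape=(10,))
y = relay.var("y", shape=(10,))
# Wrap a single add in a compiler_begin/compiler_end region
z = compiler_end(
    relay.add(compiler_begin(x, "mycompiler"), compiler_begin(y, "mycompiler")),
    "mycompiler",
)
mod = tvm.IRModule.from_expr(relay.Function([x, y], z))
mod = relay.transform.InferType()(mod)
# Lifts the annotated region into a separate global function
mod = relay.transform.PartitionGraph()(mod)
```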
* - * Input : A Relay module that have functions with disjoint annotated regions + * Input : A Relay module that has functions with disjoint annotated regions * using compiler_begin and compiler_end. There could be multiple - * outputs. + * outputs. * * Output : A Relay module with global functions for such disjoint annotated - * regions with calls inserted at the respective location + * regions with calls inserted at the respective location * * Dependencies : AnnotatedRegionSet Utility class. * * Methodology : * 1) The AnnotatedRegionSet utility class is able to construct a collection - * of nodes that are bound by a given annotation -- here we use - * compiler_begin and compiler_end + * of nodes that are bound by a given annotation -- here we use + * compiler_begin and compiler_end * 2) Initially, for each function in the module RegionSets are populated. * 3) Then, Vistor pass is traversed until a compiler_end node is encountered * that belongs to a "region". From e666d3c2ca98047adac1a99c56368bc1e1adde64 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 24 Nov 2020 03:22:13 -0800 Subject: [PATCH 204/258] [AutoScheduler] Fix task extraction (#6965) * [AutoScheduler] Fix task extraction * fix * fix * trigger CI --- python/tvm/relay/backend/compile_engine.py | 10 ++++++---- python/tvm/relay/op/strategy/cuda.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index a3108a7f1b41..32affe73395c 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -186,6 +186,11 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) best_plevel_impl = max(all_impls, key=lambda x: x.plevel) + # Disable autotvm if auto_scheduler is enabled. + # (i.e., always return the implementation with the highest priority for auto-scheduler). + if PassContext.current().config.get("relay.backend.use_auto_scheduler", False): + use_autotvm = False + # If not use autotvm, always return the implementation with the highest priority if not use_autotvm: logger.info( @@ -288,10 +293,7 @@ def lower_call(call, inputs, target): env.tracing = False reenable_tracing = True - # check if auto_scheduler is enabled, and use pevel to select the implementation if so - use_auto_scheduler = PassContext.current().config.get("relay.backend.use_auto_scheduler", False) - - if not is_dyn and not use_auto_scheduler: + if not is_dyn: best_impl, outputs = select_implementation(op, call.attrs, inputs, ret_type, target) else: # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes. diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index ceaf9ddb84b0..f37fc2a96cd5 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -107,7 +107,7 @@ def naive_schedule(_, outs, target): # For GPU, we at least need thread binding to make a valid schedule. # So the naive schedule cannot be compiled. raise RuntimeError( - "Cannot compile for GPU targets if no tuned schedule is found." + "Cannot compile for GPU targets if no tuned schedule is found. " "Please see the warning messages above for more information about the failed workloads." 
) return tvm.te.create_schedule(outs[-1].op) From dc994ca8b31c2fec68ba161c70064f67ce061679 Mon Sep 17 00:00:00 2001 From: Tom Gall Date: Tue, 24 Nov 2020 08:53:57 -0600 Subject: [PATCH 205/258] Fix #6954 uTVM, fix when building the runtime for native hardware (#6957) * Fix #6954, where building the runtime for native hardware fails because -march= is missing. This fix: 1) adds support for march 2) picks a sensible setting for the F746 Discovery There is an interesting downside to this fix involving scheduling that likely needs discussion. In the microcontroller world we really should be setting ex: -march=armv7e-m depending on which Cortex-M is being used. -mcpu isn't as important when it comes to a command line compiler. Signed-off-by: Tom Gall * Fix #6954, where building the runtime for native hardware fails because -march= is missing. This fix: 1) adds support for march 2) picks a sensible setting for the F746 Discovery There is an interesting downside to this fix involving scheduling that likely needs discussion. In the microcontroller world we really should be setting ex: -march=armv7e-m depending on which Cortex-M is being used. -mcpu isn't as important when it comes to a command line compiler. Signed-off-by: Tom Gall --- python/tvm/micro/compiler.py | 4 +++- python/tvm/target/target.py | 2 +- src/target/target_kind.cc | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/tvm/micro/compiler.py b/python/tvm/micro/compiler.py index a265f2a3f7ac..3b62e9347c7f 100644 --- a/python/tvm/micro/compiler.py +++ b/python/tvm/micro/compiler.py @@ -139,9 +139,11 @@ def _defaults_from_target(self, target): opts = [] # TODO use march for arm(https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html)? if target.attrs.get("mcpu"): - opts.append(f'-march={target.attrs["mcpu"]}') + opts.append(f'-mcpu={target.attrs["mcpu"]}') if target.attrs.get("mfpu"): opts.append(f'-mfpu={target.attrs["mfpu"]}') + if target.attrs.get("march"): + opts.append(f'-march={target.attrs["march"]}') return opts diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index ba4a1a2f744e..c919fc31e9aa 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -233,7 +233,7 @@ def micro(model="unknown", options=None): """ trans_table = { "host": ["-mcpu=native"], - "stm32f746xx": ["-mcpu=cortex-m7"], + "stm32f746xx": ["-mcpu=cortex-m7", "-march=armv7e-m"], } opts = _merge_opts( trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], options diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 017ba396f861..6bef8b3c5cd7 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -219,6 +219,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("system-lib") .add_attr_option("runtime") .add_attr_option("mcpu") + .add_attr_option("march") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("cuda", kDLGPU) From 955412cadfba68eacf0be58aefad7379edcc4a29 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 24 Nov 2020 12:28:36 -0500 Subject: [PATCH 206/258] [DOCS] Update to reflect the repo name change (#6967) --- CONTRIBUTORS.md | 2 +- README.md | 4 +- apps/android_deploy/README.md | 4 +- apps/android_rpc/README.md | 6 +- apps/benchmark/README.md | 4 +- .../reference-vm/zephyr/pyproject.toml | 2 +- apps/wasm-standalone/wasm-graph/Cargo.toml | 2 +- apps/wasm-standalone/wasm-runtime/Cargo.toml | 2 +- docker/Dockerfile.demo_android | 2 +- docker/Dockerfile.demo_opencl | 2 +- docker/install/install_tvm_cpu.sh | 2 +- 
docker/install/install_tvm_gpu.sh | 2 +- docs/conf.py | 10 +- docs/contribute/community.rst | 2 +- docs/contribute/document.rst | 4 +- docs/contribute/release_process.rst | 49 +++---- docs/deploy/android.rst | 4 +- docs/deploy/cpp_deploy.rst | 10 +- docs/deploy/index.rst | 2 +- docs/deploy/vitis_ai.rst | 128 +++++++++--------- docs/dev/convert_layout.rst | 4 +- docs/dev/frontend/tensorflow.rst | 4 +- docs/dev/inferbound.rst | 8 +- docs/dev/pass_infra.rst | 20 +-- docs/dev/relay_add_pass.rst | 6 +- docs/dev/relay_bring_your_own_codegen.rst | 2 +- docs/dev/runtime.rst | 22 +-- docs/dev/virtual_machine.rst | 16 +-- docs/install/docker.rst | 4 +- docs/install/from_source.rst | 2 +- docs/install/nnpack.rst | 2 +- docs/langref/relay_adt.rst | 2 +- docs/langref/relay_pattern.rst | 2 +- docs/vta/install.rst | 4 +- jvm/README.md | 2 +- jvm/pom.xml | 8 +- python/setup.py | 2 +- python/tvm/relay/qnn/op/legalizations.py | 2 +- python/tvm/topi/x86/conv2d.py | 2 +- python/tvm/topi/x86/conv2d_avx_1x1.py | 2 +- rust/tvm-graph-rt/Cargo.toml | 2 +- rust/tvm-macros/Cargo.toml | 2 +- rust/tvm-rt/Cargo.toml | 4 +- rust/tvm-rt/README.md | 2 +- rust/tvm-rt/src/lib.rs | 2 +- rust/tvm-sys/src/context.rs | 2 +- rust/tvm/Cargo.toml | 4 +- rust/tvm/README.md | 4 +- rust/tvm/src/lib.rs | 2 +- src/parser/tokenizer.h | 2 +- tests/python/frontend/tflite/test_forward.py | 2 +- tests/python/relay/test_op_level2.py | 2 +- .../python/test_topi_conv2d_nhwc_pack_int8.py | 2 +- tests/python/topi/python/test_topi_vision.py | 2 +- .../unittest/test_autotvm_graph_tuner_core.py | 2 +- .../test_autotvm_graph_tuner_utils.py | 2 +- tutorials/autotvm/tune_relay_arm.py | 4 +- tutorials/autotvm/tune_relay_cuda.py | 2 +- tutorials/autotvm/tune_relay_mobile_gpu.py | 4 +- tutorials/dev/bring_your_own_datatypes.py | 4 +- tutorials/dev/use_pass_infra.py | 2 +- tutorials/frontend/deploy_model_on_android.py | 6 +- tutorials/frontend/deploy_model_on_rasp.py | 4 +- tutorials/frontend/from_mxnet.py | 2 +- .../get_started/cross_compilation_and_rpc.py | 2 +- web/README.md | 4 +- 66 files changed, 209 insertions(+), 222 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 5f01340f095d..650d1bc40e6d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -139,7 +139,7 @@ We do encourage everyone to work anything they are interested in. - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy ## List of Contributors -- [Full List of Contributors](https://github.com/apache/incubator-tvm/graphs/contributors) +- [Full List of Contributors](https://github.com/apache/tvm/graphs/contributors) - To contributors: please add your name to the list. 
- [Qiao Zhang](https://github.com/zhangqiaorjc) - [Haolong Zhang](https://github.com/haolongzhangm) diff --git a/README.md b/README.md index 6c82b1585c45..b3a3e850adb2 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ - Open Deep Learning Compiler Stack + Open Deep Learning Compiler Stack ============================================== [Documentation](https://tvm.apache.org/docs) | [Contributors](CONTRIBUTORS.md) | @@ -23,7 +23,7 @@ [Release Notes](NEWS.md) [![Build Status](https://ci.tlcpack.ai/buildStatus/icon?job=tvm/main)](https://ci.tlcpack.ai/job/tvm/job/main/) -[![WinMacBuild](https://github.com/apache/incubator-tvm/workflows/WinMacBuild/badge.svg)](https://github.com/apache/incubator-tvm/actions?query=workflow%3AWinMacBuild) +[![WinMacBuild](https://github.com/apache/tvm/workflows/WinMacBuild/badge.svg)](https://github.com/apache/tvm/actions?query=workflow%3AWinMacBuild) Apache TVM (incubating) is a compiler stack for deep learning systems. It is designed to close the gap between the productivity-focused deep learning frameworks, and the performance- and efficiency-focused hardware backends. diff --git a/apps/android_deploy/README.md b/apps/android_deploy/README.md index d5efba88b901..32e601840f04 100644 --- a/apps/android_deploy/README.md +++ b/apps/android_deploy/README.md @@ -34,7 +34,7 @@ Alternatively, you may execute Docker image we provide which contains the requir ### Build APK -Before you build the Android application, please refer to [TVM4J Installation Guide](https://github.com/apache/incubator-tvm/blob/main/jvm/README.md) and install tvm4j-core to your local maven repository. You can find tvm4j dependency declare in `app/build.gradle`. Modify it if it is necessary. +Before you build the Android application, please refer to [TVM4J Installation Guide](https://github.com/apache/tvm/blob/main/jvm/README.md) and install tvm4j-core to your local maven repository. You can find tvm4j dependency declare in `app/build.gradle`. Modify it if it is necessary. ``` dependencies { @@ -124,7 +124,7 @@ If everything goes well, you will find compile tools in `/opt/android-toolchain- Follow instruction to get compiled version model for android target [here.](https://tvm.apache.org/docs/deploy/android.html) -Copied these compiled model deploy_lib.so, deploy_graph.json and deploy_param.params to apps/android_deploy/app/src/main/assets/ and modify TVM flavor changes on [java](https://github.com/apache/incubator-tvm/blob/main/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java#L81) +Copied these compiled model deploy_lib.so, deploy_graph.json and deploy_param.params to apps/android_deploy/app/src/main/assets/ and modify TVM flavor changes on [java](https://github.com/apache/tvm/blob/main/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java#L81) `CPU Verison flavor` ``` diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md index 29962d329165..c5e21ecbbc12 100644 --- a/apps/android_rpc/README.md +++ b/apps/android_rpc/README.md @@ -28,7 +28,7 @@ You will need JDK, [Android NDK](https://developer.android.com/ndk) and an Andro We use [Gradle](https://gradle.org) to build. Please follow [the installation instruction](https://gradle.org/install) for your operating system. -Before you build the Android application, please refer to [TVM4J Installation Guide](https://github.com/apache/incubator-tvm/blob/main/jvm/README.md) and install tvm4j-core to your local maven repository. 
You can find tvm4j dependency declare in `app/build.gradle`. Modify it if it is necessary. +Before you build the Android application, please refer to [TVM4J Installation Guide](https://github.com/apache/tvm/blob/main/jvm/README.md) and install tvm4j-core to your local maven repository. You can find tvm4j dependency declare in `app/build.gradle`. Modify it if it is necessary. ``` dependencies { @@ -146,7 +146,7 @@ android 1 1 0 ``` -Then checkout [android\_rpc/tests/android\_rpc\_test.py](https://github.com/apache/incubator-tvm/blob/main/apps/android_rpc/tests/android_rpc_test.py) and run, +Then checkout [android\_rpc/tests/android\_rpc\_test.py](https://github.com/apache/tvm/blob/main/apps/android_rpc/tests/android_rpc_test.py) and run, ```bash # Specify the RPC tracker @@ -157,7 +157,7 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++ python android_rpc_test.py ``` -This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify compiled TVM IR shared libraries on OpenCL target set `'test_opencl = True'` and on Vulkan target set `'test_vulkan = True'` in [tests/android_rpc_test.py](https://github.com/apache/incubator-tvm/blob/main/apps/android_rpc/tests/android_rpc_test.py), by default on CPU target will execute. +This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify compiled TVM IR shared libraries on OpenCL target set `'test_opencl = True'` and on Vulkan target set `'test_vulkan = True'` in [tests/android_rpc_test.py](https://github.com/apache/tvm/blob/main/apps/android_rpc/tests/android_rpc_test.py), by default on CPU target will execute. On my test device, it gives following results. ```bash diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md index 920033f755ea..43d93d9e00fa 100644 --- a/apps/benchmark/README.md +++ b/apps/benchmark/README.md @@ -20,7 +20,7 @@ ## Results -See results on wiki page https://github.com/apache/incubator-tvm/wiki/Benchmark +See results on wiki page https://github.com/apache/tvm/wiki/Benchmark ## How to Reproduce @@ -78,7 +78,7 @@ python3 -m tvm.exec.rpc_tracker `python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker. * For Android device - * Build and install tvm RPC apk on your device [Help](https://github.com/apache/incubator-tvm/tree/main/apps/android_rpc). + * Build and install tvm RPC apk on your device [Help](https://github.com/apache/tvm/tree/main/apps/android_rpc). Make sure you can pass the android rpc test. Then you have alreadly known how to register. 3. 
Verify the device registration diff --git a/apps/microtvm/reference-vm/zephyr/pyproject.toml b/apps/microtvm/reference-vm/zephyr/pyproject.toml index d273b25eb3cd..f1c15ee5c890 100644 --- a/apps/microtvm/reference-vm/zephyr/pyproject.toml +++ b/apps/microtvm/reference-vm/zephyr/pyproject.toml @@ -47,7 +47,7 @@ exclude = ''' ) ''' [tool.poetry] -name = "incubator-tvm" +name = "tvm" version = "0.1.0" description = "" authors = ["Your Name "] diff --git a/apps/wasm-standalone/wasm-graph/Cargo.toml b/apps/wasm-standalone/wasm-graph/Cargo.toml index 9cdc8f599579..cea491b2f128 100644 --- a/apps/wasm-standalone/wasm-graph/Cargo.toml +++ b/apps/wasm-standalone/wasm-graph/Cargo.toml @@ -22,7 +22,7 @@ authors = ["TVM Contributors"] edition = "2018" description = "WebAssembly graph to deep learning frameworks using TVM" readme = "README.md" -repository = "https://github.com/apache/incubator-tvm" +repository = "https://github.com/apache/tvm" license = "Apache-2.0" keywords = ["wasm", "machine learning", "tvm"] diff --git a/apps/wasm-standalone/wasm-runtime/Cargo.toml b/apps/wasm-standalone/wasm-runtime/Cargo.toml index db00a55c31b5..99f6db54431f 100644 --- a/apps/wasm-standalone/wasm-runtime/Cargo.toml +++ b/apps/wasm-standalone/wasm-runtime/Cargo.toml @@ -21,7 +21,7 @@ version = "0.1.0" authors = ["TVM Contributors"] edition = "2018" description = "WebAssembly runtime to deep learning frameworks using wasmtime" -repository = "https://github.com/apache/incubator-tvm" +repository = "https://github.com/apache/tvm" license = "Apache-2.0" keywords = ["wasm", "machine learning", "wasmtime"] diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android index cf13daa9734e..039439a937e9 100644 --- a/docker/Dockerfile.demo_android +++ b/docker/Dockerfile.demo_android @@ -53,7 +53,7 @@ RUN git clone https://github.com/KhronosGroup/OpenCL-Headers /usr/local/OpenCL-H # Build TVM RUN cd /usr && \ - git clone --depth=1 https://github.com/apache/incubator-tvm tvm --recursive && \ + git clone --depth=1 https://github.com/apache/tvm tvm --recursive && \ cd /usr/tvm && \ mkdir -p build && \ cd build && \ diff --git a/docker/Dockerfile.demo_opencl b/docker/Dockerfile.demo_opencl index e39ee4128c96..2f534d8b5b5c 100644 --- a/docker/Dockerfile.demo_opencl +++ b/docker/Dockerfile.demo_opencl @@ -62,7 +62,7 @@ RUN echo "Cloning TVM source & submodules" ENV TVM_PAR_DIR="/usr" RUN mkdir -p TVM_PAR_DIR && \ cd ${TVM_PAR_DIR} && \ - git clone --depth=1 https://github.com/apache/incubator-tvm tvm --recursive + git clone --depth=1 https://github.com/apache/tvm tvm --recursive #RUN git submodule update --init --recursive diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh index b11c9791fb2d..c3a15fa26b6d 100755 --- a/docker/install/install_tvm_cpu.sh +++ b/docker/install/install_tvm_cpu.sh @@ -21,7 +21,7 @@ set -u set -o pipefail cd /usr -git clone https://github.com/apache/incubator-tvm tvm --recursive +git clone https://github.com/apache/tvm tvm --recursive cd /usr/tvm # checkout a hash-tag git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470 diff --git a/docker/install/install_tvm_gpu.sh b/docker/install/install_tvm_gpu.sh index 2dbf8e17398d..fe2214da8409 100755 --- a/docker/install/install_tvm_gpu.sh +++ b/docker/install/install_tvm_gpu.sh @@ -21,7 +21,7 @@ set -u set -o pipefail cd /usr -git clone https://github.com/apache/incubator-tvm tvm --recursive +git clone https://github.com/apache/tvm tvm --recursive cd /usr/tvm # checkout a hash-tag 
git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470 diff --git a/docs/conf.py b/docs/conf.py index e3ddae214e10..32bc095272aa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,7 +48,7 @@ project = "tvm" author = "Apache Software Foundation" copyright = "2020, %s" % author -github_doc_root = "https://github.com/apache/incubator-tvm/tree/main/docs/" +github_doc_root = "https://github.com/apache/tvm/tree/main/docs/" os.environ["TVM_BUILD_DOC"] = "1" # Version information. @@ -309,12 +309,6 @@ def __call__(self, filename): footer_copyright = "© 2020 Apache Software Foundation | All right reserved" footer_note = " ".join( """ -Apache TVM is an effort undergoing incubation at The Apache Software Foundation (ASF), -sponsored by the Apache Incubator. Incubation is required of all newly accepted projects -until a further review indicates that the infrastructure, communications, and decision making -process have stabilized in a manner consistent with other successful ASF projects. While -incubation status is not necessarily a reflection of the completeness or stability of the code, -it does indicate that the project has yet to be fully endorsed by the ASF. Copyright © 2020 The Apache Software Foundation. Apache TVM, Apache, the Apache feather, and the Apache TVM project logo are either trademarks or registered trademarks of the Apache Software Foundation.""".split( @@ -332,7 +326,7 @@ def __call__(self, filename): ("Blog", "https://tvm.apache.org/blog"), ("Docs", "https://tvm.apache.org/docs"), ("Conference", "https://tvmconf.org"), - ("Github", "https://github.com/apache/incubator-tvm/"), + ("Github", "https://github.com/apache/tvm/"), ] header_dropdown = { diff --git a/docs/contribute/community.rst b/docs/contribute/community.rst index fd6df0f991bd..8867202a674c 100644 --- a/docs/contribute/community.rst +++ b/docs/contribute/community.rst @@ -20,7 +20,7 @@ TVM Community Guideline ======================= -TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. See `CONTRIBUTORS.md `_ for the current list of contributors. +TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. See `CONTRIBUTORS.md `_ for the current list of contributors. diff --git a/docs/contribute/document.rst b/docs/contribute/document.rst index 1bfab1e1c061..3652a2891b37 100644 --- a/docs/contribute/document.rst +++ b/docs/contribute/document.rst @@ -68,7 +68,7 @@ Be careful to leave blank lines between sections of your documents. In the above case, there has to be a blank line before `Parameters`, `Returns` and `Examples` in order for the doc to be built correctly. To add a new function to the doc, we need to add the `sphinx.autodoc `_ -rules to the `docs/api/python `_). +rules to the `docs/api/python `_). You can refer to the existing files under this folder on how to add the functions. @@ -96,7 +96,7 @@ to add comments about code logics to improve readability. Write Tutorials --------------- We use the `sphinx-gallery `_ to build python tutorials. -You can find the source code under `tutorials `_ quite self explanatory. +You can find the source code under `tutorials `_ quite self explanatory. 
One thing that worth noting is that the comment blocks are written in reStructuredText instead of markdown so be aware of the syntax. The tutorial code will run on our build server to generate the document page. diff --git a/docs/contribute/release_process.rst b/docs/contribute/release_process.rst index 0f1e5151f5a9..f330a7ddd3e6 100644 --- a/docs/contribute/release_process.rst +++ b/docs/contribute/release_process.rst @@ -17,8 +17,8 @@ .. _release_process: -Apache TVM (incubating) Release Process -======================================= +Apache TVM Release Process +========================== The release manager role in TVM means you are responsible for a few different things: @@ -64,13 +64,13 @@ The last step is to update the KEYS file with your code signing key https://www. .. code-block:: bash # the --depth=files will avoid checkout existing folders - svn co --depth=files "https://dist.apache.org/repos/dist/dev/incubator/tvm" svn-tvm + svn co --depth=files "https://dist.apache.org/repos/dist/dev/tvm" svn-tvm cd svn-tvm # edit KEYS file svn ci --username $ASF_USERNAME --password "$ASF_PASSWORD" -m "Update KEYS" # update downloads.apache.org - svn rm --username $ASF_USERNAME --password "$ASF_PASSWORD" https://dist.apache.org/repos/dist/release/incubator/tvm/KEYS -m "Update KEYS" - svn cp --username $ASF_USERNAME --password "$ASF_PASSWORD" https://dist.apache.org/repos/dist/dev/incubator/tvm/KEYS https://dist.apache.org/repos/dist/release/incubator/tvm/ -m "Update KEYS" + svn rm --username $ASF_USERNAME --password "$ASF_PASSWORD" https://dist.apache.org/repos/dist/release/tvm/KEYS -m "Update KEYS" + svn cp --username $ASF_USERNAME --password "$ASF_PASSWORD" https://dist.apache.org/repos/dist/dev/tvm/KEYS https://dist.apache.org/repos/dist/release/tvm/ -m "Update KEYS" Cut a Release Candidate @@ -80,8 +80,8 @@ To cut a release candidate, one needs to first cut a branch using selected versi .. code-block:: bash - git clone https://github.com/apache/incubator-tvm.git - cd incubator-tvm/ + git clone https://github.com/apache/tvm.git + cd tvm/ git branch v0.6.0 git push --set-upstream origin v0.6.0 @@ -107,8 +107,8 @@ Create source code artifacts, .. code-block:: bash - git clone git@github.com:apache/incubator-tvm.git apache-tvm-src-v0.6.0.rc0-incubating - cd apache-tvm-src-v0.6.0.rc0-incubating + git clone git@github.com:apache/tvm.git apache-tvm-src-v0.6.0.rc0 + cd apache-tvm-src-v0.6.0.rc0 git checkout v0.6 git submodule update --init --recursive git checkout v0.6.0.rc0 @@ -116,7 +116,7 @@ Create source code artifacts, find . -name ".git*" -print0 | xargs -0 rm -rf cd .. brew install gnu-tar - gtar -czvf apache-tvm-src-v0.6.0.rc0-incubating.tar.gz apache-tvm-src-v0.6.0.rc0-incubating + gtar -czvf apache-tvm-src-v0.6.0.rc0.tar.gz apache-tvm-src-v0.6.0.rc0 Use your GPG key to sign the created artifact. First make sure your GPG is set to use the correct private key, @@ -129,8 +129,8 @@ Create GPG signature as well as the hash of the file, .. 
code-block:: bash - gpg --armor --output apache-tvm-src-v0.6.0.rc0-incubating.tar.gz.asc --detach-sig apache-tvm-src-v0.6.0.rc0-incubating.tar.gz - shasum -a 512 apache-tvm-src-v0.6.0.rc0-incubating.tar.gz > apache-tvm-src-v0.6.0.rc0-incubating.tar.gz.sha512 + gpg --armor --output apache-tvm-src-v0.6.0.rc0.tar.gz.asc --detach-sig apache-tvm-src-v0.6.0.rc0.tar.gz + shasum -a 512 apache-tvm-src-v0.6.0.rc0.tar.gz > apache-tvm-src-v0.6.0.rc0.tar.gz.sha512 Upload the Release Candidate @@ -143,7 +143,7 @@ The release manager also needs to upload the artifacts to ASF SVN, .. code-block:: bash # the --depth=files will avoid checkout existing folders - svn co --depth=files "https://dist.apache.org/repos/dist/dev/incubator/tvm" svn-tvm + svn co --depth=files "https://dist.apache.org/repos/dist/dev/tvm" svn-tvm cd svn-tvm mkdir tvm-v0.6.0-rc0 # copy files into it @@ -154,9 +154,7 @@ The release manager also needs to upload the artifacts to ASF SVN, Call a Vote on the Release Candidate ------------------------------------ -As an incubator project, it requires voting on both dev@ and general@. - -The first voting takes place on the Apache TVM (incubator) developers list (dev@tvm.apache.org). To get more attention, one can create a github issue start with "[VOTE]" instead, it will be mirrored to dev@ automatically. Look at past voting threads to see how this proceeds. The email should follow this format. +The first voting takes place on the Apache TVM developers list (dev@tvm.apache.org). To get more attention, one can create a github issue start with "[VOTE]" instead, it will be mirrored to dev@ automatically. Look at past voting threads to see how this proceeds. The email should follow this format. - Provide the link to the draft of the release notes in the email - Provide the link to the release candidate artifacts @@ -164,14 +162,9 @@ The first voting takes place on the Apache TVM (incubator) developers list (dev@ For the dev@ vote, there must be at least 3 binding +1 votes and more +1 votes than -1 votes. Once the vote is done, you should also send out a summary email with the totals, with a subject that looks something like [VOTE][RESULT] .... -The voting then moves onto the general@incubator.apache.org. Anyone can contribute a vote, but only "Incubator PMC" (IPMC) votes are binding. -To pass, there must be 3 binding +1 votes and more +1 votes than -1 votes. - In ASF, votes are open "at least" 72hrs (3 days). If you don't get enough number of binding votes within that time, you cannot close the voting deadline. You need to extend it. -Same as the one on dev@, send out a summary email to general@ once the vote passes. - -If either voting fails, the community needs to modified the release accordingly, create a new release candidate and re-run the voting process. +If the voting fails, the community needs to modified the release accordingly, create a new release candidate and re-run the voting process. Post the Release @@ -182,12 +175,12 @@ After the vote passes, to upload the binaries to Apache mirrors, you move the bi .. 
code-block:: bash export SVN_EDITOR=vim - svn mkdir https://dist.apache.org/repos/dist/release/incubator/tvm - svn mv https://dist.apache.org/repos/dist/dev/incubator/tvm/tvm-v0.6.0-rc2 https://dist.apache.org/repos/dist/release/incubator/tvm/tvm-v0.6.0 + svn mkdir https://dist.apache.org/repos/dist/release/tvm + svn mv https://dist.apache.org/repos/dist/dev/tvm/tvm-v0.6.0-rc2 https://dist.apache.org/repos/dist/release/tvm/tvm-v0.6.0 # If you've added your signing key to the KEYS file, also update the release copy. - svn co --depth=files "https://dist.apache.org/repos/dist/release/incubator/tvm" svn-tvm - curl "https://dist.apache.org/repos/dist/dev/incubator/tvm/KEYS" > svn-tvm/KEYS + svn co --depth=files "https://dist.apache.org/repos/dist/release/tvm" svn-tvm + curl "https://dist.apache.org/repos/dist/dev/tvm/KEYS" > svn-tvm/KEYS (cd svn-tvm && svn ci --username $ASF_USERNAME --password "$ASF_PASSWORD" -m"Update KEYS") Remember to create a new release TAG (v0.6.0 in this case) on Github and remove the pre-release candidate TAG. @@ -200,10 +193,10 @@ Remember to create a new release TAG (v0.6.0 in this case) on Github and remove Update the TVM Website ---------------------- -The website repository is located at `https://github.com/apache/incubator-tvm-site `_. Modify the download page to include the release artifacts as well as the GPG signature and SHA hash. +The website repository is located at `https://github.com/apache/tvm-site `_. Modify the download page to include the release artifacts as well as the GPG signature and SHA hash. Post the Announcement --------------------- -Send out an announcement email to general@incubator.apache.org, announce@apache.org, and dev@tvm.apache.org. The announcement should include the link to release note and download page. +Send out an announcement email to announce@apache.org, and dev@tvm.apache.org. The announcement should include the link to release note and download page. diff --git a/docs/deploy/android.rst b/docs/deploy/android.rst index e28eef383164..8c8fcfb49679 100644 --- a/docs/deploy/android.rst +++ b/docs/deploy/android.rst @@ -38,5 +38,5 @@ deploy_lib.so, deploy_graph.json, deploy_param.params will go to android target. TVM Runtime for Android Target ------------------------------ -Refer `here `_ to build CPU/OpenCL version flavor TVM runtime for android target. -From android java TVM API to load model & execute can be referred at this `java `_ sample source. +Refer `here `_ to build CPU/OpenCL version flavor TVM runtime for android target. +From android java TVM API to load model & execute can be referred at this `java `_ sample source. diff --git a/docs/deploy/cpp_deploy.rst b/docs/deploy/cpp_deploy.rst index f3de69db2d1c..44df1e55b58e 100644 --- a/docs/deploy/cpp_deploy.rst +++ b/docs/deploy/cpp_deploy.rst @@ -19,7 +19,7 @@ Deploy TVM Module using C++ API =============================== -We provide an example on how to deploy TVM modules in `apps/howto_deploy `_ +We provide an example on how to deploy TVM modules in `apps/howto_deploy `_ To run the example, you can use the following command @@ -38,17 +38,17 @@ TVM provides a minimum runtime, which costs around 300K to 600K depending on how In most cases, we can use ``libtvm_runtime.so`` that comes with the build. If somehow you find it is hard to build ``libtvm_runtime``, checkout -`tvm_runtime_pack.cc `_. +`tvm_runtime_pack.cc `_. It is an example all in one file that gives you TVM runtime. You can compile this file using your build system and include this into your project. 
-You can also checkout `apps `_ for example applications build with TVM on iOS, Android and others. +You can also checkout `apps `_ for example applications build with TVM on iOS, Android and others. Dynamic Library vs. System Module --------------------------------- TVM provides two ways to use the compiled library. -You can checkout `prepare_test_libs.py `_ -on how to generate the library and `cpp_deploy.cc `_ on how to use them. +You can checkout `prepare_test_libs.py `_ +on how to generate the library and `cpp_deploy.cc `_ on how to use them. - Store library as a shared library and dynamically load the library into your project. - Bundle the compiled library into your project in system module mode. diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst index e47b0a3c72fe..2b37f734c3c3 100644 --- a/docs/deploy/index.rst +++ b/docs/deploy/index.rst @@ -38,7 +38,7 @@ on a Linux based embedded system such as Raspberry Pi: .. code:: bash - git clone --recursive https://github.com/apache/incubator-tvm tvm + git clone --recursive https://github.com/apache/tvm tvm cd tvm mkdir build cp cmake/config.cmake build diff --git a/docs/deploy/vitis_ai.rst b/docs/deploy/vitis_ai.rst index f0bd3edcd6e2..df29f16f9d8d 100755 --- a/docs/deploy/vitis_ai.rst +++ b/docs/deploy/vitis_ai.rst @@ -101,11 +101,11 @@ Hardware setup and docker build .. code:: bash git clone --recurse-submodules https://github.com/Xilinx/Vitis-AI - + 2. Install Docker, and add the user to the docker group. Link the user to docker installation instructions from the following docker's website: - + - https://docs.docker.com/install/linux/docker-ce/ubuntu/ - https://docs.docker.com/install/linux/docker-ce/centos/ @@ -114,11 +114,11 @@ Hardware setup and docker build 3. Download the latest Vitis AI Docker with the following command. This container runs on CPU. .. code:: bash - + docker pull xilinx/vitis-ai:latest - + To accelerate the quantization, you can optionally use the Vitis-AI GPU docker image. Use the below commands to build the Vitis-AI GPU docker container: - + .. code:: bash cd Vitis-AI/docker @@ -141,32 +141,32 @@ Hardware setup and docker build - Run the following commands: .. code:: bash - + cd Vitis-AI/alveo/packages sudo su ./install.sh - + - Power cycle the system. - + 5. Clone tvm repo and pyxir repo .. code:: bash - - git clone --recursive https://github.com/apache/incubator-tvm.git + + git clone --recursive https://github.com/apache/tvm.git git clone --recursive https://github.com/Xilinx/pyxir.git - + 6. Build and start the tvm runtime Vitis-AI Docker Container. .. code:: bash - ./incubator-tvm/docker/build.sh demo_vitis_ai bash - ./incubator-tvm/docker/bash.sh tvm.demo_vitis_ai - + ./tvm/docker/build.sh demo_vitis_ai bash + ./tvm/docker/bash.sh tvm.demo_vitis_ai + #Setup inside container source /opt/xilinx/xrt/setup.sh . $VAI_ROOT/conda/etc/profile.d/conda.sh conda activate vitis-ai-tensorflow - + 7. Install PyXIR .. code:: bash @@ -174,27 +174,27 @@ Hardware setup and docker build cd pyxir python3 setup.py install --use_vai_rt_dpucadx8g --user - + 8. Build TVM inside the container with Vitis-AI .. code:: bash - cd incubator-tvm + cd tvm mkdir build cp cmake/config.cmake build - cd build + cd build echo set\(USE_LLVM ON\) >> config.cmake echo set\(USE_VITIS_AI ON\) >> config.cmake cmake .. make -j$(nproc) - + 9. Install TVM .. code:: bash - cd incubator-tvm/python + cd tvm/python pip3 install -e . --user - + Edge (DPUCZDX8G) ^^^^^^^^^^^^^^^^ @@ -238,19 +238,19 @@ Host setup and docker build .. 
code:: bash - git clone --recursive https://github.com/apache/incubator-tvm.git + git clone --recursive https://github.com/apache/tvm.git 2. Build and start the tvm runtime Vitis-AI Docker Container. .. code:: bash - cd incubator-tvm - ./incubator-tvm/docker/build.sh demo_vitis_ai bash - ./incubator-tvm/docker/bash.sh tvm.demo_vitis_ai - + cd tvm + ./tvm/docker/build.sh demo_vitis_ai bash + ./tvm/docker/bash.sh tvm.demo_vitis_ai + #Setup inside container . $VAI_ROOT/conda/etc/profile.d/conda.sh conda activate vitis-ai-tensorflow - + 3. Install PyXIR .. code:: bash @@ -258,13 +258,13 @@ Host setup and docker build git clone --recursive https://github.com/Xilinx/pyxir.git cd pyxir python3 setup.py install --user - - + + 4. Build TVM inside the container with Vitis-AI. .. code:: bash - cd incubator-tvm + cd tvm mkdir build cp cmake/config.cmake build cd build @@ -272,12 +272,12 @@ Host setup and docker build echo set\(USE_VITIS_AI ON\) >> config.cmake cmake .. make -j$(nproc) - + 5. Install TVM .. code:: bash - cd incubator-tvm/python + cd tvm/python pip3 install -e . --user Edge requirements @@ -299,10 +299,10 @@ platform. The following development boards can be used out-of-the-box: Edge hardware setup ^^^^^^^^^^^^^^^^^^^ -.. note:: +.. note:: - This section provides instructions for setting up with the `Pynq `__ platform but - Petalinux based flows are also supported. + This section provides instructions for setting up with the `Pynq `__ platform but + Petalinux based flows are also supported. 1. Download the Pynq v2.5 image for your target (use Z1 or Z2 for Ultra96 target depending on board version) Link to image: @@ -318,7 +318,7 @@ Edge hardware setup .. code:: bash python3 -c 'from pynq_dpu import DpuOverlay ; overlay = DpuOverlay("dpu.bit")' - + 6. Check whether the DPU kernel is alive: .. code:: bash @@ -328,10 +328,10 @@ Edge hardware setup Edge TVM setup ^^^^^^^^^^^^^^ -.. note:: +.. note:: - When working on Petalinux instead of Pynq, the following steps might take more manual work (e.g building - hdf5 from source). Also, TVM has a scipy dependency which you then might have to build from source or + When working on Petalinux instead of Pynq, the following steps might take more manual work (e.g building + hdf5 from source). Also, TVM has a scipy dependency which you then might have to build from source or circumvent. We don't depend on scipy in our flow. Building TVM depends on the Xilinx @@ -344,7 +344,7 @@ interface between TVM and Vitis-AI tools. apt-get install libhdf5-dev pip3 install pydot h5py - + 2. Install PyXIR .. code:: bash @@ -352,25 +352,25 @@ interface between TVM and Vitis-AI tools. git clone --recursive https://github.com/Xilinx/pyxir.git cd pyxir sudo python3 setup.py install --use_vai_rt_dpuczdx8g - + 3. Build TVM with Vitis-AI .. code:: bash - git clone --recursive https://github.com/apache/incubator-tvm - cd incubator-tvm + git clone --recursive https://github.com/apache/tvm + cd tvm mkdir build cp cmake/config.cmake build cd build echo set\(USE_VITIS_AI ON\) >> config.cmake - cmake .. + cmake .. make - + 4. Install TVM .. code:: bash - cd incubator-tvm/python + cd tvm/python pip3 install -e . --user 5. Check whether the setup was successful in the Python shell: @@ -467,7 +467,7 @@ build call. 
tvm_target = 'llvm' target='DPUCADX8G' - with tvm.transform.PassContext(opt_level=3, config= {'relay.ext.vitis_ai.options.target': target}): + with tvm.transform.PassContext(opt_level=3, config= {'relay.ext.vitis_ai.options.target': target}): lib = relay.build(mod, tvm_target, params=params) As one more step before we can accelerate a model with Vitis-AI in TVM @@ -488,7 +488,7 @@ will take a substantial amount of time. # be executed on the CPU # This config can be changed by setting the 'PX_QUANT_SIZE' (e.g. export PX_QUANT_SIZE=64) for i in range(128): - module.set_input(input_name, inputs[i]) + module.set_input(input_name, inputs[i]) module.run() Afterwards, inference will be accelerated on the DPU. @@ -563,11 +563,11 @@ The Vitis-AI target is DPUCZDX8G-zcu104 as we are targeting the edge DPU on the ZCU104 board and this target is passed as a config to the TVM build call. Note that different identifiers can be passed for different targets, see `edge targets info <#edge-requirements>`__. Additionally, we -provide the 'export_runtime_module' config that points to a file to which we +provide the 'export_runtime_module' config that points to a file to which we can export the Vitis-AI runtime module. We have to do this because we will first be compiling and quantizing the model on the host machine before building -the model for edge deployment. As you will see later on, the exported runtime -module will be passed to the edge build so that the Vitis-AI runtime module +the model for edge deployment. As you will see later on, the exported runtime +module will be passed to the edge build so that the Vitis-AI runtime module can be included. .. code:: python @@ -575,17 +575,17 @@ can be included. from tvm.contrib import util temp = util.tempdir() - + tvm_target = 'llvm' target='DPUCZDX8G-zcu104' export_rt_mod_file = temp.relpath("vitis_ai.rtmod") - + with tvm.transform.PassContext(opt_level=3, config= {'relay.ext.vitis_ai.options.target': target, - 'relay.ext.vitis_ai.options.export_runtime_module': export_rt_mod_file}): + 'relay.ext.vitis_ai.options.export_runtime_module': export_rt_mod_file}): lib = relay.build(mod, tvm_target, params=params) - -We will quantize and compile the model for execution on the DPU using on-the-fly -quantization on the host machine. This makes use of TVM inference calls + +We will quantize and compile the model for execution on the DPU using on-the-fly +quantization on the host machine. This makes use of TVM inference calls (module.run) to quantize the model on the host with the first N inputs. .. code:: python @@ -596,10 +596,10 @@ quantization on the host machine. This makes use of TVM inference calls # be executed on the CPU # This config can be changed by setting the 'PX_QUANT_SIZE' (e.g. export PX_QUANT_SIZE=64) for i in range(128): - module.set_input(input_name, inputs[i]) + module.set_input(input_name, inputs[i]) module.run() - -Save the TVM lib module so that the Vitis-AI runtime module will also be exported + +Save the TVM lib module so that the Vitis-AI runtime module will also be exported (to the 'export_runtime_module' path we previously passed as a config). .. code:: python @@ -609,9 +609,9 @@ Save the TVM lib module so that the Vitis-AI runtime module will also be exporte temp = util.tempdir() lib.export_library(temp.relpath("tvm_lib.so")) -After quantizing and compiling the model for Vitis-AI acceleration using the -first N inputs we can build the model for execution on the ARM edge device. 
-Here we pass the previously exported Vitis-AI runtime module so it can be included +After quantizing and compiling the model for Vitis-AI acceleration using the +first N inputs we can build the model for execution on the ARM edge device. +Here we pass the previously exported Vitis-AI runtime module so it can be included in the TVM build. .. code:: python @@ -637,7 +637,7 @@ section. Edge steps ^^^^^^^^^^ -After setting up TVM with Vitis-AI on the edge device, you can now load +After setting up TVM with Vitis-AI on the edge device, you can now load the TVM runtime module into memory and feed inputs for inference. .. code:: python diff --git a/docs/dev/convert_layout.rst b/docs/dev/convert_layout.rst index 6c9890f69d85..490df1372417 100644 --- a/docs/dev/convert_layout.rst +++ b/docs/dev/convert_layout.rst @@ -264,5 +264,5 @@ The ordering of the layouts is defined by the implementation of `register_conver Current implementation has support for almost all the operators commonly used in image classification models. However, if one encounters too many data layout transforms in the graph, it is highly likely that there is an operator whose layouts need special handling as described in Section 3. Some pull requests that can help in such a situation are -- Layout inference for `Batch Norm `_ - Batch normalization falls into the category of lightly-sensitive operator. The PR shows how to handle the layout inference for batch norm. -- Python Callback for `Convolution `_- For highly-sensitive operators, one might have to do python callback as well. The PR shows how to define a python callback function for Convolution operator. +- Layout inference for `Batch Norm `_ - Batch normalization falls into the category of lightly-sensitive operator. The PR shows how to handle the layout inference for batch norm. +- Python Callback for `Convolution `_- For highly-sensitive operators, one might have to do python callback as well. The PR shows how to define a python callback function for Convolution operator. diff --git a/docs/dev/frontend/tensorflow.rst b/docs/dev/frontend/tensorflow.rst index b234ed7b0466..dde7179d90db 100644 --- a/docs/dev/frontend/tensorflow.rst +++ b/docs/dev/frontend/tensorflow.rst @@ -57,7 +57,7 @@ Export TensorFlow frontend expects a frozen protobuf (.pb) or saved model as input. It currently does not support checkpoint (.ckpt). The graphdef needed by the TensorFlow frontend can be extracted from the active session, or by using the `TFParser`_ helper class. -.. _TFParser: https://github.com/apache/incubator-tvm/blob/main/python/tvm/relay/frontend/tensorflow_parser.py +.. _TFParser: https://github.com/apache/tvm/blob/main/python/tvm/relay/frontend/tensorflow_parser.py The model should be exported with a number of transformations to prepare the model for inference. It is also important to set ```add_shapes=True```, as this will embed the output shapes of each node into the graph. Here is one function to export a model as a protobuf given a session: @@ -101,7 +101,7 @@ Import the Model Explicit Shape: ~~~~~~~~~~~~~~~ -To ensure shapes can be known throughout the entire graph, pass the ```shape``` argument to ```from_tensorflow```. This dictionary maps input names to input shapes. Please refer to these `test cases `_ for examples. +To ensure shapes can be known throughout the entire graph, pass the ```shape``` argument to ```from_tensorflow```. This dictionary maps input names to input shapes. Please refer to these `test cases `_ for examples. 
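As a minimal sketch (assuming ``graph_def`` is a frozen ``GraphDef`` that has already been loaded, and that the graph has a single input named "input"; the name and shape below are placeholders):

.. code:: python

   import tvm
   from tvm import relay

   # The input name and shape are placeholders; use the names and shapes
   # of the inputs in your own frozen graph.
   shape_dict = {"input": (1, 224, 224, 3)}
   mod, params = relay.frontend.from_tensorflow(graph_def, shape=shape_dict)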
Data Layout ~~~~~~~~~~~ diff --git a/docs/dev/inferbound.rst b/docs/dev/inferbound.rst index 7d0127a6c039..010d0d42d37e 100644 --- a/docs/dev/inferbound.rst +++ b/docs/dev/inferbound.rst @@ -22,7 +22,7 @@ InferBound Pass ******************************************* -The InferBound pass is run after normalize, and before ScheduleOps `build_module.py `_. The main job of InferBound is to create the bounds map, which specifies a Range for each IterVar in the program. These bounds are then passed to ScheduleOps, where they are used to set the extents of For loops, see `MakeLoopNest `_, and to set the sizes of allocated buffers (`BuildRealize `_), among other uses. +The InferBound pass is run after normalize, and before ScheduleOps `build_module.py `_. The main job of InferBound is to create the bounds map, which specifies a Range for each IterVar in the program. These bounds are then passed to ScheduleOps, where they are used to set the extents of For loops, see `MakeLoopNest `_, and to set the sizes of allocated buffers (`BuildRealize `_), among other uses. The output of InferBound is a map from IterVar to Range: @@ -53,9 +53,9 @@ Therefore, let's review the Range and IterVar classes: }; } -Note that IterVarNode also contains a Range ``dom``. This ``dom`` may or may not have a meaningful value, depending on when the IterVar was created. For example, when ``tvm.compute`` is called, an `IterVar is created `_ for each axis and reduce axis, with dom's equal to the shape supplied in the call to ``tvm.compute``. +Note that IterVarNode also contains a Range ``dom``. This ``dom`` may or may not have a meaningful value, depending on when the IterVar was created. For example, when ``tvm.compute`` is called, an `IterVar is created `_ for each axis and reduce axis, with dom's equal to the shape supplied in the call to ``tvm.compute``. -On the other hand, when ``tvm.split`` is called, `IterVars are created `_ for the inner and outer axes, but these IterVars are not given a meaningful ``dom`` value. +On the other hand, when ``tvm.split`` is called, `IterVars are created `_ for the inner and outer axes, but these IterVars are not given a meaningful ``dom`` value. In any case, the ``dom`` member of an IterVar is never modified during InferBound. However, keep in mind that the ``dom`` member of an IterVar is sometimes used as default value for the Ranges InferBound computes. @@ -117,7 +117,7 @@ Tensors haven't been mentioned yet, but in the context of TVM, a Tensor represen int value_index; }; -In the Operation class declaration above, we can see that each operation also has a list of InputTensors. Thus the stages of the schedule form a DAG, where each stage is a node in the graph. There is an edge in the graph from Stage A to Stage B, if the operation of Stage B has an input tensor whose source operation is the op of Stage A. Put simply, there is an edge from A to B, if B consumes a tensor produced by A. See the diagram below. This graph is created at the beginning of InferBound, by a call to `CreateReadGraph `_. +In the Operation class declaration above, we can see that each operation also has a list of InputTensors. Thus the stages of the schedule form a DAG, where each stage is a node in the graph. There is an edge in the graph from Stage A to Stage B, if the operation of Stage B has an input tensor whose source operation is the op of Stage A. Put simply, there is an edge from A to B, if B consumes a tensor produced by A. See the diagram below. 
This graph is created at the beginning of InferBound, by a call to `CreateReadGraph `_. .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/stage_graph.png :align: center diff --git a/docs/dev/pass_infra.rst b/docs/dev/pass_infra.rst index 898e51793a44..3680cb886952 100644 --- a/docs/dev/pass_infra.rst +++ b/docs/dev/pass_infra.rst @@ -528,22 +528,22 @@ optimization pipeline and debug Relay and tir passes, please refer to the .. _Sequential: https://pytorch.org/docs/stable/nn.html?highlight=sequential#torch.nn.Sequential -.. _Block: https://mxnet.incubator.apache.org/api/python/docs/api/gluon/block.html#gluon-block +.. _Block: https://mxnet.apache.org/api/python/docs/api/gluon/block.html#gluon-block -.. _include/tvm/ir/transform.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/ir/transform.h +.. _include/tvm/ir/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/ir/transform.h -.. _src/relay/ir/transform.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/ir/transform.cc +.. _src/relay/ir/transform.cc: https://github.com/apache/tvm/blob/main/src/relay/ir/transform.cc -.. _src/ir/transform.cc: https://github.com/apache/incubator-tvm/blob/main/src/ir/transform.cc +.. _src/ir/transform.cc: https://github.com/apache/tvm/blob/main/src/ir/transform.cc -.. _src/relay/pass/fold_constant.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/pass/fold_constant.cc +.. _src/relay/pass/fold_constant.cc: https://github.com/apache/tvm/blob/main/src/relay/pass/fold_constant.cc -.. _python/tvm/relay/transform.py: https://github.com/apache/incubator-tvm/blob/main/python/tvm/relay/transform.py +.. _python/tvm/relay/transform.py: https://github.com/apache/tvm/blob/main/python/tvm/relay/transform.py -.. _include/tvm/relay/transform.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/relay/transform.h +.. _include/tvm/relay/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/relay/transform.h -.. _python/tvm/ir/transform.py: https://github.com/apache/incubator-tvm/blob/main/python/tvm/ir/transform.py +.. _python/tvm/ir/transform.py: https://github.com/apache/tvm/blob/main/python/tvm/ir/transform.py -.. _src/tir/transforms/unroll_loop.cc: https://github.com/apache/incubator-tvm/blob/main/src/tir/transforms/unroll_loop.cc +.. _src/tir/transforms/unroll_loop.cc: https://github.com/apache/tvm/blob/main/src/tir/transforms/unroll_loop.cc -.. _use pass infra: https://github.com/apache/incubator-tvm/blob/main/tutorials/dev/use_pass_infra.py +.. _use pass infra: https://github.com/apache/tvm/blob/main/tutorials/dev/use_pass_infra.py diff --git a/docs/dev/relay_add_pass.rst b/docs/dev/relay_add_pass.rst index 02c0ba2808ad..0661df0ae35a 100644 --- a/docs/dev/relay_add_pass.rst +++ b/docs/dev/relay_add_pass.rst @@ -399,8 +399,8 @@ information about the pass manager interface can be found in :ref:`pass-infra`. Relay's standard passes are listed in `include/tvm/relay/transform.h`_ and implemented in `src/relay/pass/`_. -.. _include/tvm/relay/transform.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/relay/transform.h +.. _include/tvm/relay/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/relay/transform.h -.. _src/relay/pass/: https://github.com/apache/incubator-tvm/tree/main/src/relay/pass +.. _src/relay/pass/: https://github.com/apache/tvm/tree/main/src/relay/pass -.. 
_src/relay/transforms/fold_constant.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/transforms/fold_constant.cc +.. _src/relay/transforms/fold_constant.cc: https://github.com/apache/tvm/blob/main/src/relay/transforms/fold_constant.cc diff --git a/docs/dev/relay_bring_your_own_codegen.rst b/docs/dev/relay_bring_your_own_codegen.rst index a4d4ebd60b88..3fcd3365c82f 100644 --- a/docs/dev/relay_bring_your_own_codegen.rst +++ b/docs/dev/relay_bring_your_own_codegen.rst @@ -137,7 +137,7 @@ Here we highlight the notes marked in the above code: * **Note 3** is a TVM runtime compatible wrapper function. It accepts a list of input tensors and one output tensor (the last argument), casts them to the right data type, and invokes the subgraph function described in Note 2. In addition, ``TVM_DLL_EXPORT_TYPED_FUNC`` is a TVM macro that generates another function ``gcc_0`` with unified the function arguments by packing all tensors to ``TVMArgs``. As a result, the TVM runtime can directly invoke ``gcc_0`` to execute the subgraph without additional efforts. With the above code generated, TVM is able to compile it along with the rest parts of the graph and export a single library for deployment. -In the rest of this section, we will implement a codegen step-by-step to generate the above code. Your own codegen has to be located at ``src/relay/backend/contrib//``. In our example, we name our codegen "codegen_c" and put it under `/src/relay/backend/contrib/codegen_c/ `_. Feel free to check this file for a complete implementation. +In the rest of this section, we will implement a codegen step-by-step to generate the above code. Your own codegen has to be located at ``src/relay/backend/contrib//``. In our example, we name our codegen "codegen_c" and put it under `/src/relay/backend/contrib/codegen_c/ `_. Feel free to check this file for a complete implementation. Specifically, we are going to implement two classes in this file and here is their relationship: diff --git a/docs/dev/runtime.rst b/docs/dev/runtime.rst index 91b19eee3230..c77b693f0749 100644 --- a/docs/dev/runtime.rst +++ b/docs/dev/runtime.rst @@ -45,7 +45,7 @@ PackedFunc `PackedFunc`_ is a simple but elegant solution we find to solve the challenges listed. The following code block provides an example in C++ -.. _PackedFunc: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/packed_func.h +.. _PackedFunc: https://github.com/apache/tvm/blob/main/include/tvm/runtime/packed_func.h .. code:: c @@ -131,9 +131,9 @@ which allows us to embed the PackedFunc into any languages. Besides python, so f `java`_ and `javascript`_. This philosophy of embedded API is very like Lua, except that we don't have a new language but use C++. -.. _minimum C API: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/c_runtime_api.h -.. _java: https://github.com/apache/incubator-tvm/tree/main/jvm -.. _javascript: https://github.com/apache/incubator-tvm/tree/main/web +.. _minimum C API: https://github.com/apache/tvm/blob/main/include/tvm/runtime/c_runtime_api.h +.. _java: https://github.com/apache/tvm/tree/main/jvm +.. _javascript: https://github.com/apache/tvm/tree/main/web One fun fact about PackedFunc is that we use it for both compiler and deployment stack. 
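As a minimal sketch of the same mechanism from the Python side (the global name ``demo.add_one`` is only an illustrative choice):

.. code:: python

   import tvm

   # Register a Python function under a global name; the runtime wraps it
   # into a PackedFunc.
   @tvm.register_func("demo.add_one")
   def add_one(x):
       return x + 1

   # Look the PackedFunc up by name and call it like an ordinary function.
   f = tvm.get_global_func("demo.add_one")
   assert f(10) == 11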
@@ -141,7 +141,7 @@ One fun fact about PackedFunc is that we use it for both compiler and deployment - All TVM's compiler pass functions are exposed to frontend as PackedFunc, see `here`_ - The compiled module also returns the compiled function as PackedFunc -.. _here: https://github.com/apache/incubator-tvm/tree/main/src/api +.. _here: https://github.com/apache/tvm/tree/main/src/api To keep the runtime minimum, we isolated the IR Object support from the deployment runtime. The resulting runtime takes around 200K - 600K depending on how many runtime driver modules (e.g., CUDA) get included. @@ -162,7 +162,7 @@ TVM defines the compiled object as `Module`_. The user can get the compiled function from Module as PackedFunc. The generated compiled code can dynamically get function from Module in runtime. It caches the function handle in the first call and reuses in subsequent calls. We use this to link device code and callback into any PackedFunc(e.g., python) from generated code. -.. _Module: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/module.h +.. _Module: https://github.com/apache/tvm/blob/main/include/tvm/runtime/module.h The ModuleNode is an abstract class that can be implemented by each type of device. So far we support modules for CUDA, Metal, OpenCL and loading dynamic shared libraries. This abstraction makes introduction @@ -198,7 +198,7 @@ All the language object in the compiler stack is a subclass of ``Object``. Each the type of object. We choose string instead of int as type key so new ``Object`` class can be added in the decentralized fashion without adding the code back to the central repo. To ease the speed of dispatching, we allocate an integer type_index at runtime for each type_key. -.. _Object: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/object.h +.. _Object: https://github.com/apache/tvm/blob/main/include/tvm/runtime/object.h Since usually one ``Object`` could be referenced in multiple places in the language, we use a shared_ptr to keep track of reference. We use ``ObjectRef`` class to represent a reference to the ``Object``. @@ -279,17 +279,17 @@ Each argument in PackedFunc contains a union value `TVMValue`_ and a type code. This design allows the dynamically typed language to convert to the corresponding type directly, and statically typed language to do runtime type checking during conversion. -.. _TVMValue: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/c_runtime_api.h#L122 +.. _TVMValue: https://github.com/apache/tvm/blob/main/include/tvm/runtime/c_runtime_api.h#L122 The relevant files are - `packed_func.h`_ for C++ API - `c_runtime_api.cc`_ for C API and how to provide callback. -.. _packed_func.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/packed_func.h -.. _c_runtime_api.cc: https://github.com/apache/incubator-tvm/blob/main/src/runtime/c_runtime_api.cc#L262 +.. _packed_func.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/packed_func.h +.. _c_runtime_api.cc: https://github.com/apache/tvm/blob/main/src/runtime/c_runtime_api.cc#L262 To support extension types, we used a registry system to register type related information, like support of any in C++, see `Extension types`_ for more details. -.. _Extension types: https://github.com/apache/incubator-tvm/tree/main/apps/extension +.. 
_Extension types: https://github.com/apache/tvm/tree/main/apps/extension diff --git a/docs/dev/virtual_machine.rst b/docs/dev/virtual_machine.rst index 0986328811dc..9081d50b92ef 100644 --- a/docs/dev/virtual_machine.rst +++ b/docs/dev/virtual_machine.rst @@ -278,11 +278,11 @@ to represent tensor, tuple/list, and closure data, respectively. More details for each of them can be found at `include/tvm/runtime/ndarray.h`_, `include/tvm/runtime/vm/vm.h`_, and `include/tvm/runtime/container.h`_, respectively. -.. _include/tvm/runtime/ndarray.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/ndarray.h +.. _include/tvm/runtime/ndarray.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/ndarray.h -.. _include/tvm/runtime/vm/vm.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/vm/vm.h +.. _include/tvm/runtime/vm/vm.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/vm/vm.h -.. _include/tvm/runtime/container.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/container.h +.. _include/tvm/runtime/container.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/container.h Stack and State ~~~~~~~~~~~~~~~ @@ -326,7 +326,7 @@ The functions contain metadata about the function as well as its compiled byteco object then can be loaded and run by a ``tvm::relay::vm::VirtualMachine`` object. For full definitions of the data structures, please see `include/tvm/runtime/vm/executable.h`_ and `include/tvm/runtime/vm/vm.h`_. -.. _include/tvm/runtime/vm/executable.h: https://github.com/apache/incubator-tvm/blob/main/include/tvm/runtime/vm/executable.h +.. _include/tvm/runtime/vm/executable.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/vm/executable.h Optimizations ~~~~~~~~~~~~~ @@ -343,11 +343,11 @@ Optimizations marked with `TODO` are not implemented yet. - Tail Call Optimization (TODO) - Liveness Analysis (TODO) -.. _src/relay/vm/lambda_lift.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/backend/vm/lambda_lift.cc +.. _src/relay/vm/lambda_lift.cc: https://github.com/apache/tvm/blob/main/src/relay/backend/vm/lambda_lift.cc -.. _src/relay/vm/inline_primitives.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/backend/vm/inline_primitives.cc +.. _src/relay/vm/inline_primitives.cc: https://github.com/apache/tvm/blob/main/src/relay/backend/vm/inline_primitives.cc -.. _src/relay/backend/vm/compiler.cc: https://github.com/apache/incubator-tvm/blob/main/src/relay/backend/vm/compiler.cc +.. _src/relay/backend/vm/compiler.cc: https://github.com/apache/tvm/blob/main/src/relay/backend/vm/compiler.cc Serialization ~~~~~~~~~~~~~ @@ -386,7 +386,7 @@ load the serialized kernel binary and executable related binary code, which will instantiate a VM object. Please refer to the `test_vm_serialization.py`_ file for more examples. -.. _test_vm_serialization.py: https://github.com/apache/incubator-tvm/blob/main/tests/python/relay/test_vm_serialization.py +.. _test_vm_serialization.py: https://github.com/apache/tvm/blob/main/tests/python/relay/test_vm_serialization.py Unresolved Questions ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/install/docker.rst b/docs/install/docker.rst index 243e438b6d0c..768cad2057f8 100644 --- a/docs/install/docker.rst +++ b/docs/install/docker.rst @@ -28,7 +28,7 @@ Get a tvm source distribution or clone the github repo to get the auxiliary scri .. 
code:: bash - git clone --recursive https://github.com/apache/incubator-tvm tvm + git clone --recursive https://github.com/apache/tvm tvm We can then use the following command to launch a docker image. @@ -67,7 +67,7 @@ with ``localhost`` when pasting it into browser. Docker Source ------------- -Check out `The docker source `_ if you are interested in +Check out `The docker source `_ if you are interested in building your own docker images. diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index f329e9f7e6b9..3cf0a78f244f 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -34,7 +34,7 @@ It is important to clone the submodules along, with ``--recursive`` option. .. code:: bash - git clone --recursive https://github.com/apache/incubator-tvm tvm + git clone --recursive https://github.com/apache/tvm tvm For windows users who use github tools, you can open the git shell, and type the following command. diff --git a/docs/install/nnpack.rst b/docs/install/nnpack.rst index 10497ba05654..2afd95a5ef3f 100644 --- a/docs/install/nnpack.rst +++ b/docs/install/nnpack.rst @@ -105,7 +105,7 @@ Build TVM with NNPACK support .. code:: bash - git clone --recursive https://github.com/apache/incubator-tvm tvm + git clone --recursive https://github.com/apache/tvm tvm - Set `set(USE_NNPACK ON)` in config.cmake. - Set `NNPACK_PATH` to the $(YOUR_NNPACK_INSTALL_PATH) diff --git a/docs/langref/relay_adt.rst b/docs/langref/relay_adt.rst index a53c7515c62a..dab2e3e70678 100644 --- a/docs/langref/relay_adt.rst +++ b/docs/langref/relay_adt.rst @@ -387,7 +387,7 @@ The following left fold flattens a list of lists (using concatenation): Note that these iteration constructs can be implemented directly in Relay's source language and more can easily be defined (and for more data types, like trees), rather than being constructs built into the language (e.g., -`"foreach" in MXNet `__). +`"foreach" in MXNet `__). ADTs and their extensibility allow for a broad range of iterations and data structures to be expressed in Relay and supported by the type system without having to modify the language implementation. diff --git a/docs/langref/relay_pattern.rst b/docs/langref/relay_pattern.rst index 17282e142b2a..8b34b7619840 100644 --- a/docs/langref/relay_pattern.rst +++ b/docs/langref/relay_pattern.rst @@ -35,7 +35,7 @@ There are quite a few properties of operators that are worth matching. Below we demonstrates how to write patterns. It is recommended to check `tests/python/relay/test_dataflow_pattern.py`_ for more use cases. -.. _tests/python/relay/test_dataflow_pattern.py: https://github.com/apache/incubator-tvm/blob/main/tests/python/relay/test_dataflow_pattern.py +.. _tests/python/relay/test_dataflow_pattern.py: https://github.com/apache/tvm/blob/main/tests/python/relay/test_dataflow_pattern.py .. 
note:: diff --git a/docs/vta/install.rst b/docs/vta/install.rst index bb5c1c9c9669..2248975b61b1 100644 --- a/docs/vta/install.rst +++ b/docs/vta/install.rst @@ -135,7 +135,7 @@ Because the direct board-to-computer connection prevents the board from directly mkdir sshfs xilinx@192.168.2.99:/home/xilinx cd - git clone --recursive https://github.com/apache/incubator-tvm tvm + git clone --recursive https://github.com/apache/tvm tvm # When finished, you can leave the moutpoint and unmount the directory cd ~ sudo umount @@ -466,7 +466,7 @@ This would add quartus binary path into your ``PATH`` environment variable, so y Chisel-based Custom VTA Bitstream Compilation for DE10-Nano ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file `Configs.scala `_, and they can be customized by the user. +Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file `Configs.scala `_, and they can be customized by the user. For Intel FPGA, bitstream generation is driven by a top-level ``Makefile`` under ``/3rdparty/vta-hw/hardware/intel``. diff --git a/jvm/README.md b/jvm/README.md index 320e769adb74..e23c632fb04a 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -176,4 +176,4 @@ Server server = new Server(proxyHost, proxyPort, "key"); server.start(); ``` -You can also use `StandaloneServerProcessor` and `ConnectProxyServerProcessor` to build your own RPC server. Refer to [Android RPC Server](https://github.com/apache/incubator-tvm/blob/main/apps/android_rpc/app/src/main/java/org/apache/tvm/tvmrpc/RPCProcessor.java) for more details. +You can also use `StandaloneServerProcessor` and `ConnectProxyServerProcessor` to build your own RPC server. Refer to [Android RPC Server](https://github.com/apache/tvm/blob/main/apps/android_rpc/app/src/main/java/org/apache/tvm/tvmrpc/RPCProcessor.java) for more details. diff --git a/jvm/pom.xml b/jvm/pom.xml index 886f0e6bc5fd..1aeaa0e57921 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -7,7 +7,7 @@ tvm4j-parent 0.0.1-SNAPSHOT TVM4J Package - Parent - https://github.com/apache/incubator-tvm/tree/main/jvm + https://github.com/apache/tvm/tree/main/jvm TVM4J Package Apache Software Foundation @@ -20,9 +20,9 @@ - scm:git:git@github.com:apache/incubator-tvm.git - scm:git:git@github.com:apache/incubator-tvm.git - https://github.com/apache/incubator-tvm + scm:git:git@github.com:apache/tvm.git + scm:git:git@github.com:apache/tvm.git + https://github.com/apache/tvm diff --git a/python/setup.py b/python/setup.py index ec98e94f80eb..8af62f9c9102 100644 --- a/python/setup.py +++ b/python/setup.py @@ -207,7 +207,7 @@ def get_package_data_files(): package_dir={"tvm": "tvm"}, package_data={"tvm": get_package_data_files()}, distclass=BinaryDistribution, - url="https://github.com/apache/incubator-tvm", + url="https://github.com/apache/tvm", ext_modules=config_cython(), **setup_kwargs, ) diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 50e5a02f84c0..3f151ebc01a5 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -75,7 +75,7 @@ def helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay_op): """Converts QNN operators into a sequence of Relay operators that are friendly to HW that do not have fast Int8 arithmetic. 
For example, for ARM, LLVM utilizes the assembly instructions much more efficiently if the convolution or dense operator input datatypes are int16 instead of - int8. More details are present at https://github.com/apache/incubator-tvm/pull/4277. + int8. More details are present at https://github.com/apache/tvm/pull/4277. Parameters ---------- diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index 7b9da8a90ede..a3b7e473415e 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -263,7 +263,7 @@ def _callback(op): return s -# FIXME - https://github.com/apache/incubator-tvm/issues/4122 +# FIXME - https://github.com/apache/tvm/issues/4122 # _declaration_conv_nhwc_pack expects kernel layout to be HWOI. However, the tests use HWIO # layout. Commenting until we have clarity about the nhwc_pack implementation from the author. # elif layout == 'NHWC' and kh == 1 and kw == 1 and kernel.dtype == "int8": diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index b4a966e1db13..3e5a12bc43b2 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -229,7 +229,7 @@ def _schedule_conv_nhwc_pack_int8(s, cfg, data, conv_out, last): packing of weight to make the address access be friendly to int8 intrinsic """ - # FIXME - https://github.com/apache/incubator-tvm/issues/3598 + # FIXME - https://github.com/apache/tvm/issues/3598 # pylint: disable=unreachable return s diff --git a/rust/tvm-graph-rt/Cargo.toml b/rust/tvm-graph-rt/Cargo.toml index d8dfcdb73269..13837f62695d 100644 --- a/rust/tvm-graph-rt/Cargo.toml +++ b/rust/tvm-graph-rt/Cargo.toml @@ -20,7 +20,7 @@ name = "tvm-graph-rt" version = "0.1.0" license = "Apache-2.0" description = "A static graph runtime for TVM." -repository = "https://github.com/apache/incubator-tvm" +repository = "https://github.com/apache/tvm" readme = "README.md" keywords = ["tvm"] categories = ["api-bindings", "science"] diff --git a/rust/tvm-macros/Cargo.toml b/rust/tvm-macros/Cargo.toml index e491177d8599..37275d6a941e 100644 --- a/rust/tvm-macros/Cargo.toml +++ b/rust/tvm-macros/Cargo.toml @@ -20,7 +20,7 @@ name = "tvm-macros" version = "0.1.1" license = "Apache-2.0" description = "Procedural macros of the TVM crate." -repository = "https://github.com/apache/incubator-tvm" +repository = "https://github.com/apache/tvm" readme = "README.md" keywords = ["tvm"] authors = ["TVM Contributors"] diff --git a/rust/tvm-rt/Cargo.toml b/rust/tvm-rt/Cargo.toml index 9660943da50d..13c05373f6b6 100644 --- a/rust/tvm-rt/Cargo.toml +++ b/rust/tvm-rt/Cargo.toml @@ -20,8 +20,8 @@ name = "tvm-rt" version = "0.1.0" license = "Apache-2.0" description = "Rust bindings for the TVM runtime API." -repository = "https://github.com/apache/incubator-tvm" -homepage = "https://github.com/apache/incubator-tvm" +repository = "https://github.com/apache/tvm" +homepage = "https://github.com/apache/tvm" readme = "README.md" keywords = ["rust", "tvm"] categories = ["api-bindings", "science"] diff --git a/rust/tvm-rt/README.md b/rust/tvm-rt/README.md index a586cd73b303..a99eeaa578dd 100644 --- a/rust/tvm-rt/README.md +++ b/rust/tvm-rt/README.md @@ -17,7 +17,7 @@ # TVM Runtime Support -This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/incubator-tvm) runtime. +This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm) runtime. Currently this is tested on `1.42.0` and above. 
## What Does This Crate Offer? diff --git a/rust/tvm-rt/src/lib.rs b/rust/tvm-rt/src/lib.rs index 84951f4c8e67..4b163eff9c8f 100644 --- a/rust/tvm-rt/src/lib.rs +++ b/rust/tvm-rt/src/lib.rs @@ -17,7 +17,7 @@ * under the License. */ -//! [TVM](https://github.com/apache/incubator-tvm) is a compiler stack for deep learning systems. +//! [TVM](https://github.com/apache/tvm) is a compiler stack for deep learning systems. //! //! This crate provides an idiomatic Rust API for TVM runtime. //! diff --git a/rust/tvm-sys/src/context.rs b/rust/tvm-sys/src/context.rs index 3747bfcba314..a5165fccf0aa 100644 --- a/rust/tvm-sys/src/context.rs +++ b/rust/tvm-sys/src/context.rs @@ -51,7 +51,7 @@ use enumn::N; use thiserror::Error; /// Device type represents the set of devices supported by -/// [TVM](https://github.com/apache/incubator-tvm). +/// [TVM](https://github.com/apache/tvm). /// /// ## Example /// diff --git a/rust/tvm/Cargo.toml b/rust/tvm/Cargo.toml index 153a1950e46b..29d2003b5089 100644 --- a/rust/tvm/Cargo.toml +++ b/rust/tvm/Cargo.toml @@ -20,8 +20,8 @@ name = "tvm" version = "0.1.0" license = "Apache-2.0" description = "Rust frontend support for TVM" -repository = "https://github.com/apache/incubator-tvm" -homepage = "https://github.com/apache/incubator-tvm" +repository = "https://github.com/apache/tvm" +homepage = "https://github.com/apache/tvm" readme = "README.md" keywords = ["rust", "tvm"] categories = ["api-bindings", "science"] diff --git a/rust/tvm/README.md b/rust/tvm/README.md index 13aef8928aa7..26f9f1fbedfd 100644 --- a/rust/tvm/README.md +++ b/rust/tvm/README.md @@ -17,13 +17,13 @@ # TVM Runtime Frontend Support -This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/incubator-tvm) runtime frontend. Currently this requires **Nightly Rust** and tested on `rustc 1.32.0-nightly` +This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm) runtime frontend. Currently this requires **Nightly Rust** and tested on `rustc 1.32.0-nightly` ## What Does This Crate Offer? Here is a major workflow -1. Train your **Deep Learning** model using any major framework such as [PyTorch](https://pytorch.org/), [Apache MXNet](https://mxnet.incubator.apache.org/) or [TensorFlow](https://www.tensorflow.org/) +1. Train your **Deep Learning** model using any major framework such as [PyTorch](https://pytorch.org/), [Apache MXNet](https://mxnet.apache.org/) or [TensorFlow](https://www.tensorflow.org/) 2. Use **TVM** to build optimized model artifacts on a supported context such as CPU, GPU, OpenCL and specialized accelerators. 3. Deploy your models using **Rust** :heart: diff --git a/rust/tvm/src/lib.rs b/rust/tvm/src/lib.rs index 27e794984094..e86420eb70c9 100644 --- a/rust/tvm/src/lib.rs +++ b/rust/tvm/src/lib.rs @@ -17,7 +17,7 @@ * under the License. */ -//! [TVM](https://github.com/apache/incubator-tvm) is a compiler stack for deep learning systems. +//! [TVM](https://github.com/apache/tvm) is a compiler stack for deep learning systems. //! //! This crate provides an idiomatic Rust API for TVM runtime frontend. //! diff --git a/src/parser/tokenizer.h b/src/parser/tokenizer.h index a9ae64ba8fb1..c6fb3e09f4d1 100644 --- a/src/parser/tokenizer.h +++ b/src/parser/tokenizer.h @@ -328,7 +328,7 @@ struct Tokenizer { } } else if (next == '"') { // TODO(@jroesch): Properly tokenize escape sequences in strings. - // see https://github.com/apache/incubator-tvm/issues/6153. + // see https://github.com/apache/tvm/issues/6153. 
Next(); std::stringstream string_content; while (More() && Peek() != '"') { diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 89ae34899331..b7f3b91f4243 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -1100,7 +1100,7 @@ def test_forward_convolution(): [1, 17, 17, 124], [1, 1, 124, 19], [1, 1], [1, 1], "SAME", "NHWC", quantized=True ) - # Disable as tests are flaky - https://github.com/apache/incubator-tvm/issues/6064 + # Disable as tests are flaky - https://github.com/apache/tvm/issues/6064 # depthwise convolution # _test_tflite2_quantized_depthwise_convolution([1, 8, 8, 128], [1, 1, 128, 1], [1, 1], [1, 1], # 'SAME', 'NHWC', 1) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index a2b791d8d33f..06bd01b4189a 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -298,7 +298,7 @@ def compile_test_conv2d_arm_cpu( ) # CUDA is disabled for 'direct' schedule: - # https://github.com/apache/incubator-tvm/pull/3070#issuecomment-486597553 + # https://github.com/apache/tvm/pull/3070#issuecomment-486597553 # group conv2d dshape = (1, 32, 18, 18) kshape = (32, 4, 3, 3) diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py b/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py index c547ec7d0272..66ce6ffe41f4 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py @@ -73,7 +73,7 @@ def check_device(device): check_device(device) -# TODO(@llyfacebook): Please fix https://github.com/apache/incubator-tvm/issues/4122 to enable this test. +# TODO(@llyfacebook): Please fix https://github.com/apache/tvm/issues/4122 to enable this test. @pytest.mark.skip def test_conv2d_nhwc(): verify_conv2d_1x1_nhwc_pack_int8(1, 256, 32, 256, 1, 1, 0) diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 24035ba9bba6..22c9045fd457 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -124,7 +124,7 @@ def check_device(device): @tvm.testing.uses_gpu @pytest.mark.skip( "Skip this test as it is intermittent." - "See https://github.com/apache/incubator-tvm/pull/4901#issuecomment-595040094" + "See https://github.com/apache/tvm/pull/4901#issuecomment-595040094" ) def test_get_valid_counts(): verify_get_valid_counts((1, 1000, 5), 0.5, -1, 0) diff --git a/tests/python/unittest/test_autotvm_graph_tuner_core.py b/tests/python/unittest/test_autotvm_graph_tuner_core.py index f5947619d295..3d7d304f13d5 100644 --- a/tests/python/unittest/test_autotvm_graph_tuner_core.py +++ b/tests/python/unittest/test_autotvm_graph_tuner_core.py @@ -18,7 +18,7 @@ # NOTE: We name this test file to start with test_graph_tuner # to make it execute after zero_rank tensor test cases. This # helps avoid topi arithmetic operator overloading issue: -# https://github.com/apache/incubator-tvm/issues/3240. +# https://github.com/apache/tvm/issues/3240. # TODO: restore the file name after this issue is resolved. 
import os import copy diff --git a/tests/python/unittest/test_autotvm_graph_tuner_utils.py b/tests/python/unittest/test_autotvm_graph_tuner_utils.py index 9fc415c09dc6..6ab194c10ea7 100644 --- a/tests/python/unittest/test_autotvm_graph_tuner_utils.py +++ b/tests/python/unittest/test_autotvm_graph_tuner_utils.py @@ -18,7 +18,7 @@ # NOTE: We name this test file to start with test_graph_tuner # to make it execute after zero_rank tensor test cases. This # helps avoid topi arithmetic operator overloading issue: -# https://github.com/apache/incubator-tvm/issues/3240 +# https://github.com/apache/tvm/issues/3240 # TODO: restore the file name after this issue is resolved. import tvm from tvm import te diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 1e1e98ae5ab9..317af5f1632d 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -33,7 +33,7 @@ these operators, it will query this log file to get the best knob values. We also released pre-tuned parameters for some arm devices. You can go to -`ARM CPU Benchmark `_ +`ARM CPU Benchmark `_ to see the results. Note that this tutorial will not run on Windows or recent versions of macOS. To @@ -164,7 +164,7 @@ def get_network(name, batch_size): # (replace :code:`[HOST_IP]` with the IP address of your host machine) # # * For Android: -# Follow this `readme page `_ to +# Follow this `readme page `_ to # install the TVM RPC APK on the android device. Make sure you can pass the android rpc test. # Then you have already registered your device. During tuning, you have to go to developer option # and enable "Keep screen awake during changing" and charge your phone to make it stable. diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 33b62bbf8f19..76a30ec15eb6 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -31,7 +31,7 @@ these operators, it will query this log file to get the best knob values. We also released pre-tuned parameters for some NVIDIA GPUs. You can go to -`NVIDIA GPU Benchmark `_ +`NVIDIA GPU Benchmark `_ to see the results. Note that this tutorial will not run on Windows or recent versions of macOS. To diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 10e201fd9fb5..5e972730d9be 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -31,7 +31,7 @@ these operators, it will query this log file to get the best knob values. We also released pre-tuned parameters for some arm devices. You can go to -`Mobile GPU Benchmark `_ +`Mobile GPU Benchmark `_ to see the results. Note that this tutorial will not run on Windows or recent versions of macOS. To @@ -163,7 +163,7 @@ def get_network(name, batch_size): # (replace :code:`[HOST_IP]` with the IP address of your host machine) # # * For Android: -# Follow this `readme page `_ to +# Follow this `readme page `_ to # install TVM RPC APK on the android device. Make sure you can pass the android RPC test. # Then you have already registered your device. During tuning, you have to go to developer option # and enable "Keep screen awake during changing" and charge your phone to make it stable. 
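# Once the device is listed by the tracker, a host-side script can request
# a remote session from it. A minimal sketch follows; the tracker address,
# port, and device key are placeholders that must match your own setup.
#
# .. code-block:: python
#
#   from tvm import rpc
#
#   tracker = rpc.connect_tracker("0.0.0.0", 9190)
#   remote = tracker.request("android", priority=1, session_timeout=60)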
diff --git a/tutorials/dev/bring_your_own_datatypes.py b/tutorials/dev/bring_your_own_datatypes.py index c85ec073f875..f9dc8bcdc948 100644 --- a/tutorials/dev/bring_your_own_datatypes.py +++ b/tutorials/dev/bring_your_own_datatypes.py @@ -116,7 +116,7 @@ ###################################################################### # Note that the type code, 150, is currently chosen manually by the user. -# See ``TVMTypeCode::kCustomBegin`` in `include/tvm/runtime/c_runtime_api.h `_. +# See ``TVMTypeCode::kCustomBegin`` in `include/tvm/runtime/c_runtime_api.h `_. # Now we can generate our program again: x_myfloat = relay.cast(x, dtype="custom[myfloat]32") @@ -176,7 +176,7 @@ # To provide for the general case, we have made a helper function, ``create_lower_func(...)``, # which does just this: given a dictionary, it replaces the given operation with a ``Call`` to the appropriate function name provided based on the op and the bit widths. # It additionally removes usages of the custom datatype by storing the custom datatype in an opaque ``uint`` of the appropriate width; in our case, a ``uint32_t``. -# For more information, see `the source code `_. +# For more information, see `the source code `_. # We can now re-try running the program: try: diff --git a/tutorials/dev/use_pass_infra.py b/tutorials/dev/use_pass_infra.py index b16eb93749de..6a33d14e38c8 100644 --- a/tutorials/dev/use_pass_infra.py +++ b/tutorials/dev/use_pass_infra.py @@ -142,7 +142,7 @@ def alter_conv2d(attrs, inputs, tinfos, out_type): # packing them as a whole to execute. For example, the same passes can now be # applied using the sequential style as the following. :py:class:`tvm.transform.Sequential` is # similiar to `torch.nn.sequential `_ -# and `mxnet.gluon.block `_. +# and `mxnet.gluon.block `_. # For example, `torch.nn.sequential` is used to contain a sequence of PyTorch # `Modules` that will be added to build a network. It focuses on the network # layers. Instead, the :py:class:`tvm.transform.Sequential` in our pass infra works on the optimizing diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index 35f989c7a5ca..ff7ef44a7acb 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -47,7 +47,7 @@ # # .. code-block:: bash # -# git clone --recursive https://github.com/apache/incubator-tvm tvm +# git clone --recursive https://github.com/apache/tvm tvm # cd tvm # docker build -t tvm.demo_android -f docker/Dockerfile.demo_android ./docker # docker run --pid=host -h tvm -v $PWD:/workspace \ @@ -106,7 +106,7 @@ # -------------------------------------- # Now we can register our Android device to the tracker. # -# Follow this `readme page `_ to +# Follow this `readme page `_ to # install TVM RPC APK on the android device. # # Here is an example of config.mk. I enabled OpenCL and Vulkan. @@ -139,7 +139,7 @@ # # .. note:: # -# At this time, don't forget to `create a standalone toolchain `_ . +# At this time, don't forget to `create a standalone toolchain `_ . # # for example # diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/tutorials/frontend/deploy_model_on_rasp.py index 36879910a1b9..cae9d905898b 100644 --- a/tutorials/frontend/deploy_model_on_rasp.py +++ b/tutorials/frontend/deploy_model_on_rasp.py @@ -53,7 +53,7 @@ # # .. 
code-block:: bash # -# git clone --recursive https://github.com/apache/incubator-tvm tvm +# git clone --recursive https://github.com/apache/tvm tvm # cd tvm # mkdir build # cp cmake/config.cmake build @@ -96,7 +96,7 @@ # Back to the host machine, which should have a full TVM installed (with LLVM). # # We will use pre-trained model from -# `MXNet Gluon model zoo `_. +# `MXNet Gluon model zoo `_. # You can found more details about this part at tutorial :ref:`tutorial-from-mxnet`. from mxnet.gluon.model_zoo.vision import get_model diff --git a/tutorials/frontend/from_mxnet.py b/tutorials/frontend/from_mxnet.py index 3eeef874eaae..d103d17e5d24 100644 --- a/tutorials/frontend/from_mxnet.py +++ b/tutorials/frontend/from_mxnet.py @@ -33,7 +33,7 @@ pip install mxnet --user or please refer to offical installation guide. -https://mxnet.incubator.apache.org/versions/master/install/index.html +https://mxnet.apache.org/versions/master/install/index.html """ # some standard imports import mxnet as mx diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/tutorials/get_started/cross_compilation_and_rpc.py index 69284c0521a3..2386e7bdd135 100644 --- a/tutorials/get_started/cross_compilation_and_rpc.py +++ b/tutorials/get_started/cross_compilation_and_rpc.py @@ -49,7 +49,7 @@ # # .. code-block:: bash # -# git clone --recursive https://github.com/apache/incubator-tvm tvm +# git clone --recursive https://github.com/apache/tvm tvm # cd tvm # make runtime -j2 # diff --git a/web/README.md b/web/README.md index b4d7eb104d56..4154300e62e4 100644 --- a/web/README.md +++ b/web/README.md @@ -63,11 +63,11 @@ This command will create the tvmjs library that we can use to interface with the Check code snippet in -- [tests/python/prepare_test_libs.py](https://github.com/apache/incubator-tvm/tree/main/web/tests/python/prepare_test_libs.py) +- [tests/python/prepare_test_libs.py](https://github.com/apache/tvm/tree/main/web/tests/python/prepare_test_libs.py) shows how to create a wasm library that links with tvm runtime. - Note that all wasm libraries have to created using the `--system-lib` option - emcc.create_wasm will automatically link the runtime library `dist/wasm/libtvm_runtime.bc` -- [tests/web/test_module_load.js](https://github.com/apache/incubator-tvm/tree/main/web/tests/node/test_module_load.js) demonstrate +- [tests/web/test_module_load.js](https://github.com/apache/tvm/tree/main/web/tests/node/test_module_load.js) demonstrate how to run the generated library through tvmjs API. From 2980ae0930a8976fd22ba148376b0b41375ccef5 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 24 Nov 2020 13:49:55 -0800 Subject: [PATCH 207/258] Raise ImportError for XGBoost (#6969) --- python/tvm/auto_scheduler/cost_model/xgb_model.py | 2 +- python/tvm/autotvm/tuner/xgboost_cost_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py index d6503918ee5e..5b10054d4600 100644 --- a/python/tvm/auto_scheduler/cost_model/xgb_model.py +++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py @@ -94,7 +94,7 @@ def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None): if xgb is None: xgb = __import__("xgboost") except ImportError: - print( + raise ImportError( "XGBoost is required for XGBModel. " "Please install its python package first. 
" "Help: (https://xgboost.readthedocs.io/en/latest/) " diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index a144617596b5..287cbffe6956 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -78,7 +78,7 @@ def __init__( if xgb is None: xgb = __import__("xgboost") except ImportError: - print( + raise ImportError( "XGBoost is required for XGBoostCostModel. " "Please install its python package first. " "Help: (https://xgboost.readthedocs.io/en/latest/) " From ed62b3fa448a54089bccf0202bfd37b21a7ac8b1 Mon Sep 17 00:00:00 2001 From: manupa-arm <61496855+manupa-arm@users.noreply.github.com> Date: Tue, 24 Nov 2020 21:50:08 +0000 Subject: [PATCH 208/258] Bug fix for debug builds in micro_session.cc (#6968) * If the build decides not to inline kReceiveBufferSizeBytes, we will encounter a linking error. Change-Id: Ibbe5b20fdd63acb2b4652ca9896f5737eaf14b00 --- src/runtime/micro/micro_session.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index f054c3afde5c..ceaa5dd6245b 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -148,9 +148,10 @@ class MicroTransportChannel : public RPCChannel { ::std::max(::std::chrono::microseconds{0}, ::std::chrono::duration_cast<::std::chrono::microseconds>( end_time - ::std::chrono::steady_clock::now()))}; - chunk = frecv_(kReceiveBufferSizeBytes, iter_timeout.count()).operator std::string(); + chunk = + frecv_(size_t(kReceiveBufferSizeBytes), iter_timeout.count()).operator std::string(); } else { - chunk = frecv_(kReceiveBufferSizeBytes, nullptr).operator std::string(); + chunk = frecv_(size_t(kReceiveBufferSizeBytes), nullptr).operator std::string(); } pending_chunk_ = chunk; if (pending_chunk_.size() == 0) { From 14fbcfb7e766ba3fd7302b5fd8478a62ca71d872 Mon Sep 17 00:00:00 2001 From: manupa-arm <61496855+manupa-arm@users.noreply.github.com> Date: Wed, 25 Nov 2020 14:18:04 +0000 Subject: [PATCH 209/258] [CI] Disable ASF header checking on untracked files (#6975) * This patch will disable checking for ASF header in untracked files that are never going to make its way into the repo. * That would help developers to have their untracked local files. Change-Id: Ie9f1aae28a474f10f52f22fe9e27a52afd95b4be --- tests/scripts/task_lint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index deb9b9bde6c1..700c63194fe3 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -31,7 +31,7 @@ echo "Checking file types..." python3 tests/lint/check_file_type.py echo "Checking ASF license headers..." -tests/lint/check_asf_header.sh +tests/lint/check_asf_header.sh --local echo "Linting the C++ code..." 
tests/lint/cpplint.sh From 1914e045c92aa100f23e63398de4eea574dc2320 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 25 Nov 2020 11:11:58 -0800 Subject: [PATCH 210/258] [AutoScheduler] Print the time used for measurement (#6972) * [AutoScheduler] Print the time used for measurement * address comments --- src/auto_scheduler/measure.cc | 6 +++++- .../search_policy/sketch_policy.cc | 16 ++++------------ src/auto_scheduler/utils.h | 11 +++++++++++ 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/auto_scheduler/measure.cc b/src/auto_scheduler/measure.cc index 03585ea40c03..5b7e886f073c 100755 --- a/src/auto_scheduler/measure.cc +++ b/src/auto_scheduler/measure.cc @@ -210,6 +210,8 @@ Array ProgramMeasurerNode::Measure(const SearchTask& task, const SearchPolicy& policy, const Array& inputs, int batch_size) { + auto t_begin = std::chrono::high_resolution_clock::now(); + Array results; results.reserve(inputs.size()); @@ -220,7 +222,7 @@ Array ProgramMeasurerNode::Measure(const SearchTask& task, int old_verbosity = verbose; - StdCout(verbose) << "Get " << inputs.size() << " programs to measure." << std::endl; + StdCout(verbose) << "Get " << inputs.size() << " programs to measure:" << std::endl; for (size_t i = 0; i < inputs.size(); i += batch_size) { Array input_batch(inputs.begin() + i, @@ -280,6 +282,8 @@ Array ProgramMeasurerNode::Measure(const SearchTask& task, } } + PrintTimeElapsed(t_begin, "measurement", verbose); + return results; } diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index 4c3e8ac5593d..07d2837ab994 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -162,17 +162,13 @@ State SketchPolicyNode::Search(int n_trials, int early_stopping, int num_measure Array results; while (ct < n_trials) { if (!inputs.empty()) { - auto tic_begin = std::chrono::high_resolution_clock::now(); + auto t_begin = std::chrono::high_resolution_clock::now(); // Retrain the cost model before the next search round PrintTitle("Train cost model", verbose); program_cost_model->Update(inputs, results); - double duration = std::chrono::duration_cast>( - std::chrono::high_resolution_clock::now() - tic_begin) - .count(); - StdCout(verbose) << "Time elapsed: " << std::fixed << std::setprecision(2) << duration - << " s" << std::endl; + PrintTimeElapsed(t_begin, "training", verbose); } // Search one round to get promising states @@ -258,17 +254,13 @@ std::pair, Array> SketchPolicyNode::ContinueS measured_states_throughputs_.push_back(1.0 / FloatArrayMean(res->costs)); } - auto tic_begin = std::chrono::high_resolution_clock::now(); + auto t_begin = std::chrono::high_resolution_clock::now(); // Update the cost model PrintTitle("Train cost model", verbose); program_cost_model->Update(inputs, results); - double duration = std::chrono::duration_cast>( - std::chrono::high_resolution_clock::now() - tic_begin) - .count(); - StdCout(verbose) << "Time elapsed: " << std::fixed << std::setprecision(2) << duration << " s" - << std::endl; + PrintTimeElapsed(t_begin, "training", verbose); return std::make_pair(std::move(inputs), std::move(results)); } diff --git a/src/auto_scheduler/utils.h b/src/auto_scheduler/utils.h index 88c649c6f919..bc29a3761129 100755 --- a/src/auto_scheduler/utils.h +++ b/src/auto_scheduler/utils.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -253,6 +254,16 @@ inline std::string Chars(const char& str, int 
times) { return ret.str(); } +/*! \brief Print the time elapsed */ +inline void PrintTimeElapsed(std::chrono::time_point t_begin, + const std::string& info, int verbose) { + double duration = std::chrono::duration_cast>( + std::chrono::high_resolution_clock::now() - t_begin) + .count(); + StdCout(verbose) << "Time elapsed for " << info << ": " << std::fixed << std::setprecision(2) + << duration << " s" << std::endl; +} + /*! * \brief Parse shape and axis names from layout string */ From 23de2d3d5b2a743974558731eee176d2f35068bd Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 25 Nov 2020 11:13:07 -0800 Subject: [PATCH 211/258] [AutoScheduler] Check duplicated names in the compute dag (#6973) * [AutoScheduler] check duplicated names in the compute dag * fix lint * fix pooling * fix pooling --- include/tvm/topi/nn/pooling.h | 20 +++++++++-------- src/auto_scheduler/compute_dag.cc | 22 +++++++++++++++++++ .../unittest/test_auto_scheduler_common.py | 11 ++++++++++ .../test_auto_scheduler_compute_dag.py | 13 +++++++++++ 4 files changed, 57 insertions(+), 9 deletions(-) diff --git a/include/tvm/topi/nn/pooling.h b/include/tvm/topi/nn/pooling.h index 882793877ed6..8c30e673b304 100644 --- a/include/tvm/topi/nn/pooling.h +++ b/include/tvm/topi/nn/pooling.h @@ -103,8 +103,8 @@ inline Tensor pool_impl(const Tensor& x, const Array& kernel_size, auto out_width = analyzer.Simplify(indexdiv(width - kernel_width + pad_left + pad_right, stride_width) + 1); - auto dheight = tvm::te::reduce_axis(Range(0, kernel_height)); - auto dwidth = tvm::te::reduce_axis(Range(0, kernel_width)); + auto dheight = tvm::te::reduce_axis(Range(0, kernel_height), "dh"); + auto dwidth = tvm::te::reduce_axis(Range(0, kernel_width), "dw"); Array out_shape = x->shape; for (size_t i = 0; i < out_shape.size(); ++i) { @@ -220,8 +220,8 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x, auto out_width = analyzer.Simplify((width - kernel_width + pad_left + pad_right) / stride_width + 1); - auto dheight = tvm::te::reduce_axis(Range(0, kernel_height)); - auto dwidth = tvm::te::reduce_axis(Range(0, kernel_width)); + auto dheight = tvm::te::reduce_axis(Range(0, kernel_height), "dh"); + auto dwidth = tvm::te::reduce_axis(Range(0, kernel_width), "dw"); Array data_shape = x->shape; for (size_t i = 0; i < data_shape.size(); ++i) { @@ -245,8 +245,9 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x, ravel_shape.Set(width_axis, ravel_shape[width_axis] + pad_left + pad_right); auto windowh = - tvm::te::reduce_axis(Range(0, (kernel_height + stride_height - 1) / stride_height)); - auto windoww = tvm::te::reduce_axis(Range(0, (kernel_width + stride_width - 1) / stride_width)); + tvm::te::reduce_axis(Range(0, (kernel_height + stride_height - 1) / stride_height), "wh"); + auto windoww = + tvm::te::reduce_axis(Range(0, (kernel_width + stride_width - 1) / stride_width), "ww"); auto argmax = MakeArgmaxReducer(); auto pad_x = do_pad ? 
pad(x, pad_before, pad_after, tvm::min_value(x->dtype), "pad_temp") : x; @@ -293,8 +294,9 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x, "T_pool_grad", "pool_grad_max"); } else if (pool_type == kAvgPool) { auto windowh = - tvm::te::reduce_axis(Range(0, (kernel_height + stride_height - 1) / stride_height)); - auto windoww = tvm::te::reduce_axis(Range(0, (kernel_width + stride_width - 1) / stride_width)); + tvm::te::reduce_axis(Range(0, (kernel_height + stride_height - 1) / stride_height), "wh"); + auto windoww = + tvm::te::reduce_axis(Range(0, (kernel_width + stride_width - 1) / stride_width), "ww"); return tvm::te::compute( data_shape, [&](const Array& inds) { @@ -696,7 +698,7 @@ inline Tensor pool_impl_nd(const Tensor& x, const Array& kernel_size, pad_tail[i] += stride[i] - 1; } - daxis.push_back(tvm::te::reduce_axis(Range(0, kernel[i]))); + daxis.push_back(tvm::te::reduce_axis(Range(0, kernel[i]), "rv" + std::to_string(i))); pad_before.Set(ii, pad_head[i]); pad_after.Set(ii, pad_tail[i]); diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc index 090e6daf9859..27a30127ba65 100755 --- a/src/auto_scheduler/compute_dag.cc +++ b/src/auto_scheduler/compute_dag.cc @@ -658,6 +658,22 @@ class FlopEstimator : public ExprFunctor { int cur_type_code_; }; +void CheckComputeValidity(const te::Schedule& sch) { + // Check the validity of a compute definition: + // The name of each iterator should be unique. + for (auto stage : sch->stages) { + if (stage->op->IsInstance()) { + std::unordered_set names; + for (const auto& x : stage->leaf_iter_vars) { + ICHECK(!names.count(x->var->name_hint)) + << "Find duplicated iterator names in the compute definition: " << x->var->name_hint + << ". Please use different names for different iterators."; + names.insert(x->var->name_hint); + } + } + } +} + ComputeDAG::ComputeDAG(Array tensors) { auto node = make_object(); node->tensors = std::move(tensors); @@ -674,6 +690,9 @@ ComputeDAG::ComputeDAG(Array tensors) { node->ops.push_back(stage->op); } + // Make sure it is a valid compute definition + CheckComputeValidity(sch); + node->flop_ct = FlopEstimator().EstimateFlop(node->ops); node->init_state = State(node->ops); data_ = std::move(node); @@ -682,6 +701,9 @@ ComputeDAG::ComputeDAG(Array tensors) { ComputeDAG::ComputeDAG(const te::Schedule& sch) { auto node = make_object(); + // Make sure it is a valid compute definition + CheckComputeValidity(sch); + // Initialize ops. Here we enforce the order of ops and stages are consistent for (auto stage : sch->stages) { node->ops.push_back(stage->op); diff --git a/tests/python/unittest/test_auto_scheduler_common.py b/tests/python/unittest/test_auto_scheduler_common.py index 5b7add9733de..87814f28ad72 100644 --- a/tests/python/unittest/test_auto_scheduler_common.py +++ b/tests/python/unittest/test_auto_scheduler_common.py @@ -137,6 +137,17 @@ def softmax_abcd_auto_scheduler_test(a, b, c, d): return [A, B] +@auto_scheduler.register_workload +def invalid_compute_definition(): + A = te.placeholder((10, 10), name="A") + # The names of the following two iterators are the same. + # This is invalid. 
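+    # auto_scheduler.ComputeDAG([A, B]) is expected to raise a TVMError for this workload.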
+ r1 = te.reduce_axis((0, 2), name="r1") + r2 = te.reduce_axis((0, 2), name="r1") + B = te.compute((10,), lambda i: te.sum(A[i][r1 + r2], axis=[r1, r2]), name="B") + return [A, B] + + @auto_scheduler.register_workload def conv2d_winograd_nhwc_auto_scheduler_test( N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1 diff --git a/tests/python/unittest/test_auto_scheduler_compute_dag.py b/tests/python/unittest/test_auto_scheduler_compute_dag.py index caf3c9d888b6..bde3b786d370 100644 --- a/tests/python/unittest/test_auto_scheduler_compute_dag.py +++ b/tests/python/unittest/test_auto_scheduler_compute_dag.py @@ -25,6 +25,7 @@ from test_auto_scheduler_common import ( get_tiled_matmul, + invalid_compute_definition, matmul_auto_scheduler_test, parallel_matmul_auto_scheduler_test, ) @@ -137,8 +138,20 @@ def test_stage_order(): assert task.hardware_params.cache_line_bytes == task2.hardware_params.cache_line_bytes +def test_invalid_compute_dag(): + failed = False + try: + A, B = invalid_compute_definition() + dag = auto_scheduler.ComputeDAG([A, B]) + except tvm.TVMError as e: + failed = True + + assert failed + + if __name__ == "__main__": test_apply_steps() test_infer_bound() test_estimate_flop() test_stage_order() + test_invalid_compute_dag() From 5f37380055403199897c667053be799733e550e8 Mon Sep 17 00:00:00 2001 From: Andrew Liu Date: Wed, 25 Nov 2020 17:00:31 -0800 Subject: [PATCH 212/258] [Frontend][Relay][Parser] fix unparsable yolo formals (#6963) * fix yolo formals * fix lint * move test to test_forward --- python/tvm/relay/frontend/darknet.py | 2 +- tests/python/frontend/darknet/test_forward.py | 15 +++++++++++++++ tests/python/relay/test_ir_text_printer.py | 5 ++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/frontend/darknet.py b/python/tvm/relay/frontend/darknet.py index 87e55593e943..363812fd562b 100644 --- a/python/tvm/relay/frontend/darknet.py +++ b/python/tvm/relay/frontend/darknet.py @@ -40,7 +40,7 @@ def _darknet_not_support(attr, op="relay"): def _get_params_prefix(opname, layer_num): """Makes the params prefix name from opname and layer number.""" - return str(opname) + str(layer_num) + return str(opname).replace(".", "_") + str(layer_num) def _get_params_name(prefix, item): diff --git a/tests/python/frontend/darknet/test_forward.py b/tests/python/frontend/darknet/test_forward.py index 74c1a2199caa..b6dc815a9530 100644 --- a/tests/python/frontend/darknet/test_forward.py +++ b/tests/python/frontend/darknet/test_forward.py @@ -46,6 +46,17 @@ ) +def astext(program, unify_free_vars=False): + """check that program is parsable in text format""" + text = program.astext() + if isinstance(program, relay.Expr): + roundtrip_program = tvm.parser.parse_expr(text) + else: + roundtrip_program = tvm.parser.fromtext(text) + + tvm.ir.assert_structural_equal(roundtrip_program, program, map_free_vars=True) + + def _read_memory_buffer(shape, data, dtype="float32"): length = 1 for x in shape: @@ -60,6 +71,10 @@ def _get_tvm_output(net, data, build_dtype="float32", states=None): """Compute TVM output""" dtype = "float32" mod, params = relay.frontend.from_darknet(net, data.shape, dtype) + # verify that from_darknet creates a valid, parsable relay program + mod = relay.transform.InferType()(mod) + astext(mod) + target = "llvm" shape_dict = {"data": data.shape} lib = relay.build(mod, target, params=params) diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py index 4a3569aca2ec..72a243dbbb67 100644 --- 
a/tests/python/relay/test_ir_text_printer.py +++ b/tests/python/relay/test_ir_text_printer.py @@ -21,6 +21,7 @@ import numpy as np from tvm.relay import Expr from tvm.relay.analysis import free_vars +import pytest DEBUG_PRINT = False @@ -269,6 +270,4 @@ def test_span(): if __name__ == "__main__": - import sys - - pytext.argv(sys.argv) + pytest.main([__file__]) From 48f4f648792f92e0daf114ab43838371534331c7 Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Wed, 25 Nov 2020 23:46:59 -0700 Subject: [PATCH 213/258] Don't fuse take with dynamic inputs (#6979) * add a regression test for fusing dynamic take * add legalize for take that stops fusion on dynamic inputs * fix lint * fix typo --- python/tvm/relay/op/_transform.py | 19 +++++++++++++++++ python/tvm/topi/transform.py | 22 +++++++++++++++++++ tests/python/relay/test_pass_fuse_ops.py | 27 ++++++++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 2fa806c07e11..a06aff11855b 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -341,6 +341,25 @@ def take_shape_func(attrs, inputs, out_ndims): return [_take_with_axis_shape_func(*inputs, convert(axis), out_ndims[0])] +@_reg.register_legalize("take") +def legalize_dyn_topk(attrs, inputs, types): + """Legalize take op. + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current op + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + return topi.take_legalize(attrs, inputs, types) + + @script def _argwhere_shape_func_1d(condition): out = output_tensor((2,), "int64") diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py index cdf9ce5c9275..6ddbc73e4666 100644 --- a/python/tvm/topi/transform.py +++ b/python/tvm/topi/transform.py @@ -426,6 +426,28 @@ def take(a, indices, axis=None, mode="clip"): return cpp.take(a, indices, int(axis), mode) +@tvm.target.generic_func +def take_legalize(attrs, inputs, types): + """Legalizes dyn.topk op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current op + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + if tvm.relay.ty.is_dynamic(types[0]): + return tvm.relay.take(tvm.relay.annotation.stop_fusion(inputs[0]), inputs[1], **attrs) + return None + + def gather(data, axis, indices): """Gather values along given axis from given indices. diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index a3146de55d5a..30ee29525daa 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import numpy as np + import tvm from tvm import relay from tvm.relay import transform @@ -757,6 +759,31 @@ def create_diamond_func(inp): assert tvm.ir.structural_equal(fused, expected) +def test_fuse_dynamic_squeeze_slice_take(): + input_data = [ + np.random.random([1, 2, 4]).astype("float32"), + np.array([0]).astype("int64"), + ] + + x = relay.var("p0107", shape=(relay.Any(), relay.Any(), 4), dtype="float32") + take_val = relay.var("p166", shape=(relay.Any(),), dtype="int64") + + squeeze = relay.op.squeeze(x, axis=[0]) + strided_slice = relay.op.strided_slice( + squeeze, begin=[0, 0], end=[15130, 9223372036854775807], strides=[1, 1] + ) + take = relay.op.take(strided_slice, take_val, axis=0) + + mod = tvm.IRModule.from_expr(take) + ex = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(), target="llvm") + + result = ex.evaluate()(*input_data) + + np_result = np.squeeze(input_data[0][:, input_data[1][0], :], axis=0) + + assert np.allclose(result.asnumpy(), np_result) + + if __name__ == "__main__": test_fuse_simple() test_conv2d_fuse() From 9cd8c6c8d9903b2aa0c7bec9a5550458da92a866 Mon Sep 17 00:00:00 2001 From: Daniel Steger Date: Thu, 26 Nov 2020 06:06:39 -0800 Subject: [PATCH 214/258] bumping vta version (#6977) --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index 87ce9acfae55..519263cc45de 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit 87ce9acfae550d1a487746e9d06c2e250076e54c +Subproject commit 519263cc45de40449702fd7942fa0a745297180f From 65e63ee5ad4ef69dbc4045195b7be14b681ff248 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Thu, 26 Nov 2020 06:09:22 -0800 Subject: [PATCH 215/258] Add Relay option to link parameters into runtime Modules (#6917) * refactor RPCSessionContext utils * Make TVMLogf platform-independent. * Some platforms need to use an alternate printf() to support basic things like %zu. Since %zu is platform-specific, we prefer to use a printf() that supports it or allow the platform to fix it up as needed. 
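For context, a minimal sketch of how the linked-params path is meant to be
driven from Python (a sketch only: `my_mod` and `my_params` are placeholder
names, and `--link-params` refers to the boolean target attribute this patch
registers; nothing below is part of the patch itself):

    import tvm
    from tvm import relay
    from tvm.contrib import graph_runtime

    # Building with link-params embeds the constant tensors in the generated
    # module and emits a `_lookup_linked_param` PackedFunc keyed by storage_id.
    with tvm.transform.PassContext(opt_level=3):
        graph_json, lib, params = relay.build(
            my_mod, target="llvm --link-params", params=my_params)

    # The graph runtime consults `_lookup_linked_param` for each storage
    # entry before allocating pool memory, so no separate load_params step
    # is needed for the linked constants.
    module = graph_runtime.create(graph_json, lib, tvm.cpu(0))
    module.run()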
--- cmake/modules/StandaloneCrt.cmake | 1 + include/tvm/runtime/crt/error_codes.h | 16 + include/tvm/runtime/crt/graph_runtime.h | 16 +- .../tvm/runtime/crt/graph_runtime_module.h | 42 ++ include/tvm/runtime/crt/module.h | 8 + include/tvm/runtime/module.h | 4 + include/tvm/tir/function.h | 48 +++ python/tvm/micro/build.py | 23 +- python/tvm/micro/debugger.py | 2 +- python/tvm/micro/session.py | 73 +++- python/tvm/micro/transport/base.py | 2 +- python/tvm/target/target.py | 3 +- src/relay/backend/build_module.cc | 49 ++- src/relay/backend/graph_runtime_codegen.cc | 36 +- src/runtime/crt/Makefile | 2 +- src/runtime/crt/common/crt_runtime_api.c | 24 +- src/runtime/crt/common/memory.c | 13 +- src/runtime/crt/graph_runtime/graph_runtime.c | 89 +++- .../graph_runtime_module.c | 221 ++++++++++ src/runtime/crt/host/main.cc | 9 + .../internal/graph_runtime/graph_runtime.h | 10 +- .../graph/debug/graph_runtime_debug.cc | 15 +- src/runtime/graph/graph_runtime.cc | 87 +++- src/runtime/graph/graph_runtime.h | 25 +- src/runtime/graph/graph_runtime_factory.cc | 2 +- src/runtime/rpc/rpc_module.cc | 85 ++-- src/target/llvm/codegen_llvm.cc | 88 ++++ src/target/llvm/codegen_llvm.h | 12 + src/target/llvm/codegen_params.cc | 176 ++++++++ src/target/llvm/codegen_params.h | 49 +++ src/target/llvm/llvm_module.cc | 20 +- src/target/source/codegen_c_host.cc | 64 +++ src/target/source/codegen_c_host.h | 3 + src/target/source/codegen_params.cc | 248 +++++++++++ src/target/source/codegen_params.h | 52 +++ src/target/target_kind.cc | 2 + src/tir/ir/function.cc | 7 + tests/cpp/target_test.cc | 3 +- tests/python/unittest/test_crt.py | 1 + tests/python/unittest/test_link_params.py | 408 ++++++++++++++++++ .../unittest/test_target_codegen_llvm.py | 5 +- 41 files changed, 1927 insertions(+), 116 deletions(-) create mode 100644 include/tvm/runtime/crt/graph_runtime_module.h create mode 100644 src/runtime/crt/graph_runtime_module/graph_runtime_module.c create mode 100644 src/target/llvm/codegen_params.cc create mode 100644 src/target/llvm/codegen_params.h create mode 100644 src/target/source/codegen_params.cc create mode 100644 src/target/source/codegen_params.h create mode 100644 tests/python/unittest/test_link_params.py diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index 73c85d13e2ef..256ce2a48a6c 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -44,6 +44,7 @@ if(USE_MICRO) "src/runtime/crt/include *.h -> include" "src/runtime/crt/common *.c -> src/runtime/crt/common" "src/runtime/crt/graph_runtime *.c -> src/runtime/crt/graph_runtime" + "src/runtime/crt/graph_runtime_module *.c -> src/runtime/crt/graph_runtime_module" "src/runtime/crt/host crt_config.h -> src/runtime/crt/host" "src/runtime/crt/utvm_rpc_common *.cc -> src/runtime/crt/utvm_rpc_common" "src/runtime/crt/utvm_rpc_server *.cc -> src/runtime/crt/utvm_rpc_server" diff --git a/include/tvm/runtime/crt/error_codes.h b/include/tvm/runtime/crt/error_codes.h index 16d0e793848b..93a332a5924f 100644 --- a/include/tvm/runtime/crt/error_codes.h +++ b/include/tvm/runtime/crt/error_codes.h @@ -41,6 +41,9 @@ typedef enum { kTvmErrorCategoryWriteStream = 3, kTvmErrorCategorySession = 4, kTvmErrorCategoryPlatform = 5, + kTvmErrorCategoryGenerated = 6, + kTvmErrorCategoryGraphRuntime = 7, + kTvmErrorCategoryFunctionCall = 8, } tvm_crt_error_category_t; typedef enum { @@ -74,6 +77,19 @@ typedef enum { kTvmErrorPlatformMemoryManagerInitialized = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 1), 
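  // (Each tvm_crt_error_t value packs its category and code as (category << 8) | code.)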
kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2), + // Common error codes returned from generated functions. + kTvmErrorGeneratedInvalidStorageId = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGenerated, 0), + + // Graph runtime + kTvmErrorGraphModuleAlreadyCreated = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 0), + kTvmErrorGraphModuleBadContext = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 1), + kTvmErrorGraphModuleNoSuchInput = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 2), + + // Function Calls - common problems encountered calling functions. + kTvmErrorFunctionCallNumArguments = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 0), + kTvmErrorFunctionCallWrongArgType = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 1), + kTvmErrorFunctionCallNotImplemented = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 2), + // System errors are always negative integers; this mask indicates presence of a system error. // Cast tvm_crt_error_t to a signed integer to interpret the negative error code. kTvmErrorSystemErrorMask = (1 << (sizeof(int) * 4 - 1)), diff --git a/include/tvm/runtime/crt/graph_runtime.h b/include/tvm/runtime/crt/graph_runtime.h index d2eb3b7785e9..e8413aa1723d 100644 --- a/include/tvm/runtime/crt/graph_runtime.h +++ b/include/tvm/runtime/crt/graph_runtime.h @@ -61,14 +61,20 @@ typedef struct TVMGraphRuntime TVMGraphRuntime; * \brief Allocate a new GraphRuntime with vmalloc and initialize it. * * \param sym_json JSON-encoded graph. - * \param m TVM Module that exposes the functions to call. + * \param module_handle TVM Module that exposes the functions to call. * \param ctxs runtime execution context. */ -TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, const struct TVMModule* m, +TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle, const TVMContext* ctxs); int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name); +/*! + * \brief get number of input tensors allocated. + * \return integer number of tensors available to use. + */ +int TVMGraphRuntime_GetNumInputs(); + /*! * \brief set input to the graph based on name. * \param runtime The graph runtime. @@ -77,6 +83,12 @@ int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name); */ void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in); +/*! + * \brief get number of output tensors allocated. + * \return integer number of output tensors allocated. + */ +int TVMGraphRuntime_GetNumOutputs(); + /*! * \brief Return NDArray for given output index. * \param runtime The graph runtime. diff --git a/include/tvm/runtime/crt/graph_runtime_module.h b/include/tvm/runtime/crt/graph_runtime_module.h new file mode 100644 index 000000000000..04e9184c8b8d --- /dev/null +++ b/include/tvm/runtime/crt/graph_runtime_module.h @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_runtime.h + * \brief Tiny graph runtime that can run graph containing only tvm PackedFunc. + */ +#ifndef TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ +#define TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/*! + * \brief Register the "tvm.graph_runtime.create" constructor PackedFunc. + */ +tvm_crt_error_t TVMGraphRuntimeModule_Register(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ diff --git a/include/tvm/runtime/crt/module.h b/include/tvm/runtime/crt/module.h index 2359025f6fe1..7b124c4faa3a 100644 --- a/include/tvm/runtime/crt/module.h +++ b/include/tvm/runtime/crt/module.h @@ -39,6 +39,14 @@ typedef struct TVMModule { const TVMFuncRegistry* registry; } TVMModule; +/*! + * \brief Create a new module handle from the given TVMModule instance. + * \param mod The module instance to register. + * \param out_handle Pointer to recieve the newly-minted handle for this module. + * \return 0 on success, non-zero on error. + */ +int TVMModCreateFromCModule(const TVMModule* mod, TVMModuleHandle* out_handle); + /*! \brief Entry point for the system lib module. */ const TVMModule* TVMSystemLibEntryPoint(void); diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 0e7cd2b08784..04a5cf8bf25d 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -226,6 +226,10 @@ constexpr const char* tvm_global_barrier_state = "__tvm_global_barrier_state"; constexpr const char* tvm_prepare_global_barrier = "__tvm_prepare_global_barrier"; /*! \brief Placeholder for the module's entry function. */ constexpr const char* tvm_module_main = "__tvm_main__"; +/*! \brief Prefix for parameter symbols emitted into the main program. */ +constexpr const char* tvm_param_prefix = "__tvm_param__"; +/*! \brief A PackedFunc that looks up linked parameters by storage_id. */ +constexpr const char* tvm_lookup_linked_param = "_lookup_linked_param"; } // namespace symbol // implementations of inline functions. diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index 64dbb5cf8ec3..97ee7f7211d4 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -25,6 +25,7 @@ #define TVM_TIR_FUNCTION_H_ #include +#include #include #include #include @@ -150,6 +151,42 @@ class PrimFunc : public BaseFunc { TVM_DEFINE_OBJECT_REF_COW_METHOD(PrimFuncNode); }; +/*! + * \brief Describes one parameter that should be linked into the generated module. + * + * When parameters are to be linked in with generated code (i.e. on target_host-compatible + * backends), Relay attaches instances of this object to a global TIR function. Code-generators + * use the information contained in this node to include the parameter data in the generated + * module. + */ +class LinkedParamNode : public Object { + public: + /*! \brief Unique numeric identifier used by runtimes to lookup this parameter. */ + int64_t id; + + /*! \brief Parameter data which should get linked into the final module. 
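+   * The target codegen serializes this NDArray's contents directly into the emitted module.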
*/ + ::tvm::runtime::NDArray param; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("id", &id); + v->Visit("param", ¶m); + } + + static constexpr const char* _type_key = "tir.LinkedParam"; + TVM_DECLARE_FINAL_OBJECT_INFO(LinkedParamNode, Object); +}; + +/*! + * \brief Managed reference to LinkedParamNode. + */ +class LinkedParam : public ObjectRef { + public: + TVM_DLL LinkedParam(int64_t id, ::tvm::runtime::NDArray param); + + TVM_DEFINE_OBJECT_REF_METHODS(LinkedParam, ObjectRef, LinkedParamNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(LinkedParamNode); +}; + /*! * \brief PrimFunc specific attribute names. * @@ -192,6 +229,17 @@ constexpr const char* kNoAlias = "tir.noalias"; * \note There can only be one entry function per module. */ constexpr const char* kIsEntryFunc = "tir.is_entry_func"; + +/*! + * \brief Parameters used in the module that should be linked by the codegen. + * + * Type: Map + * + * \note This should be present only on a function named + * tvm::target::packed_func::kLookupLinkedParam. + */ +constexpr const char* kLinkedParams = "tir.linked_params"; + } // namespace attr } // namespace tir } // namespace tvm diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index d1a3c4163755..4aec9ea5ecbb 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -23,6 +23,8 @@ import re from tvm.contrib import utils +from .micro_library import MicroLibrary + _LOG = logging.getLogger(__name__) @@ -109,7 +111,13 @@ def default_options(target_include_dir): def build_static_runtime( - workspace, compiler, module, lib_opts=None, bin_opts=None, generated_lib_opts=None + workspace, + compiler, + module, + lib_opts=None, + bin_opts=None, + generated_lib_opts=None, + extra_libs=None, ): """Build the on-device runtime, statically linking the given modules. @@ -131,6 +139,12 @@ def build_static_runtime( The `options` parameter passed to compiler.library() when compiling the generated TVM C source module. + extra_libs : Optional[List[MicroLibrary|str]] + If specified, extra libraries to be compiled into the binary. If a MicroLibrary, it is + included into the binary directly. If a string, the path to a directory; all direct children + of this directory matching RUNTIME_SRC_REGEX are built into a library. These libraries are + placed before any common CRT libraries in the link order. + Returns ------- MicroBinary : @@ -150,7 +164,12 @@ def build_static_runtime( module.save(mod_src_path, "cc") libs = [] - for lib_src_dir in RUNTIME_LIB_SRC_DIRS: + for mod_or_src_dir in (extra_libs or []) + RUNTIME_LIB_SRC_DIRS: + if isinstance(mod_or_src_dir, MicroLibrary): + libs.append(mod_or_src_dir) + continue + + lib_src_dir = mod_or_src_dir lib_name = os.path.basename(lib_src_dir) lib_build_dir = workspace.relpath(f"build/{lib_name}") os.makedirs(lib_build_dir) diff --git a/python/tvm/micro/debugger.py b/python/tvm/micro/debugger.py index 8119940a018c..65cafe7e9c8a 100644 --- a/python/tvm/micro/debugger.py +++ b/python/tvm/micro/debugger.py @@ -272,7 +272,7 @@ def read(self, n, timeout_sec): raise base.IoTimeoutError() def close(self): - pass # Pipes closed by parent class. + pass # Pipes closed by parent class (DebugWrapperTransport calls stop() next). 
def transport(self): return self._Transport(self) diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 88bdf6cd8b5a..fba612b84d1f 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -154,6 +154,43 @@ def __exit__(self, exc_type, exc_value, exc_traceback): self.transport.__exit__(exc_type, exc_value, exc_traceback) +def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): + """Lookup a parameter that has been pre-linked into a remote (i.e. over RPC) Module. + + This function signature matches the signature built by + + Parameters + ---------- + mod : tvm.runtime.Module + The remote Module containing the pre-linked parameters. + storage_id : int + An integer identifying the pre-linked paramter to find + template_tensor : DLTensor + A DLTensor containing metadata that should be filled-in to the returned NDArray. This + function should mostly not inspect this, and just pass it along to + NDArrayFromRemoteOpaqueHandle. + ctx : TVMContext + The remote CPU context to be used with the returned NDArray. + + Returns + ------- + tvm.nd.NDArray : + NDArray containing the pre-linked parameter. + """ + try: + lookup_linked_param = mod.get_function("_lookup_linked_param") + except AttributeError: + return None + + remote_data = lookup_linked_param(storage_id) + if remote_data is None: + return None + + return get_global_func("tvm.rpc.NDArrayFromRemoteOpaqueHandle")( + mod, remote_data, template_tensor, ctx, lambda: None + ) + + def create_local_graph_runtime(graph_json_str, mod, ctx): """Create a local graph runtime driving execution on the remote CPU context given. @@ -175,4 +212,38 @@ def create_local_graph_runtime(graph_json_str, mod, ctx): """ device_type_id = [ctx.device_type, ctx.device_id] fcreate = get_global_func("tvm.graph_runtime.create") - return graph_runtime.GraphModule(fcreate(graph_json_str, mod, *device_type_id)) + return graph_runtime.GraphModule( + fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id) + ) + + +def create_local_debug_runtime(graph_json_str, mod, ctx, dump_root=None): + """Create a local debug runtime driving execution on the remote CPU context given. + + Parameters + ---------- + graph_json_str : str + A string containing the graph representation. + + mod : tvm.runtime.Module + The remote module containing functions in graph_json_str. + + ctx : tvm.Context + The remote CPU execution context. + + dump_root : Optional[str] + If given, passed as dump_root= to GraphModuleDebug. + + Returns + ------- + tvm.contrib.GraphRuntime : + A local graph runtime instance that executes on the remote device. 
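+        When dump_root is given, per-node debug output is written beneath it.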
+ """ + device_type_id = [ctx.device_type, ctx.device_id] + fcreate = get_global_func("tvm.graph_runtime_debug.create") + return debug_runtime.GraphModuleDebug( + fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id), + [ctx], + graph_json_str, + dump_root=dump_root, + ) diff --git a/python/tvm/micro/transport/base.py b/python/tvm/micro/transport/base.py index 07a6a6ac7fdc..fdc7e9b2afce 100644 --- a/python/tvm/micro/transport/base.py +++ b/python/tvm/micro/transport/base.py @@ -64,7 +64,7 @@ class IoTimeoutError(Exception): ) -def debug_transport_timeouts(session_start_retry_timeout_sec=0.0): +def debug_transport_timeouts(session_start_retry_timeout_sec=0): return TransportTimeouts( session_start_retry_timeout_sec=session_start_retry_timeout_sec, session_start_timeout_sec=0, diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index c919fc31e9aa..edbb0fa3792a 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -236,7 +236,8 @@ def micro(model="unknown", options=None): "stm32f746xx": ["-mcpu=cortex-m7", "-march=armv7e-m"], } opts = _merge_opts( - trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], options + trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], + options, ) # NOTE: in the future, the default micro target will be LLVM except when diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index ddea5456585b..82ac1c57018e 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -22,6 +22,7 @@ * \brief Code generation for TVM's graph runtime. */ #include +#include #include #include #include @@ -30,6 +31,7 @@ #include +#include "../../target/func_registry_generator.h" #include "../../target/source/codegen_source_base.h" #include "compile_engine.h" #include "utils.h" @@ -88,6 +90,17 @@ struct GraphCodegen { return ret; } + std::unordered_map GetParamIds() { + std::unordered_map ret; + auto names = CallFunc>("list_params_name", nullptr); + for (const auto& expr : names) { + // Implicit cast from runtime::String to std::string + std::string key = expr; + ret[key] = CallFunc("get_param_id", key); + } + return ret; + } + protected: tvm::runtime::Module mod; template @@ -443,16 +456,36 @@ class RelayBuildModule : public runtime::ModuleNode { auto lowered_funcs = graph_codegen_->GetIRModule(); - // When there is no lowered_funcs due to reasons such as optimization. - if (lowered_funcs.size() == 0) { - Target target_host = GetTargetHost(); + Target target_host = GetTargetHost(); + // If no target_host has been set, we choose a default one, which is + // llvm if "codegen.LLVMModuleCreate" is accessible. + const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); + if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); + + // Generate a placeholder function that attaches linked params as its arguments. + if (target_host->GetAttr("link-params").value_or(Bool(false))) { + CHECK(pf != nullptr) << "Unable to link-params with no target_host and no llvm codegen."; + auto param_ids = graph_codegen_->GetParamIds(); + auto link_params = Map(); + for (auto param : ret_.params) { + link_params.Set(param.first, tir::LinkedParam(param_ids[param.first], param.second)); + } - // If no target_host has been set, we choose a default one, which is - // llvm if "codegen.LLVMModuleCreate" is accessible. 
- const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); - if (!target_host.defined()) - target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); + Map dict; + dict.Set(tvm::tir::attr::kLinkedParams, link_params); + dict.Set(tvm::attr::kGlobalSymbol, String(::tvm::runtime::symbol::tvm_lookup_linked_param)); + DictAttrs attrs{dict}; + auto prim = tir::PrimFunc(Array(), tir::SeqStmt(Array()), VoidType(), + Map(), attrs); + if (lowered_funcs.find(target_host->str()) == lowered_funcs.end()) { + lowered_funcs.Set(target_host->str(), IRModule(Map({}))); + } + lowered_funcs[target_host->str()]->Add( + GlobalVar(::tvm::runtime::symbol::tvm_lookup_linked_param), prim); + } + // When there is no lowered_funcs due to reasons such as optimization. + if (lowered_funcs.size() == 0) { if (target_host.defined() && target_host->kind->name == "llvm") { // If we can decide the target is LLVM, we then create an empty LLVM module. ret_.mod = (*pf)(target_host->str(), "empty_module"); diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_runtime_codegen.cc index e24d18de931c..7ed150495104 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_runtime_codegen.cc @@ -56,7 +56,7 @@ struct LoweredOutput { std::string graph_json; Map lowered_funcs; Array external_mods; - std::unordered_map params; + std::unordered_map> params; }; /*! \brief Node types */ @@ -203,7 +203,12 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator>(); + for (auto param : params_) { + ret.params.emplace(std::make_pair( + param.first, + std::make_pair(static_cast(param_storage_ids_[param.first]), param.second))); + } for (auto& kv : lowered_funcs_) { if (ret.lowered_funcs.count(kv.first) == 0) { @@ -312,9 +317,12 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator(op); size_t index = params_.size(); std::string name = "p" + std::to_string(index); - params_[name] = op->data; auto node = GraphInputNode::make_node_ptr(name, GraphAttrs()); - return AddNode(node, expr); + auto to_return = AddNode(node, expr); + CHECK_EQ(to_return.size(), 1) << "Expected exactly 1 parameter node created"; + param_storage_ids_[name] = storage_device_map_[expr][0][0]->value; + params_[name] = op->data; + return to_return; } std::vector VisitExpr_(const TupleNode* op) override { @@ -531,8 +539,14 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator> var_map_; /*! \brief target device */ TargetsMap targets_; - /*! \brief params */ + /*! + * \brief parameters (i.e. ConstantNodes found in the graph). + * These are take as inputs to the GraphRuntime. + * Maps param name to a pair of storage_id and NDArray. At runtime, the storage_id can be + * used to lookup the parameter. + */ std::unordered_map params_; + std::unordered_map param_storage_ids_; /*! \brief plan memory of device result */ Map> storage_device_map_; /*! 
\brief lowered funcs */ @@ -581,8 +595,16 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { } else if (name == "get_param_by_name") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { String key = args[0]; - ICHECK_GT(this->output_.params.count(key), 0); - *rv = this->output_.params[key]; + auto it = this->output_.params.find(key); + CHECK(it != this->output_.params.end()) << "no such parameter " << key; + *rv = (*it).second.second; + }); + } else if (name == "get_param_id") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + String key = args[0]; + auto it = this->output_.params.find(key); + CHECK(it != this->output_.params.end()) << "no such parameter " << key; + *rv = (*it).second.first; }); } else if (name == "get_irmodule") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/crt/Makefile b/src/runtime/crt/Makefile index 8a24db4e8b2b..6e462431173f 100644 --- a/src/runtime/crt/Makefile +++ b/src/runtime/crt/Makefile @@ -65,7 +65,7 @@ $(notdir $(1)): $${BUILD_DIR}/lib$(notdir $(1)).a endef -LIBS = src/runtime/crt/common src/runtime/crt/graph_runtime src/runtime/crt/utvm_rpc_common src/runtime/crt/utvm_rpc_server +LIBS = src/runtime/crt/common src/runtime/crt/graph_runtime src/runtime/crt/graph_runtime_module src/runtime/crt/utvm_rpc_common src/runtime/crt/utvm_rpc_server $(foreach lib,$(LIBS),$(eval $(call LIB_template,$(lib)))) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index d6f78d9e3a03..f2d67ccfbeab 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -127,7 +127,7 @@ static TVMModuleHandle EncodeModuleHandle(tvm_module_index_t module_index) { return (TVMModuleHandle)((uintptr_t)(module_index | 0x8000)); } -static int TVMModCreateFromCModule(const TVMModule* mod, TVMModuleHandle* out_handle) { +int TVMModCreateFromCModule(const TVMModule* mod, TVMModuleHandle* out_handle) { tvm_module_index_t idx; for (idx = 0; idx < TVM_CRT_MAX_REGISTERED_MODULES; idx++) { @@ -229,17 +229,17 @@ int TVMFuncCall(TVMFunctionHandle func_handle, TVMValue* arg_values, int* type_c return func(arg_values, type_codes, num_args, ret_val, ret_type_code, resource_handle); } -static int FindFunctionOrSetAPIError(tvm_module_index_t module_index, - const TVMFuncRegistry* registry, const char* name, - TVMFunctionHandle* out) { +static tvm_crt_error_t FindFunctionOrSetAPIError(tvm_module_index_t module_index, + const TVMFuncRegistry* registry, const char* name, + TVMFunctionHandle* out) { tvm_function_index_t function_index; - if (TVMFuncRegistry_Lookup(registry, name, &function_index) != 0) { - TVMAPIErrorf("failed to get function: mod_index=%04" PRIx16 ", name=%s", module_index, name); - return -1; + tvm_crt_error_t err = TVMFuncRegistry_Lookup(registry, name, &function_index); + if (err != kTvmErrorNoError) { + return err; } *out = EncodeFunctionHandle(module_index, function_index); - return 0; + return kTvmErrorNoError; } int TVMFuncGetGlobal(const char* name, TVMFunctionHandle* out) { @@ -279,6 +279,14 @@ int ModuleGetFunction(TVMValue* args, int* type_codes, int num_args, TVMValue* r if (to_return == 0) { ret_type_codes[0] = kTVMPackedFuncHandle; + } else { + ret_value->v_handle = NULL; + } + + // NOTE: For compatibility with C++ runtime API, return no error (but NULL function) when the + // function lookup failed. 
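+  // Callers must therefore test ret_value->v_handle for NULL before invoking the handle.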
+ if (to_return == kTvmErrorFunctionNameNotFound) { + to_return = kTvmErrorNoError; } return to_return; diff --git a/src/runtime/crt/common/memory.c b/src/runtime/crt/common/memory.c index 68cad3645146..876c10efe3ea 100644 --- a/src/runtime/crt/common/memory.c +++ b/src/runtime/crt/common/memory.c @@ -151,8 +151,8 @@ void* MemoryManager_Alloc(MemoryManager* mgr, tvm_index_t size) { } vleak_size++; #if TVM_CRT_DEBUG > 1 - printf("allocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d\n", data, start, - ptable->max_pages, npage, vleak_size); + TVMLogf("allocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d\n", data, start, + ptable->max_pages, npage, vleak_size); #endif // TVM_CRT_DEBUG return data; } @@ -229,9 +229,8 @@ void* MemoryManager_Realloc(MemoryManager* mgr, void* ptr, tvm_index_t size) { vleak_size++; } #if TVM_CRT_DEBUG > 1 - printf("reallocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d, size=%" PRId64 - "\n", - data, start, mgr->ptable.max_pages, npage, vleak_size, size); + TVMLogf("reallocate: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d, size=%zu", data, + start, mgr->ptable.max_pages, npage, vleak_size, size); #endif // TVM_CRT_DEBUG return data; } @@ -251,8 +250,8 @@ void MemoryManager_Free(MemoryManager* mgr, void* ptr) { free_map->insert(free_map, p->num_pages, p); vleak_size--; #if TVM_CRT_DEBUG > 1 - printf("release: addr=%p, start=%" PRId64 "/%zu, npage=%" PRId64 ", vleak=%d\n", ptr, - entry->page.ptable_begin, mgr->ptable.max_pages, entry->page.num_pages, vleak_size); + TVMLogf("release: addr=%p, start=%" PRId64 "/%zu, npage=%zu, vleak=%d", ptr, + entry->page.ptable_begin, mgr->ptable.max_pages, entry->page.num_pages, vleak_size); #endif // TVM_CRT_DEBUG } diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_runtime/graph_runtime.c index a6cd77ad6a22..450272d8722b 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_runtime/graph_runtime.c @@ -539,6 +539,13 @@ uint32_t TVMGraphRuntime_GetEntryId(TVMGraphRuntime* runtime, uint32_t nid, uint return runtime->node_row_ptr[nid] + index; } +/*! + * \brief Get the number of input tensors allocated. + * \param runtime The graph runtime. + * \return the number of input tensors allocated. + */ +int TVMGraphRuntime_GetNumInputs(TVMGraphRuntime* runtime) { return runtime->input_nodes_count; } + /*! * \brief Get the input index given the name of input. * \param runtime The graph runtime. @@ -675,6 +682,13 @@ void TVMGraphRuntime_Run(TVMGraphRuntime* runtime) { } } +/*! + * \brief Get the number of output tensors allocated. + * \param runtime The graph runtime. + * \return the number of output tensors allocated. 
+ */ +int TVMGraphRuntime_GetNumOutputs(TVMGraphRuntime* runtime) { return runtime->outputs_count; } + int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTensor* out) { int status = 0; uint32_t nid = runtime->outputs[idx].node_id; @@ -693,8 +707,20 @@ int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTen } void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { + TVMPackedFunc lookup_linked_param; + int lookup_linked_param_valid; uint32_t idx; + { + TVMArgs temp_args; + temp_args.values[0].v_int64 = 0; + temp_args.tcodes[0] = kTVMArgInt; + temp_args.values_count = 1; + lookup_linked_param_valid = + (TVMPackedFunc_InitModuleFunc(&lookup_linked_param, runtime->module_handle, + "_lookup_linked_param", &temp_args) == 0); + } + // Grab saved optimization plan from graph. TVMGraphRuntimeGraphAttr* attrs = &(runtime->attrs); DLDataType* vtype = vmalloc(sizeof(DLDataType) * attrs->dltype_count); @@ -721,24 +747,47 @@ void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { if (sid >= pool_entry_count) { pool_entry_count = sid + 1; } + pool_entry[sid].entry_id = idx; pool_entry[sid].size = MAX(pool_entry[sid].size, bytes); pool_entry[sid].device_type = device_type; } // Allocate the space. for (idx = 0; idx < pool_entry_count; idx++) { - runtime->storage_pool = - vrealloc(runtime->storage_pool, sizeof(TVMNDArray) * (runtime->storage_pool_count + 1)); + runtime->storage_pool = vrealloc(runtime->storage_pool, sizeof(TVMGraphRuntimeStorageEntry) * + (runtime->storage_pool_count + 1)); TVMGraphRuntimePoolEntry pit = pool_entry[idx]; - int64_t shape[TVM_CRT_MAX_NDIM] = { - 0, - }; TVMContext ctx = runtime->ctxs[0]; - DLDataType dtype = {kDLFloat, 32, 1}; - shape[0] = (pit.size + 3) / 4; - runtime->storage_pool[runtime->storage_pool_count] = TVMNDArray_Empty(1, shape, dtype, ctx); - CHECK_NE(runtime->storage_pool[runtime->storage_pool_count].dl_tensor.data, 0, - "fail to create storage_pool with idx=%d\n", idx); + uint8_t did_find_linked_param = 0; + if (lookup_linked_param_valid) { + lookup_linked_param.args.values[0].v_int64 = idx; + CHECK_EQ(lookup_linked_param.Call(&lookup_linked_param), 0, "lookup_linked_param"); + + void* linked_param_data = lookup_linked_param.ret_value.values[0].v_handle; + if (linked_param_data != NULL) { + runtime->storage_pool[runtime->storage_pool_count].is_linked_param = 1; + DLTensor* tensor = &runtime->storage_pool[runtime->storage_pool_count].array.dl_tensor; + tensor->data = linked_param_data; + tensor->ctx = ctx; + tensor->ndim = attrs->ndim[pit.entry_id]; + tensor->shape = attrs->shape + idx * TVM_CRT_MAX_NDIM; + tensor->strides = NULL; + tensor->byte_offset = 0; + did_find_linked_param = 1; + } + } + if (did_find_linked_param == 0) { + int64_t shape[TVM_CRT_MAX_NDIM] = { + 0, + }; + DLDataType dtype = {kDLFloat, 32, 1}; + shape[0] = (pit.size + 3) / 4; + runtime->storage_pool[runtime->storage_pool_count].is_linked_param = 0; + runtime->storage_pool[runtime->storage_pool_count].array = + TVMNDArray_Empty(1, shape, dtype, ctx); + CHECK_NE(runtime->storage_pool[runtime->storage_pool_count].array.dl_tensor.data, 0, + "fail to create storage_pool with idx=%d\n", idx); + } runtime->storage_pool_count++; } @@ -751,7 +800,7 @@ void TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { uint32_t storage_id = attrs->storage_id[idx]; CHECK(storage_id < runtime->storage_pool_count); runtime->data_entry[idx] = - TVMNDArray_CreateView(&(runtime->storage_pool[storage_id]), + 
TVMNDArray_CreateView(&(runtime->storage_pool[storage_id].array), attrs->shape + idx * TVM_CRT_MAX_NDIM, attrs->ndim[idx], vtype[idx]); CHECK_NE(runtime->data_entry[idx].dl_tensor.data, 0, "fail to create for node with idx=%d, storage_id=%u\n", idx, storage_id); @@ -858,28 +907,28 @@ int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam* /*! * \brief Initialize the graph executor with graph and context. * \param graph_json The execution graph. - * \param module The module containing the compiled functions for the host + * \param module_handle The module containing the compiled functions for the host * processor. * \param ctxs The context of the host and devices where graph nodes will be * executed on. */ -void TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json, const TVMModule* module, - const TVMContext* ctxs) { +void TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json, + TVMModuleHandle module_handle, const TVMContext* ctxs) { JSONReader reader = JSONReader_Create(graph_json); TVMGraphRuntime_Load(runtime, &reader); JSONReader_Release(&reader); + runtime->module_handle = module_handle; runtime->ctxs[0] = ctxs[0]; TVMGraphRuntime_SetupStorage(runtime); TVMGraphRuntime_SetupOpExecs(runtime); } -TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, const TVMModule* m, +TVMGraphRuntime* TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle, const TVMContext* ctxs) { - CHECK_EQ(vleak_size, 1, "memory leak checking won't work with concurrent CRT use"); TVMGraphRuntime* runtime = (TVMGraphRuntime*)vmalloc(sizeof(TVMGraphRuntime)); // NOLINT(*) memset(runtime, 0, sizeof(TVMGraphRuntime)); // init - TVMGraphRuntime_Init(runtime, sym_json, m, ctxs); + TVMGraphRuntime_Init(runtime, sym_json, module_handle, ctxs); return runtime; } @@ -892,7 +941,9 @@ void TVMGraphRuntime_Release(TVMGraphRuntime** pptr) { vfree(runtime->nodes); TVMGraphRuntimeGraphAttr_Release(&(runtime->attrs)); for (idx = 0; idx < runtime->storage_pool_count; ++idx) { - TVMNDArray_Release(&(runtime->storage_pool[idx])); + if (runtime->storage_pool[idx].is_linked_param == 0) { + TVMNDArray_Release(&(runtime->storage_pool[idx].array)); + } } for (idx = 0; idx < runtime->data_entry_count; ++idx) { vfree(runtime->data_entry[idx].dl_tensor.shape); @@ -909,6 +960,4 @@ void TVMGraphRuntime_Release(TVMGraphRuntime** pptr) { vfree(g_fexecs); g_fexecs = 0; } - - CHECK_EQ(vleak_size, 1, "found memory leak, leak size=%d", vleak_size - 1); } diff --git a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c new file mode 100644 index 000000000000..2a32a0251507 --- /dev/null +++ b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// LINT_C_FILE + +/*! + * \file graph_runtime_module.c + * \brief wrap graph_runtime into a TVMModule for use with RPC. + */ + +#include +#include +#include +#include + +#include "tvm/runtime/crt/internal/graph_runtime/graph_runtime.h" + +typedef struct { + TVMModule mod; + TVMGraphRuntime* runtime; +} GraphRuntimeModule; + +static GraphRuntimeModule graph_runtime; + +int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { + if (graph_runtime.runtime != NULL) { + return kTvmErrorGraphModuleAlreadyCreated; + } + + if (nargs != 4) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMStr || tcodes[1] != kTVMModuleHandle || tcodes[2] != kTVMArgInt || + tcodes[3] != kTVMArgInt) { + return kTvmErrorFunctionCallWrongArgType; + } + + if (args[2].v_int64 != kDLCPU || args[3].v_int64 != 0) { + return kTvmErrorGraphModuleBadContext; + } + + TVMContext ctx = {(DLDeviceType)args[2].v_int64, (int)args[3].v_int64}; + graph_runtime.runtime = TVMGraphRuntime_Create(args[0].v_str, args[1].v_handle, &ctx); + + TVMModuleHandle out; + int ret_value = TVMModCreateFromCModule(&graph_runtime.mod, &out); + if (ret_value != 0) { + ret_tcodes[0] = kTVMNullptr; + TVMGraphRuntime_Release(&graph_runtime.runtime); + return ret_value; + } + + ret_values[0].v_handle = out; + ret_tcodes[0] = kTVMModuleHandle; + return kTvmErrorNoError; +} + +int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { + if (nargs != 1) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMStr) { + return kTvmErrorFunctionCallWrongArgType; + } + + int index = TVMGraphRuntime_GetInputIndex(graph_runtime.runtime, args[0].v_str); + if (index < 0) { + return kTvmErrorGraphModuleNoSuchInput; + } + + uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, + graph_runtime.runtime->input_nodes[index], 0); + ret_values[0].v_handle = (void*)&graph_runtime.runtime->data_entry[eid].dl_tensor; + ret_tcodes[0] = kTVMNDArrayHandle; + return 0; +} + +int32_t TVMGraphRuntimeModule_GetNumInputs(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { + if (nargs != 0) { + return kTvmErrorFunctionCallNumArguments; + } + + ret_values[0].v_int64 = TVMGraphRuntime_GetNumInputs(); + ret_tcodes[0] = kTVMArgInt; + return 0; +} + +int32_t TVMGraphRuntimeModule_GetNumOutputs(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { + if (nargs != 0) { + return kTvmErrorFunctionCallNumArguments; + } + + ret_values[0].v_int64 = TVMGraphRuntime_GetNumOutputs(graph_runtime.runtime); + ret_tcodes[0] = kTVMArgInt; + return 0; +} + +int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { + if (nargs != 1) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMArgInt) { + return kTvmErrorFunctionCallWrongArgType; + } + + int output_index = args[0].v_int64; + if (output_index < 0 || output_index > TVMGraphRuntime_GetNumOutputs(graph_runtime.runtime)) { + return kTvmErrorGraphModuleNoSuchInput; + } + + uint32_t nid = graph_runtime.runtime->outputs[output_index].node_id; + uint32_t index = 
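/* output slot within the producing node */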
graph_runtime.runtime->outputs[output_index].index; + uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, nid, index); + + ret_values[0].v_handle = (void*)&(graph_runtime.runtime->data_entry[eid].dl_tensor); + ret_tcodes[0] = kTVMNDArrayHandle; + return 0; +} + +int32_t TVMGraphRuntimeModule_LoadParams(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { + if (nargs != 1) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMBytes) { + return kTvmErrorFunctionCallWrongArgType; + } + + ret_tcodes[0] = kTVMNullptr; + + TVMByteArray* arr = (TVMByteArray*)args[0].v_handle; + return TVMGraphRuntime_LoadParams(graph_runtime.runtime, arr->data, arr->size); +} + +int32_t TVMGraphRuntimeModule_Run(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { + if (nargs != 0) { + return kTvmErrorFunctionCallNumArguments; + } + + TVMGraphRuntime_Run(graph_runtime.runtime); + + ret_tcodes[0] = kTVMNullptr; + return 0; +} + +int32_t TVMGraphRuntimeModule_SetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { + if (nargs != 2) { + return kTvmErrorFunctionCallNumArguments; + } + + if (tcodes[0] != kTVMStr || tcodes[1] != kTVMDLTensorHandle) { + return kTvmErrorFunctionCallWrongArgType; + } + + TVMGraphRuntime_SetInput(graph_runtime.runtime, args[0].v_str, (DLTensor*)args[1].v_handle); + + ret_tcodes[0] = kTVMNullptr; + return 0; +} + +int32_t TVMGraphRuntimeModule_NotImplemented(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { + return kTvmErrorFunctionCallNotImplemented; +} + +static const TVMBackendPackedCFunc graph_runtime_registry_funcs[] = { + &TVMGraphRuntimeModule_GetInput, &TVMGraphRuntimeModule_GetNumInputs, + &TVMGraphRuntimeModule_GetNumOutputs, &TVMGraphRuntimeModule_GetOutput, + &TVMGraphRuntimeModule_LoadParams, &TVMGraphRuntimeModule_Run, + &TVMGraphRuntimeModule_SetInput, &TVMGraphRuntimeModule_NotImplemented, +}; + +static const TVMFuncRegistry graph_runtime_registry = { + "\x08get_input\0" + "get_num_inputs\0" + "get_num_outputs\0" + "get_output\0" + "load_params\0" + "run\0" + "set_input\0" + "share_params\0", + graph_runtime_registry_funcs}; + +tvm_crt_error_t TVMGraphRuntimeModule_Register() { + graph_runtime.mod.registry = &graph_runtime_registry; + graph_runtime.runtime = NULL; + + return TVMFuncRegisterGlobal("tvm.graph_runtime.create", &TVMGraphRuntimeModule_Create, 0); +} diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index 664dae7ab857..41f2dc3b0a1b 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -32,6 +32,10 @@ #include "crt_config.h" +#ifdef TVM_HOST_USE_GRAPH_RUNTIME_MODULE +#include +#endif + using namespace std::chrono; extern "C" { @@ -95,6 +99,11 @@ int main(int argc, char** argv) { utvm_rpc_server_t rpc_server = UTvmRpcServerInit(memory, sizeof(memory), 8, &UTvmWriteFunc, nullptr); +#ifdef TVM_HOST_USE_GRAPH_RUNTIME_MODULE + CHECK_EQ(TVMGraphRuntimeModule_Register(), kTvmErrorNoError, + "failed to register GraphRuntime TVMModule"); +#endif + if (TVMFuncRegisterGlobal("tvm.testing.reset_server", (TVMFunctionHandle)&testonly_reset_server, 0)) { fprintf(stderr, "utvm runtime: internal error registering global packedfunc; exiting\n"); diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h 
b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h index 7ea7a4f035c8..8e0faaa4f199 100644 --- a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h @@ -33,6 +33,7 @@ typedef struct TVMGraphRuntimePoolEntry { size_t size; int device_type; + int entry_id; } TVMGraphRuntimePoolEntry; // Node entry @@ -44,6 +45,12 @@ typedef struct TVMGraphRuntimeNodeEntry { void (*Load)(JSONReader* reader); } TVMGraphRuntimeNodeEntry; +// Storage entry. +typedef struct TVMGraphRuntimeStorageEntry { + uint8_t is_linked_param; + TVMNDArray array; +} TVMGraphRuntimeStorageEntry; + // Node typedef struct TVMGraphRuntimeNode { // operator type in string @@ -87,7 +94,7 @@ typedef struct TVMGraphRuntime { TVMContext ctxs[1]; uint32_t ctxs_count; /*! \brief Common storage pool for all devices. */ - TVMNDArray* storage_pool; + TVMGraphRuntimeStorageEntry* storage_pool; uint32_t storage_pool_count; /*! \brief Data entry of each node. */ TVMNDArray* data_entry; @@ -100,6 +107,7 @@ typedef struct TVMGraphRuntime { typedef DLTensor* DLTensorPtr; // private functions +uint32_t TVMGraphRuntime_GetEntryId(TVMGraphRuntime* runtime, uint32_t nid, uint32_t index); void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in); int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, const uint32_t param_size); diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index 3e9ff4f279e7..d02a6d9a0d64 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -202,9 +202,10 @@ PackedFunc GraphRuntimeDebug::GetFunction(const std::string& name, * \param ctxs All devices contexts. */ Module GraphRuntimeDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const std::vector& ctxs) { + const std::vector& ctxs, + PackedFunc lookup_linked_param_func) { auto exec = make_object(); - exec->Init(sym_json, m, ctxs); + exec->Init(sym_json, m, ctxs, lookup_linked_param_func); return Module(exec); } @@ -212,7 +213,15 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create").set_body([](TVMArgs args, ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " "at least 4, but it has " << args.num_args; - *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args)); + PackedFunc lookup_linked_param_func; + int ctx_start_arg = 2; + if (args[2].type_code() == kTVMPackedFuncHandle) { + lookup_linked_param_func = args[2]; + ctx_start_arg++; + } + + *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args, ctx_start_arg), + lookup_linked_param_func); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 21960d9d4b1b..9e1670e67fc0 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -64,14 +64,20 @@ void GraphRuntime::Run() { * processor. * \param ctxs The context of the host and devices where graph nodes will be * executed on. + * \param lookup_linked_param_func Linked parameter lookup function. 
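 * When it is nullptr, DefaultLookupLinkedParam is installed, which asks the module for the
 * _lookup_linked_param PackedFunc emitted by the codegen (if any).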
*/ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs) { + const std::vector& ctxs, PackedFunc lookup_linked_param_func) { std::istringstream is(graph_json); dmlc::JSONReader reader(&is); this->Load(&reader); module_ = module; ctxs_ = ctxs; + lookup_linked_param_ = lookup_linked_param_func; + if (lookup_linked_param_ == nullptr) { + lookup_linked_param_ = PackedFunc( + [this](TVMArgs args, TVMRetValue* rv) { this->DefaultLookupLinkedParam(args, rv); }); + } this->SetupStorage(); this->SetupOpExecs(); for (size_t i = 0; i < input_nodes_.size(); i++) { @@ -286,6 +292,43 @@ void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { this->SetupOpExecs(); } +void GraphRuntime::LinkedNDArrayDeleter(Object* container) { + // container is the NDArray::Container which needs to get deleted. + // The data member points to global const memory, so it does not need deleting. + delete static_cast(container); +} + +void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { + Module mod = args[0]; + int64_t storage_id = args[1]; + DLTensor* template_tensor = args[2]; + TVMContext ctx = args[3]; + // Get pre-linked parameter lookup function, if it was generated. When pf == nullptr, no linked + // params are present. + if (!module_lookup_linked_param_valid_) { + module_lookup_linked_param_ = + mod.GetFunction(::tvm::runtime::symbol::tvm_lookup_linked_param, true); + } + if (module_lookup_linked_param_ == nullptr) { + *rv = nullptr; + return; + } + + TVMRetValue opaque_handle = module_lookup_linked_param_(storage_id); + if (opaque_handle.type_code() == kTVMNullptr) { + *rv = nullptr; + return; + } + + std::vector shape_vec{template_tensor->shape, + template_tensor->shape + template_tensor->ndim}; + + std::unique_ptr container{new NDArray::Container( + static_cast(opaque_handle), shape_vec, template_tensor->dtype, ctx)}; + container->SetDeleter(GraphRuntime::LinkedNDArrayDeleter); + *rv = NDArray(GetObjectPtr(container.release())); +} + void GraphRuntime::SetupStorage() { // Grab saved optimization plan from graph. std::vector vtype; @@ -320,21 +363,37 @@ void GraphRuntime::SetupStorage() { ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type) << "The same pool entry cannot be assigned to multiple devices"; } + TVMRetValue lookup_rv; + { + std::vector shape_vec{attrs_.shape[i].begin(), attrs_.shape[i].end()}; + DLTensor template_tensor{nullptr, TVMContext{kDLCPU, 0}, static_cast(shape_vec.size()), + vtype[i], shape_vec.data(), nullptr, + 0}; + lookup_rv = lookup_linked_param_(module_, sid, &template_tensor, ctxs_[0]); + } + if (lookup_rv.type_code() != kTVMNullptr) { + pool_entry[sid].linked_param = lookup_rv; + } + pool_entry[sid].param_data_entry = i; pool_entry[sid].size = std::max(pool_entry[sid].size, bytes); pool_entry[sid].device_type = device_type; } // Allocate the space. for (const auto& pit : pool_entry) { - std::vector shape; // This for loop is very fast since there are usually only a couple of // devices available on the same hardware. const auto& cit = std::find_if(ctxs_.begin(), ctxs_.end(), [&pit](const TVMContext& c) { return pit.device_type == static_cast(c.device_type); }); TVMContext ctx = cit == ctxs_.end() ? 
ctxs_[0] : *cit; - shape.push_back(static_cast(pit.size + 3) / 4); - storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); + if (pit.linked_param.defined()) { + storage_pool_.push_back(pit.linked_param); + } else { + std::vector shape; + shape.push_back(static_cast(pit.size + 3) / 4); + storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); + } } // Assign the pooled entries. A unified memory pool is used to simplifiy @@ -346,6 +405,7 @@ void GraphRuntime::SetupStorage() { int storage_id = attrs_.storage_id[i]; ICHECK_LT(static_cast(storage_id), storage_pool_.size()); data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]); + const DLTensor* tmp = data_entry_[i].operator->(); data_alignment_[i] = details::GetDataAlignment(*tmp); } @@ -504,18 +564,19 @@ PackedFunc GraphRuntime::GetFunction(const std::string& name, } Module GraphRuntimeCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const std::vector& ctxs) { + const std::vector& ctxs, + const PackedFunc lookup_linked_param_func) { auto exec = make_object(); - exec->Init(sym_json, m, ctxs); + exec->Init(sym_json, m, ctxs, lookup_linked_param_func); return Module(exec); } // Get all context for the host and other runtime devices. -std::vector GetAllContext(const TVMArgs& args) { +std::vector GetAllContext(const TVMArgs& args, int ctx_start_arg) { // Reserve the first item as the fallback device. std::vector ret; TVMContext ctx; - for (int i = 2; i < args.num_args; i += 2) { + for (int i = ctx_start_arg; i < args.num_args; i += 2) { int dev_type = args[i]; ctx.device_type = static_cast(dev_type); ctx.device_id = args[i + 1]; @@ -533,8 +594,14 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime.create").set_body([](TVMArgs args, TVMRet ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " "at least 4, but it has " << args.num_args; - const auto& contexts = GetAllContext(args); - *rv = GraphRuntimeCreate(args[0], args[1], contexts); + PackedFunc lookup_linked_param_func; + int ctx_start_arg = 2; + if (args[2].type_code() == kTVMPackedFuncHandle) { + lookup_linked_param_func = args[2]; + ctx_start_arg++; + } + const auto& contexts = GetAllContext(args, ctx_start_arg); + *rv = GraphRuntimeCreate(args[0], args[1], contexts, lookup_linked_param_func); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index c08f5e671a08..81aa87d6ed90 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -94,10 +94,13 @@ class TVM_DLL GraphRuntime : public ModuleNode { * processor. * \param ctxs The context of the host and devices where graph nodes will be * executed on. + * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters + * by storage_id. If not given, linked parameters are looked-up using an internal implementation, + * which is not compatible with RPCModules. */ void Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs); + const std::vector& ctxs, const PackedFunc lookup_linked_param_func); /*! * \brief Get the input index given the name of input. 
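With this change "tvm.graph_runtime.create" accepts an optional PackedFunc in argument
slot 2: when that slot carries a PackedFunc type code it is treated as the linked-parameter
lookup function, and the (device_type, device_id) pairs begin one slot later. A minimal
Python sketch of the two call forms follows; it assumes the FFI's usual conversion of
Python callables to PackedFunc, and graph_json/lib are placeholders for a compiled graph
and module, not objects defined in this patch:

    import tvm

    create = tvm.get_global_func("tvm.graph_runtime.create")

    # Classic form: graph JSON, module, then (device_type, device_id) pairs.
    # rt = create(graph_json, lib, tvm.cpu(0).device_type, 0)

    # New form: an optional lookup function sits in front of the context args.
    def lookup_linked_param(module, storage_id, template_tensor, ctx):
        # Returning None selects the normal storage-pool allocation path.
        return None

    # rt = create(graph_json, lib, lookup_linked_param, tvm.cpu(0).device_type, 0)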
@@ -209,7 +212,10 @@ class TVM_DLL GraphRuntime : public ModuleNode {
   struct PoolEntry {
     size_t size;
     int device_type;
-    PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {}
+    int param_data_entry;
+    NDArray linked_param;
+    // PoolEntry(int s, int dev_type, void* pre_linked_param) :
+    //     size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {}
   };
   // Node entry
   struct NodeEntry {
@@ -390,6 +396,10 @@ class TVM_DLL GraphRuntime : public ModuleNode {
     }
     ICHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format";
   }
+  /*! \brief PackedFunc to look up a linked parameter from a local Module. */
+  void DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv);
+  /*! \brief Delete NDArray::Container with linked (i.e. static) data. */
+  static void LinkedNDArrayDeleter(Object* container);
   /*! \brief Setup the temporal storage */
   void SetupStorage();
   /*! \brief Setup the executors. */
@@ -437,9 +447,18 @@ class TVM_DLL GraphRuntime : public ModuleNode {
   std::vector<size_t> data_alignment_;
   /*! \brief Operator on each node. */
   std::vector<std::function<void()>> op_execs_;
+  /*! \brief Linked parameter lookup function. */
+  PackedFunc lookup_linked_param_;
+  /*! \brief Module's _lookup_linked_param function, used by DefaultLookupLinkedParam. */
+  PackedFunc module_lookup_linked_param_;
+  /*!
+   * \brief True when module_lookup_linked_param_ is valid.
+   * When the module does not include linked parameters, module_lookup_linked_param_ will be
+   * nullptr.
+   */
+  bool module_lookup_linked_param_valid_;
 };

-std::vector<TVMContext> GetAllContext(const TVMArgs& args);
+std::vector<TVMContext> GetAllContext(const TVMArgs& args, int ctx_start_arg);

 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc
index 632a25c987bc..2c055e16cc9f 100644
--- a/src/runtime/graph/graph_runtime_factory.cc
+++ b/src/runtime/graph/graph_runtime_factory.cc
@@ -97,7 +97,7 @@ void GraphRuntimeFactory::SaveToBinary(dmlc::Stream* stream) {

 Module GraphRuntimeFactory::RuntimeCreate(const std::vector<TVMContext>& ctxs) {
   auto exec = make_object<GraphRuntime>();
-  exec->Init(this->graph_json_, this->imports_[0], ctxs);
+  exec->Init(this->graph_json_, this->imports_[0], ctxs, PackedFunc());
   // set params
   SetParams(exec.get(), this->params_);
   return Module(exec);
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 165c0fe73b36..4f721e122a4c 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -22,6 +22,7 @@
  * \brief RPC runtime module.
  */
 #include <tvm/runtime/container.h>
+#include <tvm/runtime/device_api.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/registry.h>

@@ -36,6 +37,44 @@
 namespace tvm {
 namespace runtime {

+// deleter of RPC remote array
+static void RemoteNDArrayDeleter(Object* obj) {
+  auto* ptr = static_cast<NDArray::Container*>(obj);
+  RemoteSpace* space = static_cast<RemoteSpace*>(ptr->dl_tensor.data);
+  if (ptr->manager_ctx != nullptr) {
+    space->sess->FreeHandle(ptr->manager_ctx, kTVMNDArrayHandle);
+  }
+  delete space;
+  delete ptr;
+}
+
+/*!
+ * \brief Build a local NDArray with remote backing storage.
+ * \param sess the RPCSession which owns the given handle.
+ * \param handle A pointer valid on the remote end which should form the `data` field of the
+ * underlying DLTensor.
+ * \param template_tensor An empty DLTensor whose shape and dtype fields are used to fill the newly
+ * created array. Needed because it's difficult to pass a shape vector as a PackedFunc arg.
+ * \param ctx Remote context used with this tensor. Must have non-zero RPCSessMask.
+ * \param remote_ndarray_handle The handle returned by RPC server to identify the NDArray.
+ */ +NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* handle, + DLTensor* template_tensor, TVMContext ctx, + void* remote_ndarray_handle) { + ICHECK_EQ(sess->table_index(), GetRPCSessionIndex(ctx)) + << "The TVMContext given does not belong to the given session"; + RemoteSpace* space = new RemoteSpace(); + space->sess = sess; + space->data = handle; + std::vector shape_vec{template_tensor->shape, + template_tensor->shape + template_tensor->ndim}; + NDArray::Container* data = new NDArray::Container(static_cast(space), std::move(shape_vec), + template_tensor->dtype, ctx); + data->manager_ctx = remote_ndarray_handle; + data->SetDeleter(RemoteNDArrayDeleter); + return NDArray(GetObjectPtr(data)); +} + /*! * \brief A wrapped remote function as a PackedFunc. */ @@ -113,41 +152,6 @@ class RPCWrappedFunc : public Object { << "Can not pass in context with a different remote session"; return RemoveRPCSessionMask(ctx); } - - // deleter of RPC remote array - static void RemoteNDArrayDeleter(Object* obj) { - auto* ptr = static_cast(obj); - RemoteSpace* space = static_cast(ptr->dl_tensor.data); - space->sess->FreeHandle(ptr->manager_ctx, kTVMNDArrayHandle); - delete space; - delete ptr; - } - - // wrap return value as remote NDArray. - NDArray WrapRemoteNDArray(DLTensor* tensor, void* nd_handle) const { - NDArray::Container* data = new NDArray::Container(); - data->manager_ctx = nd_handle; - data->SetDeleter(RemoteNDArrayDeleter); - RemoteSpace* space = new RemoteSpace(); - space->sess = sess_; - space->data = tensor->data; - data->dl_tensor.data = space; - NDArray ret(GetObjectPtr(data)); - // RAII now in effect - data->shape_ = std::vector(tensor->shape, tensor->shape + tensor->ndim); - data->dl_tensor.shape = dmlc::BeginPtr(data->shape_); - data->dl_tensor.ndim = static_cast(data->shape_.size()); - // setup dtype - data->dl_tensor.dtype = tensor->dtype; - // setup ctx, encode as remote session - data->dl_tensor.ctx = AddRPCSessionMask(tensor->ctx, sess_->table_index()); - // check strides. - ICHECK(tensor->strides == nullptr); - // setup byteoffset - data->dl_tensor.byte_offset = tensor->byte_offset; - - return ret; - } }; // RPC that represents a remote module session. 
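Hoisting the deleter and the wrapping logic out of RPCWrappedFunc into the free function
NDArrayFromRemoteOpaqueHandle (registered below as "tvm.rpc.NDArrayFromRemoteOpaqueHandle")
lets other callers, e.g. a linked-parameter lookup that runs over RPC, build a local
NDArray view over remote memory. A hedged sketch of driving the registered global from
Python; every argument name here is a placeholder, not an object defined in this patch:

    import tvm

    wrap = tvm.get_global_func("tvm.rpc.NDArrayFromRemoteOpaqueHandle")
    # rpc_mod: RPC module whose session owns remote_data; template_tensor: a
    # DLTensor donating shape and dtype; remote_ctx: TVMContext with the RPC
    # session mask set; nd_handle: remote handle used later to free the array.
    # arr = wrap(rpc_mod, remote_data, template_tensor, remote_ctx, nd_handle)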
@@ -280,7 +284,9 @@ void RPCWrappedFunc::WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) cons
     ICHECK_EQ(args.size(), 3);
     DLTensor* tensor = args[1];
     void* nd_handle = args[2];
-    *rv = WrapRemoteNDArray(tensor, nd_handle);
+    *rv = NDArrayFromRemoteOpaqueHandle(sess_, tensor->data, tensor,
+                                        AddRPCSessionMask(tensor->ctx, sess_->table_index()),
+                                        nd_handle);
   } else {
     ICHECK_EQ(args.size(), 2);
     *rv = args[1];
@@ -466,5 +472,12 @@ TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue*
   *rv = static_cast<RPCModuleNode*>(m.operator->())->sess()->table_index();
 });

+TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle")
+    .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx,
+                       void* ndarray_handle) -> NDArray {
+      return NDArrayFromRemoteOpaqueHandle(RPCModuleGetSession(mod), remote_array, template_tensor,
+                                           ctx, ndarray_handle);
+    });
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index faa483d019c0..d10ed311949c 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -25,6 +25,7 @@
 #include "codegen_llvm.h"

 #include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/crt/error_codes.h>
 #include <tvm/runtime/device_api.h>
 #include <algorithm>

@@ -32,7 +33,10 @@
 #include "../../arith/pattern_match.h"
 #include "../build_common.h"
+#include "../func_registry_generator.h"
 #include "codegen_cpu.h"
+#include "codegen_params.h"
+#include "llvm/Support/raw_os_ostream.h"

 namespace tvm {
 namespace codegen {
@@ -184,6 +188,90 @@ void CodeGenLLVM::AddFunctionInternal(const PrimFunc& f, bool ret_void) {
   }
 }

+void CodeGenLLVM::LinkParameters(const Map<String, LinkedParam> params) {
+  // It would be nice to de-dupe these declarations from src/tir/transforms/make_packed_api.cc,
+  // but they are at a different layer in the compiler...
+  std::vector<llvm::Type*> param_types;
+  // args
+  param_types.push_back(t_void_->getPointerTo(GetGlobalAddressSpace()));
+  // tcodes
+  param_types.push_back(t_int_->getPointerTo(GetGlobalAddressSpace()));
+  // num_args
+  param_types.push_back(t_int_);
+  // ret_args
+  param_types.push_back(t_void_->getPointerTo(GetGlobalAddressSpace()));
+  // ret_tcodes
+  param_types.push_back(t_int_->getPointerTo(GetGlobalAddressSpace()));
+  // resource_handle
+  param_types.push_back(t_void_->getPointerTo(GetGlobalAddressSpace()));
+
+  llvm::FunctionType* ftype = llvm::FunctionType::get(t_int_, param_types, false);
+
+  llvm::Function* function =
+      llvm::Function::Create(ftype, llvm::Function::ExternalLinkage,
+                             ::tvm::runtime::symbol::tvm_lookup_linked_param, module_.get());
+  function->setCallingConv(llvm::CallingConv::C);
+  function->setDLLStorageClass(llvm::GlobalValue::DLLStorageClassTypes::DLLExportStorageClass);
+
+  llvm::BasicBlock* entry = llvm::BasicBlock::Create(*ctx_, "entry", function);
+  builder_->SetInsertPoint(entry);
+  std::vector<llvm::Value*> zero_index_list{llvm::ConstantInt::get(t_int32_, 0)};
+  std::vector<llvm::Value*> zero_array_index_list{llvm::ConstantInt::get(t_int32_, 0),
+                                                  llvm::ConstantInt::get(t_int32_, 0)};
+  auto args_array = builder_->CreateBitCast(
+#if TVM_LLVM_VERSION >= 50
+      &function->arg_begin()[0],
+#else
+      &(*(function->arg_begin())),
+#endif
+      llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1));
+  llvm::Value* sid = builder_->CreateBitCast(
+      builder_->CreateLoad(t_void_->getPointerTo(GetGlobalAddressSpace()),
+                           builder_->CreateInBoundsGEP(args_array, zero_index_list)),
+      t_int64_);
+
+  llvm::BasicBlock* default_block = llvm::BasicBlock::Create(*ctx_, "default_block", function);
+  auto ret_types_array = builder_->CreateBitCast(
+#if TVM_LLVM_VERSION >= 50
+      &function->arg_begin()[4],
+#else
+      &(*(std::next(function->arg_begin(), 4))),
+#endif
+      llvm::ArrayType::get(t_int_, 1)->getPointerTo());
+  auto retval_array = builder_->CreateBitCast(
+#if TVM_LLVM_VERSION >= 50
+      &function->arg_begin()[3],
+#else
+      &(*std::next(function->arg_begin(), 3)),
+#endif
+      llvm::ArrayType::get(t_void_->getPointerTo(GetGlobalAddressSpace()), 1)->getPointerTo());
+  llvm::SwitchInst* switch_inst = builder_->CreateSwitch(sid, default_block, params.size() + 1);
+
+  builder_->SetInsertPoint(default_block);
+  builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMNullptr),
+                        builder_->CreateInBoundsGEP(ret_types_array, zero_array_index_list));
+  builder_->CreateRet(ConstInt32(kTvmErrorNoError));
+
+  // Add data to the global section.
+  for (auto kv : params) {
+    auto array = NDArrayToLLVMArray(ctx_, kv.second->param);
+    std::string symbol_name = std::string(::tvm::runtime::symbol::tvm_param_prefix) + kv.first;
+    llvm::GlobalVariable* param_symbol = new llvm::GlobalVariable(
+        *module_, array->getType(), true, llvm::GlobalValue::InternalLinkage, array, symbol_name);
+
+    llvm::BasicBlock* case_block = llvm::BasicBlock::Create(*ctx_, "case_" + symbol_name, function);
+    switch_inst->addCase(
+        llvm::cast<llvm::ConstantInt>(llvm::ConstantInt::get(t_int64_, kv.second->id)), case_block);
+    builder_->SetInsertPoint(case_block);
+    builder_->CreateStore(
+        builder_->CreatePointerCast(param_symbol, t_void_->getPointerTo(GetGlobalAddressSpace())),
+        builder_->CreateInBoundsGEP(retval_array, zero_array_index_list));
+    builder_->CreateStore(llvm::ConstantInt::get(t_int_, kTVMOpaqueHandle),
+                          builder_->CreateInBoundsGEP(ret_types_array, zero_array_index_list));
+    builder_->CreateRet(ConstInt32(0));
+  }
+}
+
 std::unique_ptr<llvm::Module> CodeGenLLVM::Finish() {
   this->AddStartupFunction();
   for (size_t i = 0; i < link_modules_.size(); ++i) {
diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h
index 78eb5e2dcac7..71583708da2c 100644
--- a/src/target/llvm/codegen_llvm.h
+++ b/src/target/llvm/codegen_llvm.h
@@ -98,6 +98,18 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
    * \param mod The module to be linked.
    */
   void AddLinkModule(std::unique_ptr<llvm::Module>&& mod);
+  /*!
+   * \brief Link parameters into the module so they don't need to be supplied at runtime.
+   * Parameters can be linked into the module so that the generated code is easier to use, or so
+   * that RAM space doesn't need to be allocated for them. This function adds the given parameters
+   * to the generated LLVM module.
+   * \param params Parameters to link, keyed by parameter name. Each map value carries the
+   * parameter's storage_id, which is used as the switch key in the generated
+   * tvm_lookup_linked_param function, plus the NDArray data to embed. The data is emitted into
+   * the module as internal global constants.
+   */
+  void LinkParameters(const Map<String, LinkedParam> params);
   /*!
    * \brief Create Value for expression e
    * \param e The expression to be created value for.
diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc
new file mode 100644
index 000000000000..694be5621606
--- /dev/null
+++ b/src/target/llvm/codegen_params.cc
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file codegen_params.cc
+ */
+#ifdef TVM_LLVM_VERSION
+
+#include "codegen_params.h"
+
+#include <algorithm>
+#include <type_traits>
+#include <vector>
+
+namespace tvm {
+namespace codegen {
+
+template <typename T, typename Enable = void>
+struct LLVMConstantGetter {
+  static llvm::Constant* getElement(llvm::Type* ty, T t);
+};
+
+template <typename T>
+struct LLVMConstantGetter<
+    T, std::enable_if_t<(std::is_integral<T>::value && std::is_signed<T>::value)>> {
+  static llvm::Constant* getElement(llvm::Type* ty, T t) {
+    return llvm::ConstantInt::getSigned(ty, t);
+  }
+};
+
+template <typename T>
+struct LLVMConstantGetter<
+    T, std::enable_if_t<(std::is_integral<T>::value && !std::is_signed<T>::value)>> {
+  static llvm::Constant* getElement(llvm::Type* ty, T t) { return llvm::ConstantInt::get(ty, t); }
+};
+
+template <typename T>
+struct LLVMConstantGetter<T, std::enable_if_t<std::is_floating_point<T>::value>> {
+  static llvm::Constant* getElement(llvm::Type* ty, T t) { return llvm::ConstantFP::get(ty, t); }
+};
+
+template <typename T, typename Enable = std::enable_if_t<std::is_arithmetic<T>::value>>
+void BuildLLVMVector(llvm::Type* element_type, void* tensor_data, size_t num_elements,
+                     std::vector<llvm::Constant*>* elements) {
+  elements->resize(num_elements, nullptr);
+  std::transform(static_cast<T*>(tensor_data), static_cast<T*>(tensor_data) + num_elements,
+                 elements->begin(),
+                 [&](T t) { return LLVMConstantGetter<T>::getElement(element_type, t); });
+}
+
+llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr) {
+  llvm::Type* element_type = nullptr;
+
+  auto arr_type = arr.DataType();
+  CHECK(arr.IsContiguous()) << "CodegenParams: only support contiguous arrays";
+  CHECK_EQ(arr->ctx.device_type, kDLCPU) << "CodegenParams: only support CPU arrays";
+  CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw "
+                                << arr_type.lanes();
+
+  auto shape = arr.Shape();
+  int num_elements = 1;
+  for (auto shape_elem : shape) {
+    num_elements *= shape_elem;
+  }
+
+  std::vector<llvm::Constant*> elements;
+
+  switch (arr_type.code()) {
+    case runtime::DataType::kInt:
+      CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
+            arr_type.bits() == 64)
+          << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
+          << arr_type.bits() << "-bit array";
+      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+
+      switch (arr_type.bits()) {
+        case 8:
+          BuildLLVMVector<int8_t>(element_type, arr->data, num_elements, &elements);
+          break;
+        case 16:
+          BuildLLVMVector<int16_t>(element_type, arr->data, num_elements, &elements);
+          break;
+        case 32:
+          BuildLLVMVector<int32_t>(element_type, arr->data, num_elements, &elements);
+          break;
+        case 64:
+          BuildLLVMVector<int64_t>(element_type, arr->data, num_elements, &elements);
+          break;
+        default:
+          ICHECK(false) << "should not get here";
+          break;
+      }
+      break;
+
+    case runtime::DataType::TypeCode::kUInt:
+      CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
+            arr_type.bits() == 64)
+          << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
+          << arr_type.bits() << "-bit array";
+      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+
+      switch (arr_type.bits()) {
+        case 8:
+          BuildLLVMVector<uint8_t>(element_type, arr->data, num_elements, &elements);
+          break;
+        case 16:
+          BuildLLVMVector<uint16_t>(element_type, arr->data, num_elements, &elements);
+          break;
+        case 32:
+          BuildLLVMVector<uint32_t>(element_type, arr->data, num_elements, &elements);
+          break;
+        case 64:
+          BuildLLVMVector<uint64_t>(element_type, arr->data, num_elements, &elements);
+          break;
+        default:
+          ICHECK(false) << "should not get here";
+          break;
+      }
+      break;
+
+    case runtime::DataType::TypeCode::kFloat:
+      switch (arr_type.bits()) {
+        case 16:
+          // NOTE: float16 is treated as uint16_t.
+          element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+          BuildLLVMVector<uint16_t>(element_type, arr->data, num_elements, &elements);
+          break;
+        case 32:
+          element_type = llvm::Type::getFloatTy(*ctx);
+          BuildLLVMVector<float>(element_type, arr->data, num_elements, &elements);
+          break;
+        case 64:
+          element_type = llvm::Type::getDoubleTy(*ctx);
+          BuildLLVMVector<double>(element_type, arr->data, num_elements, &elements);
+          break;
+        default:
+          CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw "
+                       << arr_type.bits() << "-bit array";
+          break;
+      }
+      break;
+
+    case runtime::DataType::TypeCode::kBFloat:
+      CHECK(arr_type.bits() == 16)
+          << "CodegenParams: only support 16-bit bfloat; saw " << arr_type.bits() << "-bit array";
+      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+      BuildLLVMVector<uint16_t>(element_type, arr->data, num_elements, &elements);
+      break;
+
+    default:
+      CHECK(false) << "Data type not supported";
+  }
+
+  return llvm::cast<llvm::ConstantArray>(llvm::ConstantArray::get(
+      llvm::ArrayType::get(element_type, num_elements), llvm::ArrayRef<llvm::Constant*>(elements)));
+}
+
+}  // namespace codegen
+}  // namespace tvm
+
+#endif  // TVM_LLVM_VERSION
diff --git a/src/target/llvm/codegen_params.h b/src/target/llvm/codegen_params.h
new file mode 100644
index 000000000000..771bc201f7aa
--- /dev/null
+++ b/src/target/llvm/codegen_params.h
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file codegen_params.h
+ */
+
+#ifndef TVM_TARGET_LLVM_CODEGEN_PARAMS_H_
+#define TVM_TARGET_LLVM_CODEGEN_PARAMS_H_
+
+#include <llvm/IR/Constants.h>
+#include <tvm/runtime/ndarray.h>
+
+#include "llvm_common.h"
+
+namespace tvm {
+namespace codegen {
+
+/*!
+ * \brief Convert an NDArray to an LLVM array of constants.
+ *
+ * The supplied NDArray is flattened, and each element is converted to the appropriate LLVM type.
+ *
+ * \param ctx LLVM context used to create the various primitive datatypes.
+ * \param arr NDArray to convert.
+ * \return LLVM array containing the array data.
+ */ +llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::NDArray arr); + +} // namespace codegen +} // namespace tvm + +#endif // TVM_TARGET_LLVM_CODEGEN_PARAMS_H_ diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index 569082022852..73a3594427d3 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -32,6 +32,7 @@ #include "../../runtime/file_utils.h" #include "../../runtime/library_module.h" +#include "../func_registry_generator.h" #include "codegen_blob.h" #include "codegen_llvm.h" #include "llvm_common.h" @@ -199,7 +200,21 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::vector funcs; std::string entry_func; + Map linked_params; + bool found_linked_params = false; + bool could_have_linked_params = target->GetAttr("link-params").value_or(Bool(false)); for (auto kv : mod->functions) { + if (could_have_linked_params && + kv.first->name_hint == ::tvm::runtime::symbol::tvm_lookup_linked_param) { + Map attrs_dict = + Downcast>(kv.second->attrs->dict); + CHECK(attrs_dict.find(::tvm::tir::attr::kLinkedParams) != attrs_dict.end()) + << "no " << ::tvm::tir::attr::kLinkedParams << " attribute found!"; + linked_params = + Downcast>(attrs_dict[::tvm::tir::attr::kLinkedParams]); + found_linked_params = true; + continue; + } ICHECK(kv.second->IsInstance()) << "Can only lower IR Module with PrimFuncs"; auto f = Downcast(kv.second); if (f->HasNonzeroAttr(tir::attr::kIsEntryFunc)) { @@ -209,7 +224,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { } funcs.push_back(f); } - ICHECK_NE(funcs.size(), 0U); + ICHECK(funcs.size() > 0 || (could_have_linked_params && found_linked_params)); // TODO(tqchen): remove the entry function behavior as it does not // makes sense when we start to use multiple modules. 
cg->Init("TVMMod", tm_.get(), ctx_.get(), system_lib, system_lib, target_c_runtime); @@ -222,6 +237,9 @@ class LLVMModuleNode final : public runtime::ModuleNode { cg->AddMainFunction(entry_func); } + if (found_linked_params) { + cg->LinkParameters(linked_params); + } module_ = cg->Finish(); module_->addModuleFlag(llvm::Module::Warning, "tvm_target", llvm::MDString::get(*ctx_, LLVMTargetToString(target))); diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 6ae11f4f9af8..0a19fc1399b7 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -23,6 +23,8 @@ #include "codegen_c_host.h" #include +#include +#include #include #include @@ -31,6 +33,7 @@ #include "../../support/str_escape.h" #include "../build_common.h" #include "../func_registry_generator.h" +#include "codegen_params.h" namespace tvm { namespace codegen { @@ -57,6 +60,48 @@ void CodeGenCHost::AddFunction(const PrimFunc& f) { CodeGenC::AddFunction(f); } +void CodeGenCHost::LinkParameters(Map params) { + PrintFuncPrefix(); + stream << " " << tvm::runtime::symbol::tvm_lookup_linked_param + << "(void* args, int* arg_type_ids, int num_args, void* out_ret_value, " + << "int* out_ret_tcode, void* resource_handle) {\n"; + ICHECK_EQ(GetUniqueName(tvm::runtime::symbol::tvm_lookup_linked_param), + tvm::runtime::symbol::tvm_lookup_linked_param) + << "builtin PackedFunc name already taken: " << tvm::runtime::symbol::tvm_lookup_linked_param; + stream << " switch (((int64_t*) args)[0]) {\n" + << " default:\n" + << " out_ret_tcode[0] = " << kTVMNullptr << ";\n" + << " return 0;\n"; + + function_names_.emplace_back(tvm::runtime::symbol::tvm_lookup_linked_param); + for (auto kv : params) { + decl_stream << "\n" + << "#ifdef __cplusplus\n" + << "extern \"C\" {\n" + << "#endif\n" + << "static const "; + int64_t num_elements = 1; + for (int64_t dim : kv.second->param.Shape()) { + num_elements *= dim; + } + PrintType(kv.second->param.DataType(), decl_stream); + decl_stream << " " << ::tvm::runtime::symbol::tvm_param_prefix << kv.first << "[" + << num_elements << "] = {\n"; + NDArrayDataToC(kv.second->param, 4, decl_stream); + decl_stream << "};\n" + << "#ifdef __cplusplus\n" + << "} // extern \"C\"\n" + << "#endif\n"; + stream << " case " << kv.second->id << ":\n" + << " ((uint64_t*)out_ret_value)[0] = (uint64_t) (uintptr_t) " + << ::tvm::runtime::symbol::tvm_param_prefix << kv.first << ";\n" + << " out_ret_tcode[0] = " << kTVMOpaqueHandle << ";\n" + << " return 0;\n"; + } + stream << " }\n" + << "}\n"; +} + void CodeGenCHost::PrintFuncPrefix() { // NOLINT(*) stream << "#ifdef __cplusplus\n" << "extern \"C\"\n" @@ -307,12 +352,31 @@ runtime::Module BuildCHost(IRModule mod, Target target) { CodeGenCHost cg; cg.Init(output_ssa, emit_asserts, target->str()); + Map linked_params; + bool found_linked_params = false; + bool could_have_linked_params = target->GetAttr("link-params").value_or(Bool(false)); for (auto kv : mod->functions) { + if (could_have_linked_params && + kv.first->name_hint == ::tvm::runtime::symbol::tvm_lookup_linked_param) { + Map attrs_dict = Downcast>(kv.second->attrs->dict); + CHECK(attrs_dict.find(::tvm::tir::attr::kLinkedParams) != attrs_dict.end()) + << "no " << ::tvm::tir::attr::kLinkedParams << " attribute found!"; + linked_params = + Downcast>(attrs_dict[::tvm::tir::attr::kLinkedParams]); + found_linked_params = true; + continue; + } + ICHECK(kv.second->IsInstance()) << "CodegenCHost: Can only take PrimFunc"; auto f = Downcast(kv.second); 
cg.AddFunction(f); } + if (could_have_linked_params) { + ICHECK(found_linked_params) << "-link-params given but none found"; + cg.LinkParameters(linked_params); + } + if (target->GetAttr("system-lib").value_or(Bool(false))) { ICHECK_EQ(target->GetAttr("runtime").value_or(""), "c") << "c target only supports generating C runtime SystemLibs"; diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h index 1bf378be1422..b54b6fbfcfeb 100644 --- a/src/target/source/codegen_c_host.h +++ b/src/target/source/codegen_c_host.h @@ -42,6 +42,9 @@ class CodeGenCHost final : public CodeGenC { void AddFunction(const PrimFunc& f); + /*! \brief Add linked parameters, if they are present. */ + void LinkParameters(Map params); + void PrintType(DataType t, std::ostream& os) final; // NOLINT(*) void PrintFuncPrefix() final; // NOLINT(*) void PrintFinalReturn() final; // NOLINT(*) diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc new file mode 100644 index 000000000000..cc7695abfd25 --- /dev/null +++ b/src/target/source/codegen_params.cc @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file codegen_params.cc + */ + +#include "codegen_params.h" + +#include + +#include +#include +#include +#include + +namespace tvm { +namespace codegen { + +/*! \brief maximum line length of generated parameters, including indent. */ +static constexpr const int kMaxLineLength = 80; + +static int ComputeNumElementsPerRow(int one_element_size_bytes, int indent_chars) { + if (one_element_size_bytes > kMaxLineLength - indent_chars) { + return 1; + } + // When multiple elements fit per line, divide the available space by the size of one element, + // and return the largest power of 2 less than the result. Using power-of-2-sized elements allows + // for easily traversing the generated code. + int elements_per_row = (kMaxLineLength - indent_chars) / one_element_size_bytes; + + // Implementation of fls. Iteratively clear the LSB until one bit remains. 
+  while ((elements_per_row & (elements_per_row - 1)) > 0) {
+    elements_per_row &= elements_per_row - 1;
+  }
+  return elements_per_row;
+}
+
+template <typename T, typename Enable = std::enable_if<std::is_integral<T>::value>>
+void PrintIntegralArray(void* data, size_t num_elements, int indent_chars, std::ostream& os) {
+  int one_element_size_bytes = (sizeof(T) / 4) + (2 /* "0x" */) + (2 /* ", " */);
+  if (std::is_signed<T>::value) {
+    one_element_size_bytes += 1;  // sign character
+    if (sizeof(T) == 64 / 8) {
+      one_element_size_bytes += 2;  // "LL"
+    }
+  } else {
+    if (sizeof(T) == 64 / 8) {
+      one_element_size_bytes += 3;  // "ULL"
+    }
+  }
+
+  int elements_per_row = ComputeNumElementsPerRow(one_element_size_bytes, indent_chars);
+  std::string indent_str(indent_chars, ' ');
+
+  for (size_t i = 0; i < num_elements; i++) {
+    if ((i % elements_per_row) == 0) {
+      if (i != 0) {
+        os << std::endl;
+      }
+      os << indent_str;
+    }
+    int64_t elem = static_cast<T*>(data)[i];
+    if (std::is_signed<T>::value) {
+      uint64_t to_print;
+      if (elem < 0) {
+        os << "-";
+        to_print = -elem;
+      } else {
+        os << "+";
+        to_print = elem;
+      }
+      os << "0x" << std::setw(sizeof(T) * 8 / 4) << static_cast<uint64_t>(to_print);
+      if (sizeof(T) == 64 / 8) {
+        os << "LL";
+      }
+    } else {
+      os << "0x" << std::setw(sizeof(T) * 8 / 4) << static_cast<uint64_t>(elem);
+      if (sizeof(T) == 64 / 8) {
+        os << "ULL";
+      }
+    }
+    if (i < num_elements - 1) {
+      os << ", ";
+    }
+  }
+
+  if ((num_elements % elements_per_row) != 0) {
+    os << "\n";
+  }
+}
+
+template <typename T, typename Enable = std::enable_if<std::is_floating_point<T>::value>>
+void PrintFloatingPointArray(void* data, size_t num_elements, int indent_chars, std::ostream& os) {
+  // Floats and doubles are printed as hex but cast.
+  int one_element_size_bytes = (sizeof(T) / 4) + (2 /* "0x" */) + (2 /* ", " */) + 1 /* sign */ +
+                               1 /* decimal point */ + 1 /* exponent sign */;
+  if (sizeof(T) == 64 / 8) {
+    one_element_size_bytes += 2; /* 4 decimal digits in exponent, relative to bits / 4 */
+  } else if (sizeof(T) == 32 / 8) {
+    one_element_size_bytes += 1; /* extra decimal digit in exponent, relative to bits / 4 */
+  }
+
+  int elements_per_row = ComputeNumElementsPerRow(one_element_size_bytes, indent_chars);
+  std::string indent_str(indent_chars, ' ');
+
+  std::stringstream ss;
+  if (std::is_signed<T>::value) {
+    ss.setf(std::ios::hex | std::ios::showbase | std::ios::fixed | std::ios::scientific,
+            std::ios::basefield | std::ios::showbase | std::ios::floatfield);
+  } else {
+    ss.setf(std::ios::hex | std::ios::fixed | std::ios::scientific,
+            std::ios::basefield | std::ios::showbase | std::ios::floatfield);
+  }
+  for (size_t i = 0; i < num_elements; i++) {
+    if ((i % elements_per_row) == 0) {
+      if (i != 0) {
+        os << std::endl;
+      }
+      os << indent_str;
+    }
+
+    T elem = static_cast<T*>(data)[i];
+    if (std::isinf(elem)) {
+      // C99 standard.
+      os << (elem < 0 ? "-" : " ") << std::setw(one_element_size_bytes - 1) << "INFINITY";
+    } else if (std::isnan(elem)) {
+      // GNU extension, implementation-dependent.
+      os << std::setw(one_element_size_bytes) << "NAN";
+    } else {
+      ss << elem;
+      os << std::setw(one_element_size_bytes) << ss.str();
+      ss.str("");
+    }
+    if (i < num_elements - 1) {
+      os << ", ";
+    }
+  }
+
+  if ((num_elements % elements_per_row) != 0) {
+    os << "\n";
+  }
+}
+
+void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os) {
+  auto arr_type = arr.DataType();
+  CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw "
+                                << arr_type.lanes();
+
+  auto shape = arr.Shape();
+  int num_elements = 1;
+  for (auto shape_elem : shape) {
+    num_elements *= shape_elem;
+  }
+
+  auto old_fmtflags = os.flags();
+  os.setf(std::ios::internal | std::ios::hex,
+          std::ios::adjustfield | std::ios::basefield | std::ios::showbase);
+  os.fill('0');
+  switch (arr_type.code()) {
+    case runtime::DataType::kInt:
+      CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
+            arr_type.bits() == 64)
+          << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
+          << arr_type.bits() << "-bit array";
+      if (arr_type.bits() == 8) {
+        PrintIntegralArray<int8_t>(arr->data, num_elements, indent_chars, os);
+      } else if (arr_type.bits() == 16) {
+        PrintIntegralArray<int16_t>(arr->data, num_elements, indent_chars, os);
+      } else if (arr_type.bits() == 32) {
+        PrintIntegralArray<int32_t>(arr->data, num_elements, indent_chars, os);
+      } else if (arr_type.bits() == 64) {
+        PrintIntegralArray<int64_t>(arr->data, num_elements, indent_chars, os);
+      } else {
+        CHECK(false) << "should not get here";
+      }
+      break;
+
+    case runtime::DataType::TypeCode::kUInt:
+      CHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
+            arr_type.bits() == 64)
+          << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
+          << arr_type.bits() << "-bit array";
+
+      if (arr_type.bits() == 8) {
+        PrintIntegralArray<uint8_t>(arr->data, num_elements, indent_chars, os);
+      } else if (arr_type.bits() == 16) {
+        PrintIntegralArray<uint16_t>(arr->data, num_elements, indent_chars, os);
+      } else if (arr_type.bits() == 32) {
+        PrintIntegralArray<uint32_t>(arr->data, num_elements, indent_chars, os);
+      } else if (arr_type.bits() == 64) {
+        PrintIntegralArray<uint64_t>(arr->data, num_elements, indent_chars, os);
+      } else {
+        CHECK(false) << "should not get here";
+      }
+      break;
+
+    case runtime::DataType::TypeCode::kFloat: {
+      os.fill(' ');
+      os.setf(std::ios::left, std::ios::adjustfield);
+      if (arr_type.bits() == 16) {
+        // NOTE: print types not widely supported by C as uint16_t.
+        PrintIntegralArray<uint16_t>(arr->data, num_elements, indent_chars, os);
+      } else if (arr_type.bits() == 32) {
+        PrintFloatingPointArray<float>(arr->data, num_elements, indent_chars, os);
+      } else if (arr_type.bits() == 64) {
+        PrintFloatingPointArray<double>(arr->data, num_elements, indent_chars, os);
+      } else {
+        CHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw "
+                     << arr_type.bits() << "-bit array";
+      }
+      break;
+    }
+
+    case runtime::DataType::TypeCode::kBFloat: {
+      // NOTE: print types not widely supported by C as uint16_t.
+      CHECK(arr_type.bits() == 16)
+          << "CodegenParams: only support generating 16-bit bfloat params; saw " << arr_type.bits()
+          << "-bit array";
+      PrintIntegralArray<uint16_t>(arr->data, num_elements, indent_chars, os);
+      break;
+    }
+
+    default:
+      CHECK(false) << "Data type not supported";
+  }
+
+  os.flags(old_fmtflags);
+}
+
+}  // namespace codegen
+}  // namespace tvm
diff --git a/src/target/source/codegen_params.h b/src/target/source/codegen_params.h
new file mode 100644
index 000000000000..cc126c767c58
--- /dev/null
+++ b/src/target/source/codegen_params.h
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file codegen_params.h
+ */
+
+#ifndef TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_
+#define TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_
+
+#include <tvm/runtime/ndarray.h>
+
+#include <iostream>
+
+namespace tvm {
+namespace codegen {
+
+/*!
+ * \brief Write a C representation of arr to os.
+ *
+ * This function generates a comma-separated, indented list of C integer literals suitable for use
+ * in an initializer. The NDArray is flattened and then the list is produced element by element.
+ * For the int16_t NDArray [-3, -2, -1, 0, 1, 2, 3, ...], and indent_chars = 4, the following
+ * output is produced:
+ *     -0x0003, -0x0002, -0x0001, +0x0000, +0x0001, +0x0002, +0x0003
+ *
+ * \param arr The array to generate
+ * \param indent_chars Number of chars to indent
+ * \param os Output stream where the array data should be written.
+ */ +void NDArrayDataToC(::tvm::runtime::NDArray arr, int indent_chars, std::ostream& os); + +} // namespace codegen +} // namespace tvm + +#endif // TVM_TARGET_SOURCE_CODEGEN_PARAMS_H_ diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 6bef8b3c5cd7..903c3dcfefb5 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -213,10 +213,12 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU) .add_attr_option("mfloat-abi") .add_attr_option("system-lib") .add_attr_option("runtime") + .add_attr_option("link-params", Bool(false)) .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("c", kDLCPU) .add_attr_option("system-lib") + .add_attr_option("link-params", Bool(false)) .add_attr_option("runtime") .add_attr_option("mcpu") .add_attr_option("march") diff --git a/src/tir/ir/function.cc b/src/tir/ir/function.cc index ef7f4f8e16dd..101d80a52ea1 100644 --- a/src/tir/ir/function.cc +++ b/src/tir/ir/function.cc @@ -28,6 +28,13 @@ namespace tvm { namespace tir { +LinkedParam::LinkedParam(int64_t id, ::tvm::runtime::NDArray param) { + auto n = make_object(); + n->id = id; + n->param = param; + data_ = std::move(n); +} + // Get the function type of a PrimFunc PrimFunc::PrimFunc(Array params, Stmt body, Type ret_type, Map buffer_map, DictAttrs attrs, Span span) { diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc index 3d528f821059..a422f12b04d7 100644 --- a/tests/cpp/target_test.cc +++ b/tests/cpp/target_test.cc @@ -147,8 +147,9 @@ TEST(TargetCreation, DeduplicateKeys) { ICHECK_EQ(target->keys.size(), 2U); ICHECK_EQ(target->keys[0], "cpu"); ICHECK_EQ(target->keys[1], "arm_cpu"); - ICHECK_EQ(target->attrs.size(), 1U); + ICHECK_EQ(target->attrs.size(), 2U); ICHECK_EQ(target->GetAttr("device"), "arm_cpu"); + ICHECK_EQ(target->GetAttr("link-params"), false); } int main(int argc, char** argv) { diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 3b5471d0bb8b..3d6923342652 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -28,6 +28,7 @@ import tvm import tvm.relay +import tvm.testing from tvm.topi.utils import get_const_tuple from tvm.topi.testing import conv2d_nchw_python diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py new file mode 100644 index 000000000000..7b6910b0ea57 --- /dev/null +++ b/tests/python/unittest/test_link_params.py @@ -0,0 +1,408 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
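The tests below build each linkable dtype twice, once with --link-params and once without,
and require matching outputs. Reduced to essentials, the user-facing flow they exercise is
roughly the following sketch; the tiny add-of-a-parameter model is illustrative only and
not part of the test fixtures:

    import numpy as np
    import tvm
    import tvm.relay
    from tvm.contrib import graph_runtime

    x = tvm.relay.var("x", shape=(2, 2), dtype="float32")
    p = tvm.relay.var("p", shape=(2, 2), dtype="float32")
    mod = tvm.IRModule.from_expr(tvm.relay.Function([x, p], x + p))
    params = {"p": np.ones((2, 2), dtype="float32")}

    with tvm.transform.PassContext(opt_level=3):
        lib = tvm.relay.build(mod, "llvm --runtime=c --system-lib --link-params", params=params)

    rt = graph_runtime.create(lib.graph_json, lib.lib, tvm.cpu(0))
    rt.set_input("x", np.full((2, 2), 2.0, "float32"))  # no set_input needed for "p"
    rt.run()
    print(rt.get_output(0))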
+import collections
+import ctypes
+import json
+import os
+import re
+import struct
+import sys
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.relay
+import tvm.testing
+from tvm.contrib import utils
+
+
+INPUT_SHAPE = (1, 3, 16, 16)
+
+
+KERNEL_SHAPE = (3, 3, 3, 3)
+
+
+# The data types that are linkable.
+LINKABLE_DTYPES = (
+    [f"uint{b}" for b in (8, 16, 32, 64)]
+    + [f"int{b}" for b in (8, 16, 32, 64)]
+    + ["float32", "float64"]
+)
+
+
+def dtype_info(dtype):
+    """Look up numpy type info for the given string dtype (of LINKABLE_DTYPES above)."""
+    if "int" in dtype:
+        return np.iinfo(getattr(np, dtype))
+    else:
+        return np.finfo(getattr(np, dtype))
+
+
+# Note: for debugging, set this to an integer (e.g. 1). Then all "random" tensors will become
+# predictable.
+RANDOM_TENSOR_START = None
+
+
+def _make_random_tensor(dtype, shape):
+    """Create a random test tensor with given shape and dtype."""
+    global RANDOM_TENSOR_START
+    if RANDOM_TENSOR_START is not None:
+        to_return = np.arange(
+            RANDOM_TENSOR_START, RANDOM_TENSOR_START + np.prod(shape), dtype=dtype
+        ).reshape(shape)
+        RANDOM_TENSOR_START += np.prod(shape)
+        return to_return
+
+    dinfo = dtype_info(dtype)
+    if "int" in dtype:
+        return np.random.randint(dinfo.min, dinfo.max, shape, dtype=dtype)
+    else:
+        to_return = np.random.uniform(0, dinfo.max, shape).astype(dtype)
+        np.reshape(to_return, np.prod(shape))[::2] *= -1
+        return to_return
+
+
+def _lookup_sid(graph, name):
+    """Look up the storage id of a named parameter.
+
+    Arguments
+    ---------
+    graph : dict
+        Parsed JSON graph.
+
+    name : str
+        Name of the tensor parameter to look up.
+
+    Returns
+    -------
+    int :
+        The storage_id of the parameter.
+    """
+    num_outputs_seen = 0
+    for i, n in enumerate(graph["nodes"]):
+        if n["name"] == name:
+            print("sid", name, graph["attrs"]["storage_id"][1], num_outputs_seen)
+            return graph["attrs"]["storage_id"][1][num_outputs_seen]
+        else:
+            if "attrs" in n and "num_outputs" in n["attrs"]:
+                num_outputs_seen += int(n["attrs"]["num_outputs"])
+            else:
+                num_outputs_seen += 1
+
+    raise KeyError(f"no such param: {name}")
+
+
+def _get_ctypes_dtype(dt):
+    """Return a ctypes c_* datatype given a string data type."""
+    if "int" in dt:
+        return getattr(ctypes, f"c_{dt}")
+    elif dt == "float32":
+        return ctypes.c_float
+    elif dt == "float64":
+        return ctypes.c_double
+    else:
+        assert False, f"unknown dtype: {dt}"
+
+
+def _verify_linked_param(dtype, lib, mod, graph, name):
+    """Directly read memory from the linked library to verify the linked parameter is correct."""
+    sid = _lookup_sid(graph, name)
+    # NOTE: query_imports=True because when loading a module from disk (i.e. for C backend),
+    # a GraphRuntimeFactory module is created instead of the module itself.
+    param_ptr = mod.get_function("_lookup_linked_param", True)(sid)
+    gen_param = lib.params[name]
+    arr_data = (_get_ctypes_dtype(dtype) * np.prod(gen_param.shape)).from_address(param_ptr.value)
+    arr = np.ndarray(shape=gen_param.shape, dtype=gen_param.dtype, buffer=arr_data, order="C")
+    if "int" in gen_param.dtype:
+        np.testing.assert_equal(gen_param.asnumpy(), arr)
+    else:
+        np.testing.assert_allclose(gen_param.asnumpy(), arr)
+    return dtype == gen_param.dtype
+
+
+def _make_mod_and_params(dtype):
+    """Create a Relay module and parameters to test the given datatype."""
+    param_decls = collections.OrderedDict()
+    param_init = {}
+
+    def _add_decl(name, dtype):
+        param_decls[name] = f"%{name} : Tensor[{KERNEL_SHAPE}, {dtype}]"
+        param_init[name] = _make_random_tensor(dtype, KERNEL_SHAPE)
+
+    # Add several parameters so that the number of parameters exceeds one, letting storage ids
+    # diverge from parameter indices (see the program comment in mod_lines below).
+    _add_decl(f"{dtype}_a", dtype)
+    _add_decl(f"{dtype}_b", dtype)
+
+    mod_lines = [
+        '#[version = "0.0.5"]',
+        f"def @main(%rand_input : Tensor[{INPUT_SHAPE}, {dtype}], { ', '.join(param_decls.values()) } ) {{",
+        # This program ensures that GraphPlanMemory alternates between the same two storage IDs for a
+        # while. In doing this, it ensures that param %{dtype}_b will be placed into the graph at an
+        # index unequal to its storage_id. This ensures that GraphRuntimeCodegen encodes the storage_id
+        # and not the parameter index into the graph.
+        (
+            f'  %0 = nn.conv2d(%rand_input, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", '
+            f'kernel_size=[3, 3], out_dtype="{dtype}");'
+        ),
+        (
+            f'  %1 = nn.conv2d(%0, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", '
+            f'kernel_size=[3, 3], out_dtype="{dtype}");'
+        ),
+        (
+            f'  %2 = nn.conv2d(%1, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", '
+            f'kernel_size=[3, 3], out_dtype="{dtype}");'
+        ),
+        (
+            f'  %3 = nn.conv2d(%2, %{dtype}_b, data_layout="NCHW", kernel_layout="OIHW", '
+            f'kernel_size=[3, 3], out_dtype="{dtype}");'
+        ),
+        "  %3",
+        "}",
+    ]
+
+    mod = tvm.parser.fromtext("\n".join(mod_lines))
+    return mod, param_init
+
+
+@tvm.testing.requires_llvm
+def test_llvm_link_params():
+    for dtype in LINKABLE_DTYPES:
+        mod, param_init = _make_mod_and_params(dtype)
+        rand_input = _make_random_tensor(dtype, INPUT_SHAPE)
+        main_func = mod["main"]
+        target = "llvm --runtime=c --system-lib --link-params"
+        with tvm.transform.PassContext(opt_level=3):
+            lib = tvm.relay.build(mod, target, params=param_init)
+            assert set(lib.params.keys()) == {"p0", "p1"}  # NOTE: op folded
+
+            print("graph", lib.graph_json)
+            graph = json.loads(lib.graph_json)
+            for p in lib.params:
+                assert _verify_linked_param(dtype, lib, lib.lib, graph, p)
+
+        # Wrap in function to explicitly deallocate the runtime.
+        def _run_linked(lib):
+            graph_json, mod, _ = lib
+            graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0))
+            graph_rt.set_input("rand_input", rand_input)  # NOTE: params not required.
+ graph_rt.run() + return graph_rt.get_output(0) + + linked_output = _run_linked(lib) + + with tvm.transform.PassContext(opt_level=3): + lib = tvm.relay.build(mod, "llvm --system-lib", params=param_init) + + def _run_unlinked(lib): + graph_json, mod, lowered_params = lib + graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt.set_input("rand_input", rand_input, **lowered_params) + graph_rt.run() + return graph_rt.get_output(0) + + unlinked_output = _run_unlinked(lib) + + if "int" in dtype: + np.testing.assert_equal(unlinked_output.asnumpy(), linked_output.asnumpy()) + else: + np.testing.assert_allclose(unlinked_output.asnumpy(), linked_output.asnumpy()) + + +def _get_c_datatype(dtype): + """Translate LINKABLE_DTYPES element to c datatype.""" + if "int" in dtype: + return f"{dtype}_t" + elif dtype == "float32": + return "float" + elif dtype == "float64": + return "double" + else: + assert False, f"unknown dtype {dtype}" + + +def _format_c_value(dtype, width, x): + if "int" in dtype: + hex_formatstr = f'{{:{"+" if dtype.startswith("int") else ""}#0{width}x}}' + return hex_formatstr.format(x) + elif "float" in dtype: + to_ret = float(x).hex() + if "inf" in to_ret: + return ("-" if x < 0 else "") + "INFINITY" + elif "nan" in to_ret: + return "NAN" + + before, after = to_ret.split("p") + return f'{before.rstrip("0")}p{after}' + else: + assert False, f"don't know dtype {dtype}" + + +HEX_NUM_RE = re.compile(r"[+\-]?(?:(?:0x[0-9A-Fa-f.p+-]+)|(?:INFINITY)|(?:NAN))") + + +def test_c_link_params(): + temp_dir = utils.tempdir() + for dtype in LINKABLE_DTYPES: + mod, param_init = _make_mod_and_params(dtype) + rand_input = _make_random_tensor(dtype, INPUT_SHAPE) + main_func = mod["main"] + target = "c --link-params" + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + lib = tvm.relay.build(mod, target, params=param_init) + assert set(lib.params.keys()) == {"p0", "p1"} # NOTE: op folded + + src = lib.lib.get_source() + lib.lib.save("test.c", "cc") + c_dtype = _get_c_datatype(dtype) + src_lines = src.split("\n") + param = lib.params["p0"].asnumpy().reshape(np.prod(KERNEL_SHAPE)) + param_def = f"static const {c_dtype} __tvm_param__p0[{np.prod(param.shape)}] = {{" + for i, line in enumerate(src_lines): + if line == param_def: + i += 1 + break + else: + assert False, f'did not find parameter definition "{param_def}":\n{src}' + + cursor = 0 + width = dtype_info(dtype).bits // 4 + 2 + if dtype.startswith("int"): + width += 1 # Account for sign + + while "};" not in src_lines[i]: + for match in HEX_NUM_RE.finditer(src_lines[i]): + assert match.group() == _format_c_value(dtype, width, param[cursor]), ( + f'p0 byte {cursor}: want "{_format_c_value(dtype, width, param[cursor])}" got ' + f'"{match.group(0)}"; full p0 follows:\n{src}' + ) + cursor += 1 + i += 1 + + assert cursor == np.prod(param.shape) + temp = utils.tempdir() + + # Need a unique name per library to avoid dlopen caching the lib load. + lib_path = temp_dir.relpath(f"test-{dtype}-linked.so") + lib["remove_params"]().export_library(lib_path) + lib_mod = tvm.runtime.load_module(lib_path) + + # lib_mod = lib_factory['default']() + graph = json.loads(lib.graph_json) + for p in lib.params: + _verify_linked_param(dtype, lib, lib_mod, graph, p) + + # Wrap in function to explicitly deallocate the runtime. + def _run_linked(lib_mod): + graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) + graph_rt.set_input("rand_input", rand_input) # NOTE: params not required. 
+ graph_rt.run() + + return graph_rt.get_output(0) + + linked_output = _run_linked(lib_mod) + + linked_params = lib.params + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + lib = tvm.relay.build(mod, "c", params=param_init) + _, _, params = lib + # Need a unique name per library to avoid dlopen caching the lib load. + lib_path = temp_dir.relpath(f"test-{dtype}-unlinked.so") + lib.export_library(lib_path) + lib_mod = tvm.runtime.load_module(lib_path) + + def _run_unlinked(lib_mod): + graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) + graph_rt.set_input("rand_input", rand_input, **params) + graph_rt.run() + return graph_rt.get_output(0) + + unlinked_output = _run_unlinked(lib_mod) + + if "int" in dtype: + np.testing.assert_equal(unlinked_output.asnumpy(), linked_output.asnumpy()) + else: + np.testing.assert_allclose(unlinked_output.asnumpy(), linked_output.asnumpy()) + + +@tvm.testing.requires_micro +def test_crt_link_params(): + import tvm.micro + + for dtype in LINKABLE_DTYPES: + mod, param_init = _make_mod_and_params(dtype) + rand_input = _make_random_tensor(dtype, INPUT_SHAPE) + main_func = mod["main"] + target = "c -mcpu=native --system-lib --runtime=c --link-params" + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + graph_json, lib, params = tvm.relay.build(mod, target, params=param_init) + assert set(params.keys()) == {"p0", "p1"} # NOTE: op folded + + workspace = tvm.micro.Workspace() + compiler = tvm.micro.DefaultCompiler(target=target) + opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host")) + opts["bin_opts"]["ldflags"].append("-DTVM_HOST_USE_GRAPH_RUNTIME_MODULE") + + micro_binary = tvm.micro.build_static_runtime( + # the x86 compiler *expects* you to give the exact same dictionary for both + # lib_opts and bin_opts. so the library compiler is mutating lib_opts and + # the binary compiler is expecting those mutations to be in bin_opts. + # TODO(weberlo) fix this very bizarre behavior + workspace, + compiler, + lib, + lib_opts=opts["bin_opts"], + bin_opts=opts["bin_opts"], + extra_libs=[ + os.path.join(tvm.micro.CRT_ROOT_DIR, m) + for m in ("graph_runtime_module", "graph_runtime") + ], + ) + + flasher_kw = { + "debug": False, + } + flasher = compiler.flasher(**flasher_kw) + with tvm.micro.Session(binary=micro_binary, flasher=flasher) as sess: + rpc_lib = sess.get_system_lib() + graph_rt = tvm.contrib.graph_runtime.create(graph_json, rpc_lib, sess.context) + + # NOTE: not setting params here. 
+ graph_rt.set_input("rand_input", rand_input) + graph_rt.run() + linked_output = graph_rt.get_output(0).asnumpy() + + with tvm.transform.PassContext(opt_level=3): + lib = tvm.relay.build(mod, "llvm --system-lib", params=param_init) + + def _run_unlinked(lib): + graph_json, mod, lowered_params = lib + graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt.set_input("rand_input", rand_input, **lowered_params) + graph_rt.run() + return graph_rt.get_output(0).asnumpy() + + unlinked_output = _run_unlinked(lib) + + if "int" in dtype: + np.testing.assert_equal(unlinked_output, linked_output) + else: + np.testing.assert_allclose(unlinked_output, linked_output) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index 3599493a74cb..162481bfdb6e 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -14,11 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import collections +import ctypes +import json import tvm import tvm.testing from tvm import te from tvm import topi -from tvm.contrib import utils, clang +from tvm.contrib import utils import numpy as np import ctypes import math From 8dbc3cb626929f408c121dd7242eb9affb577ad1 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 26 Nov 2020 16:00:04 +0000 Subject: [PATCH 216/258] Add initial support for quantized transpose convolution in Relay (#6899) * Add initial support for quantized transpose convolution in Relay This work is based on @jainris initial PR: https://github.com/apache/incubator-tvm/pull/6523 I added a relay.qnn.conv2d_transpose node. The strategy I followed is to convert to int16 and invoke nn.conv2d_transpose (which already exists in relay). Main changes: - The node declaration lives in relay/qnn/op/convolution_transpose.cc - Cast int8->int16 and subsequent offset removal is in tvm/relay/qnn/op/legalizations.py. 
- I added and tested the operator in the tflite front-end - I added a unit-test in Relay for qnn.conv2d_transpose Co-authored-by: Rishabh Jain * Fix linting * Addressing review comments Co-authored-by: Rishabh Jain --- python/tvm/relay/frontend/tflite.py | 72 +- python/tvm/relay/qnn/op/legalizations.py | 24 + python/tvm/relay/qnn/op/qnn.py | 112 +++ src/relay/qnn/op/convolution_transpose.cc | 154 +++++ tests/python/frontend/tflite/test_forward.py | 159 +++-- .../relay/test_op_qnn_conv2_transpose.py | 638 ++++++++++++++++++ 6 files changed, 1103 insertions(+), 56 deletions(-) create mode 100644 src/relay/qnn/op/convolution_transpose.cc create mode 100644 tests/python/relay/test_op_qnn_conv2_transpose.py diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 623aeee358a6..3572d35c6e3b 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -2736,7 +2736,7 @@ def convert_transpose_conv(self, op): # Weights weights_tensor_type = weights_tensor.tensor.Type() # weights tensor type should be UINT8 (quantization) or FLOAT32 - assert weights_tensor_type in (TensorType.UINT8, TensorType.FLOAT32) + assert weights_tensor_type in (TensorType.INT8, TensorType.UINT8, TensorType.FLOAT32) weight_tensor_type_str = self.get_tensor_type_str(weights_tensor_type) weight_value_ohwi = self.get_tensor_value(weights_tensor) # Relay kernel_layout should be OIHW @@ -2758,19 +2758,40 @@ def convert_transpose_conv(self, op): else: padding = (0, 0, 0, 0) - out = _op.nn.conv2d_transpose( - in_expr, - weight_expr_iohw, - strides=(stride_h, stride_w), - padding=padding, - channels=int(out_channels), - kernel_size=(int(kernel_h), int(kernel_w)), - data_layout="NHWC", - kernel_layout="OIHW", - out_dtype=output_tensor_type_str, - ) + if input_tensor.qnn_params: + input_zero_point = input_tensor.qnn_params["zero_point"] + kernel_zero_point = weights_tensor.qnn_params["zero_point"] + input_scale = input_tensor.qnn_params["scale"] + kernel_scale = weights_tensor.qnn_params["scale"] + out = _qnn.op.conv2d_transpose( + in_expr, + weight_expr_iohw, + input_zero_point, + kernel_zero_point, + input_scale, + kernel_scale, + strides=(stride_h, stride_w), + padding=padding, + channels=int(out_channels), + kernel_size=(int(kernel_h), int(kernel_w)), + data_layout="NHWC", + kernel_layout="OIHW", + out_dtype="int32", + ) + else: + out = _op.nn.conv2d_transpose( + in_expr, + weight_expr_iohw, + strides=(stride_h, stride_w), + padding=padding, + channels=int(out_channels), + kernel_size=(int(kernel_h), int(kernel_w)), + data_layout="NHWC", + kernel_layout="OIHW", + out_dtype=output_tensor_type_str, + ) - # if we have bias + # Checking if there is a fused bias if len(input_tensors) == 4: bias_tensor = input_tensors[3] bias_tensor_type = bias_tensor.tensor.Type() @@ -2783,6 +2804,31 @@ def convert_transpose_conv(self, op): channel_axis = 3 out = _op.nn.bias_add(out, bias_expr, axis=channel_axis) + if output_tensor.qnn_params: + # Calculate the intermediate scale and zero point of the int32 output. 
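Aside (not part of the patch): a worked example, with illustrative values, of the requantize arithmetic implemented below. Per the `qnn.conv2d_transpose` docstring later in this patch, the int32 accumulator sits on scale `input_scale * kernel_scale` with zero point 0, and `qnn.op.requantize` rescales it onto the output tensor's quantization parameters:

    data_scale, weight_scale = 0.5, 0.25
    acc_scale = data_scale * weight_scale  # 0.125, zero point 0
    acc = 400                              # hypothetical int32 accumulator value
    output_scale, output_zero_point = 2.0, 10
    q_out = round(acc * acc_scale / output_scale) + output_zero_point
    assert q_out == 35                     # 400 * 0.125 = 50.0; 50.0 / 2.0 + 10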
+ data_scale = input_tensor.qnn_params["scale"] + data_scale_val = get_scalar_from_constant(data_scale) + + weight_scale = weights_tensor.qnn_params["scale"] + # If weight scale is scalar, it is per-tensor quantization + if isinstance(weight_scale, float): + weight_scale_val = get_scalar_from_constant(weight_scale) + else: + weight_scale_val = get_tensor_from_constant(weight_scale) + + new_input_scale_val = data_scale_val * weight_scale_val + new_input_scale = relay.const(new_input_scale_val, "float32") + new_input_zero_point = relay.const(0, "int32") + + out = _qnn.op.requantize( + out, + input_scale=new_input_scale, + input_zero_point=new_input_zero_point, + output_scale=output_tensor.qnn_params["scale"], + output_zero_point=output_tensor.qnn_params["zero_point"], + out_dtype=output_tensor_type_str, + axis=3, + ) return out def convert_quantize(self, op): diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 3f151ebc01a5..d74b3d989270 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -32,6 +32,12 @@ def legalize_qnn_conv2d(attrs, inputs, types): return qnn_conv2d_legalize(attrs, inputs, types) +# Registering QNN Conv2DTranspose legalization function. +@reg.register_qnn_legalize("qnn.conv2d_transpose") +def legalize_qnn_conv2d_transpose(attrs, inputs, types): + return qnn_conv2d_transpose_legalize(attrs, inputs, types) + + # Registering QNN dense legalization function. @reg.register_qnn_legalize("qnn.dense") def legalize_qnn_dense(attrs, inputs, types): @@ -46,6 +52,24 @@ def qnn_conv2d_legalize(attrs, inputs, types): return None +# Generic QNN Conv2DTranspose legalization function. +@tvm.target.generic_func +def qnn_conv2d_transpose_legalize(attrs, inputs, types): + """Convert kernel and data to int16, subtract offsets upfront + and calls into relay.nn.conv2d_transpose.""" + + # Collect the input exprs. + data, kernel, input_zero_point, kernel_zero_point, _, _ = inputs + + shift_data = relay.subtract( + relay.cast(data, dtype="int16"), relay.cast(input_zero_point, "int16") + ) + shift_kernel = relay.subtract( + relay.cast(kernel, dtype="int16"), relay.cast(kernel_zero_point, "int16") + ) + return relay.nn.conv2d_transpose(shift_data, shift_kernel, **attrs) + + # Generic QNN Conv2D legalization function. @tvm.target.generic_func def qnn_dense_legalize(attrs, inputs, types): diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index 9a8f22bfb9bc..a5892f331f06 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -296,6 +296,118 @@ def conv2d( ) +def conv2d_transpose( + data, + weight, + input_zero_point, + kernel_zero_point, + input_scale, + kernel_scale, + strides=(1, 1), + padding=(0, 0), + dilation=(1, 1), + groups=1, + channels=None, + kernel_size=None, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="", + output_padding=(0, 0), + out_dtype="", +): + """This operator deconvolves quantized data with quantized kernel. The scale of + the output quantized tensor is the product of the kernel_scale and + input_scale of the input quantized tensors. The zero point of the output + quantized tensor is 0. By default, the dtype of output is int32. Please also + refer to Requantize operator to understand how to scale back the int32 + output to (u)int8. + + Parameters + ---------- + data : tvm.relay.Expr + The input data to the operator. + + weight : tvm.relay.Expr + The weight expressions. 
+ + input_zero_point: tvm.relay.Expr + The zero point of the data distribution. + + kernel_zero_point: tvm.relay.Expr + The zero point of the quantized_kernel distribution. + + input_scale: tvm.relay.Expr + The scale for the input tensor. The scale for the input tensor is + stored purely for convenience here. See more commentary below. + + kernel_scale: tvm.relay.Expr + The scale for the weight tensor. The scale for the weight tensor is + stored for access to this during relay. This information is not + needed in the pass pipeline after qnn.transpose_conv2d is lowered to the + sequence of steps as in nn.transpose_conv2d. See also input_scale in Requantize. + + strides : Tuple[int], optional + The strides of convolution. + + padding : Tuple[int], optional + The padding of convolution. + + dilation : Tuple[int], optional + Specifies the dilation rate to be used for dilated convolution. + + channels : int, optional + Number of output channels of this convolution. + + kernel_size : tuple of int, optional + The spatial dimensions of the convolution kernel. + + groups : int, optional + Number of groups for grouped convolution. + + data_layout : str, optional + Layout of the input. + + kernel_layout : str, optional + Layout of the weight. + + out_layout : Optional[str] + Layout of the output, by default, out_layout is the same as data_layout + + output_padding : Tuple[int], optional + Used to identify the padding within the output shape + (only used in training, where transpose_conv represents the gradient of a convolution ) + + out_dtype : str, optional + Specifies the output data type for mixed precision conv2d. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # convert 2-way padding to 4-way padding + padding = get_pad_tuple2d(padding) + return _make.conv2d_transpose( + data, + weight, + input_zero_point, + kernel_zero_point, + input_scale, + kernel_scale, + strides, + padding, + dilation, + groups, + channels, + kernel_size, + data_layout, + kernel_layout, + out_layout, + output_padding, + out_dtype, + ) + + def add( lhs, rhs, lhs_scale, lhs_zero_point, rhs_scale, rhs_zero_point, output_scale, output_zero_point ): diff --git a/src/relay/qnn/op/convolution_transpose.cc b/src/relay/qnn/op/convolution_transpose.cc new file mode 100644 index 000000000000..c7515b5904f1 --- /dev/null +++ b/src/relay/qnn/op/convolution_transpose.cc @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/convolution_transpose.cc + * \brief Property def of qnn transpose convolution operator. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include "../../op/nn/convolution.h" +#include "../../transforms/pattern_utils.h" +#include "../utils.h" + +namespace tvm { +namespace relay { +namespace qnn { + +// relay.op.qnn.conv2d_transpose + +inline Expr MakeQnnConv2DTranspose(Expr data, Expr weight, Expr input_zero_point, + Expr kernel_zero_point, Expr input_scale, Expr kernel_scale, + Array strides, Array padding, + Array dilation, int groups, IndexExpr channels, + Array kernel_size, std::string data_layout, + std::string kernel_layout, std::string out_layout, + Array output_padding, DataType out_dtype) { + auto attrs = make_object(); + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->dilation = std::move(dilation); + attrs->groups = groups; + attrs->channels = std::move(channels); + attrs->kernel_size = std::move(kernel_size); + attrs->data_layout = std::move(data_layout); + attrs->kernel_layout = std::move(kernel_layout); + attrs->out_layout = std::move(out_layout); + attrs->output_padding = std::move(output_padding); + attrs->out_dtype = std::move(out_dtype); + const Op& op = Op::Get("qnn.conv2d_transpose"); + return Call(op, {data, weight, input_zero_point, kernel_zero_point, input_scale, kernel_scale}, + Attrs(attrs), {}); +} + +Array> QnnConvTransposeInferCorrectLayout( + const Attrs& attrs, const Array& new_in_layouts, const Array& old_in_layouts, + const Array& old_in_types) { + // Use Relay Conv2D Infer correct layout. + auto layouts = ConvInferCorrectLayout(attrs, new_in_layouts, old_in_layouts, + old_in_types); + + // Fill the layouts of remaining input tensors - scales and zero points. The layouts of these + // tensors can be treated as channel layout. + Layout channel_layout = Layout("C"); + Array input_layouts = {layouts[0][0], layouts[0][1], channel_layout, + channel_layout, channel_layout, channel_layout}; + Array output_layouts = layouts[1]; + return {input_layouts, output_layouts}; +} + +bool QnnConv2DTransposeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + ICHECK_EQ(types.size(), 7); + const auto* data = types[0].as(); + const auto* weight = types[1].as(); + if (data == nullptr || weight == nullptr) return false; + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "Conv2DTransposeAttrs cannot be nullptr."; + ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) + << "Expected qnn conv2d type(int8, uint8) for input but was " << data->dtype; + ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8)) + << "Expected qnn conv2d type(int8, uint8) for weight but was " << weight->dtype; + ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32)) + << "Expected qnn conv2d type(int32, int16) for output but was " << param->out_dtype; + ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0."; + + // Check the types of scale and zero points. + ICHECK(IsScalarType(types[2], DataType::Int(32))); // input_zero_point + ICHECK(IsScalarType(types[3], DataType::Int(32))); // kernel_zero_point + ICHECK(IsScalarType(types[4], DataType::Float(32))); // input_scale + // Kernel scale can be a vector of length output_channels or a scalar. 
+ if (param->groups == 1) { + size_t axis = param->kernel_layout.find('O'); + ICHECK(axis != std::string::npos) << "Kernel layout attribute is not defined"; + AssignType(types[5], DataType::Float(32), weight->shape[axis], reporter); // kernel scale + } else { + // Here, total number of output channels depend on depth multiplier. + size_t o_axis = param->kernel_layout.find('O'); + size_t i_axis = param->kernel_layout.find('I'); + ICHECK(o_axis != std::string::npos || i_axis != std::string::npos) + << "Kernel layout attribute is not defined"; + AssignType(types[5], DataType::Float(32), weight->shape[i_axis] * weight->shape[o_axis], + reporter); // kernel scale + } + + // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay + // Conv2D infer type function. + Array tensor_types = {types[0], types[1], types[6]}; + return Conv2DTransposeRel(tensor_types, 3, attrs, reporter); +} + +RELAY_REGISTER_OP("qnn.conv2d_transpose") + .describe(R"code(Quantized transposed 2D convolution layer (sometimes called Deconvolution). +This operator deconvolves quantized weight with quantized data. The scale of the +output quantized tensor is the product of the weight_scale and input_scale of +the input quantized tensors. The zero point of the output quantized tensor is +0. By default, the dtype of output is int32. Please also refer to Requantize +operator to understand how to scale back the int32 output to (u)int8. +- **data**: This depends on the `layout` parameter. Input is 4D array of shape + (batch_size, in_channels, height, width) if `layout` is `NCHW`. +- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) +- **out**: This depends on the `layout` parameter. Output is 4D array of shape + (batch_size, channels, out_height, out_width) if `layout` is `NCHW`. 
+)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(6) + .add_argument("data", "Tensor", "The quantized input data tensor.") + .add_argument("weight", "Tensor", "The quantized weight tensor.") + .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.") + .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.") + .add_argument("weight_scale", "Tensor", "The quantization scale of the weight tensor.") + .add_argument("weight_zero_point", "Tensor", + "The quantization zero_point of the weight tensor.") + .set_support_level(11) + .add_type_rel("QnnConv2DTranspose", QnnConv2DTransposeRel) + .set_attr("TNonComputational", true) + .set_attr("FInferCorrectLayout", QnnConvTransposeInferCorrectLayout); + +TVM_REGISTER_GLOBAL("relay.qnn.op._make.conv2d_transpose").set_body_typed(MakeQnnConv2DTranspose); + +} // namespace qnn +} // namespace relay +} // namespace tvm diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index b7f3b91f4243..de962fea282f 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -1115,7 +1115,9 @@ def test_forward_convolution(): # --------------------- -def _test_transpose_conv(tensor_in_sizes, filter_in_sizes, output_shape, strides, padding): +def _test_transpose_conv( + tensor_in_sizes, filter_in_sizes, output_shape, strides, padding, quantized=False +): """ One iteration of transpose convolution with given shapes and attributes """ total_size_1 = 1 @@ -1124,53 +1126,124 @@ def _test_transpose_conv(tensor_in_sizes, filter_in_sizes, output_shape, strides total_size_1 *= s for s in filter_in_sizes: total_size_2 *= s - # Initializes the input tensor with array containing incrementing - # numbers from 1. - data_array = [f * 1.0 for f in range(1, total_size_1 + 1)] - filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)] with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype="float32") - in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype="float32") - strides = [1] + strides + [1] - # in_filter layout is HWOI - out = nn_ops.conv2d_transpose( - in_data, in_filter, output_shape=output_shape, strides=strides, padding=padding - ) - data_array = np.reshape(data_array, tensor_in_sizes).astype("float32") - compare_tflite_with_tvm(data_array, "Placeholder:0", [in_data], [out]) + if quantized: + # Initializes the input tensor with array containing incrementing + # numbers from 1. 
+ data_array = [max(f, 255) for f in range(1, total_size_1 + 1)] + filter_array = [max(f, 255) for f in range(1, total_size_2 + 1)] + data_array = np.reshape(data_array, tensor_in_sizes).astype("uint8") + filter_array = np.reshape(filter_array, filter_in_sizes).astype("uint8") + + in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype="float32", name="in_data") + inq_data = tf.quantization.fake_quant_with_min_max_args( + in_data, min=-100, max=100, name="q_data" + ) + input_range = {"q_data": (-100, 100)} + + in_filter = constant_op.constant( + filter_array, shape=filter_in_sizes, dtype="float32", name="in_filter" + ) + inq_filter = tf.quantization.fake_quant_with_min_max_args( + in_filter, min=-100, max=100, name="q_filter" + ) + + strides = [1] + strides + [1] + + out = nn_ops.conv2d_transpose( + inq_data, inq_filter, output_shape=output_shape, strides=strides, padding=padding + ) + out = tf.quantization.fake_quant_with_min_max_args(out, min=-100, max=100, name="out") + compare_tflite_with_tvm( + [data_array], ["q_data"], [inq_data], [out], quantized=True, input_range=input_range + ) + else: + # Initializes the input tensor with array containing incrementing + # numbers from 1. + data_array = [f * 1.0 for f in range(1, total_size_1 + 1)] + filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)] + + in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype="float32", name="in_data") + in_filter = constant_op.constant( + filter_array, shape=filter_in_sizes, dtype="float32", name="in_filter" + ) + strides = [1] + strides + [1] + # in_filter layout is HWOI + out = nn_ops.conv2d_transpose( + in_data, in_filter, output_shape=output_shape, strides=strides, padding=padding + ) + data_array = np.reshape(data_array, tensor_in_sizes).astype("float32") + compare_tflite_with_tvm([data_array], ["in_data"], [in_data], [out]) def test_forward_transpose_conv(): - # kernel 3x3, padding VALID - _test_transpose_conv([4, 32, 32, 16], [3, 3, 5, 16], [4, 34, 34, 5], [1, 1], "VALID") - _test_transpose_conv([1, 32, 32, 16], [3, 3, 5, 16], [1, 65, 65, 5], [2, 2], "VALID") - _test_transpose_conv([1, 32, 32, 16], [3, 3, 5, 16], [1, 65, 34, 5], [2, 1], "VALID") - - # kernel 3x3, padding SAME - _test_transpose_conv([4, 32, 32, 16], [3, 3, 5, 16], [4, 32, 32, 5], [1, 1], "SAME") - _test_transpose_conv([1, 32, 32, 16], [3, 3, 5, 16], [1, 64, 64, 5], [2, 2], "SAME") - _test_transpose_conv([1, 32, 32, 16], [3, 3, 5, 16], [1, 64, 32, 5], [2, 1], "SAME") - - # kernel 2x2, padding VALID - _test_transpose_conv([4, 32, 32, 16], [2, 2, 5, 16], [4, 33, 33, 5], [1, 1], "VALID") - _test_transpose_conv([1, 32, 32, 16], [2, 2, 5, 16], [1, 64, 64, 5], [2, 2], "VALID") - _test_transpose_conv([1, 32, 32, 16], [2, 2, 5, 16], [1, 64, 33, 5], [2, 1], "VALID") - - # kernel 2x2, padding SAME - _test_transpose_conv([4, 32, 32, 16], [2, 2, 5, 16], [4, 32, 32, 5], [1, 1], "SAME") - _test_transpose_conv([1, 32, 32, 16], [2, 2, 5, 16], [1, 64, 64, 5], [2, 2], "SAME") - _test_transpose_conv([1, 32, 32, 16], [2, 2, 5, 16], [1, 64, 32, 5], [2, 1], "SAME") - - # kernel 1x1, padding VALID - _test_transpose_conv([4, 32, 32, 16], [1, 1, 5, 16], [4, 32, 32, 5], [1, 1], "VALID") - _test_transpose_conv([1, 32, 32, 16], [1, 1, 5, 16], [1, 63, 63, 5], [2, 2], "VALID") - _test_transpose_conv([1, 32, 32, 16], [1, 1, 5, 16], [1, 63, 32, 5], [2, 1], "VALID") - - # kernel 1x1, padding SAME - _test_transpose_conv([4, 32, 32, 16], [1, 1, 5, 16], [4, 32, 32, 5], [1, 1], "SAME") - _test_transpose_conv([1, 32, 32, 16], [1, 1, 5, 16], [1, 63, 63, 5], 
[2, 2], "SAME") - _test_transpose_conv([1, 32, 32, 16], [1, 1, 5, 16], [1, 63, 32, 5], [2, 1], "SAME") + for quantized in [True, False]: + # kernel 3x3, padding VALID + _test_transpose_conv( + [4, 32, 32, 16], [3, 3, 5, 16], [4, 34, 34, 5], [1, 1], "VALID", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [3, 3, 5, 16], [1, 65, 65, 5], [2, 2], "VALID", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [3, 3, 5, 16], [1, 65, 34, 5], [2, 1], "VALID", quantized + ) + + # kernel 3x3, padding SAME + _test_transpose_conv( + [4, 32, 32, 16], [3, 3, 5, 16], [4, 32, 32, 5], [1, 1], "SAME", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [3, 3, 5, 16], [1, 64, 64, 5], [2, 2], "SAME", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [3, 3, 5, 16], [1, 64, 32, 5], [2, 1], "SAME", quantized + ) + + # kernel 2x2, padding VALID + _test_transpose_conv( + [4, 32, 32, 16], [2, 2, 5, 16], [4, 33, 33, 5], [1, 1], "VALID", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [2, 2, 5, 16], [1, 64, 64, 5], [2, 2], "VALID", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [2, 2, 5, 16], [1, 64, 33, 5], [2, 1], "VALID", quantized + ) + + # kernel 2x2, padding SAME + _test_transpose_conv( + [4, 32, 32, 16], [2, 2, 5, 16], [4, 32, 32, 5], [1, 1], "SAME", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [2, 2, 5, 16], [1, 64, 64, 5], [2, 2], "SAME", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [2, 2, 5, 16], [1, 64, 32, 5], [2, 1], "SAME", quantized + ) + + # kernel 1x1, padding VALID + _test_transpose_conv( + [4, 32, 32, 16], [1, 1, 5, 16], [4, 32, 32, 5], [1, 1], "VALID", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [1, 1, 5, 16], [1, 63, 63, 5], [2, 2], "VALID", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [1, 1, 5, 16], [1, 63, 32, 5], [2, 1], "VALID", quantized + ) + + # kernel 1x1, padding SAME + _test_transpose_conv( + [4, 32, 32, 16], [1, 1, 5, 16], [4, 32, 32, 5], [1, 1], "SAME", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [1, 1, 5, 16], [1, 63, 63, 5], [2, 2], "SAME", quantized + ) + _test_transpose_conv( + [1, 32, 32, 16], [1, 1, 5, 16], [1, 63, 32, 5], [2, 1], "SAME", quantized + ) ####################################################################### diff --git a/tests/python/relay/test_op_qnn_conv2_transpose.py b/tests/python/relay/test_op_qnn_conv2_transpose.py new file mode 100644 index 000000000000..a86f9e1c6a80 --- /dev/null +++ b/tests/python/relay/test_op_qnn_conv2_transpose.py @@ -0,0 +1,638 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +from tvm import te +import numpy as np +from tvm import relay +from tvm.relay import transform +from tvm.relay.testing import run_infer_type +from tvm.contrib import graph_runtime +from tvm.relay.testing.temp_op_attr import TempOpAttr + + +def get_ref_func( + data, + kernel, + input_zero_point, + kernel_zero_point, + input_scale, + kernel_scale, + kernel_size, + padding, + strides, + dilation, + data_layout, + kernel_layout, + out_dtype, + groups, + channels=None, +): + casted_data = relay.op.cast(data, "int32") + casted_kernel = relay.op.cast(kernel, "int32") + shifted_data = relay.op.subtract(casted_data, relay.const(input_zero_point, "int32")) + shifted_kernel = relay.op.subtract(casted_kernel, relay.const(kernel_zero_point, "int32")) + func = relay.op.nn.conv2d_transpose( + shifted_data, + shifted_kernel, + padding=padding, + strides=strides, + dilation=dilation, + groups=groups, + channels=channels, + kernel_size=kernel_size, + out_dtype=out_dtype, + data_layout=data_layout, + kernel_layout=kernel_layout, + ) + + func = relay.Function(relay.analysis.free_vars(func), func) + return func + + +def get_qnn_func( + data, + kernel, + input_zero_point, + kernel_zero_point, + input_scale, + kernel_scale, + kernel_size, + padding, + strides, + dilation, + data_layout, + kernel_layout, + out_dtype, + channels, + groups, +): + func = relay.qnn.op.conv2d_transpose( + data, + kernel, + input_zero_point=relay.const(input_zero_point, "int32"), + kernel_zero_point=relay.const(kernel_zero_point, "int32"), + input_scale=relay.const(input_scale, "float32"), + kernel_scale=relay.const(kernel_scale, "float32"), + kernel_size=kernel_size, + strides=strides, + dilation=dilation, + padding=padding, + out_dtype=out_dtype, + groups=groups, + channels=channels, + data_layout=data_layout, + kernel_layout=kernel_layout, + ) + + mod = relay.Function(relay.analysis.free_vars(func), func) + mod = tvm.IRModule.from_expr(mod) + return mod + + +def get_funcs( + data_shape, + data_dtype, + kernel_shape, + kernel_dtype, + input_zero_point, + kernel_zero_point, + input_scale, + kernel_scale, + kernel_size, + padding, + strides, + dilation, + data_layout, + kernel_layout, + out_dtype, + groups=1, + channels=None, +): + data = relay.var("data", shape=data_shape, dtype=data_dtype) + kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype) + + ref_func = get_ref_func( + data, + kernel, + input_zero_point, + kernel_zero_point, + input_scale, + kernel_scale, + kernel_size, + padding, + strides, + dilation, + data_layout, + kernel_layout, + out_dtype, + groups, + channels, + ) + ref_func = run_infer_type(ref_func) + ref_func = tvm.IRModule.from_expr(ref_func) + qnn_func = get_qnn_func( + data, + kernel, + input_zero_point, + kernel_zero_point, + input_scale, + kernel_scale, + kernel_size, + padding, + strides, + dilation, + data_layout, + kernel_layout, + out_dtype, + channels, + groups, + ) + + return (ref_func, qnn_func) + + +def verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype): + def get_inputs(data_shape, data_dtype, kernel_shape, kernel_dtype): + # Keeping inputs multiple of 4 because of a bug in Average Pool2d + # https://discuss.tvm.apache.org/t/pool2d-gives-bad-output-for-integer-inputs/3377 + low = -128 + high = 127 + if data_dtype == "uint8": + low = 0 + high = 255 + golden_data = np.random.randint(low=low, high=high, size=data_shape).astype(data_dtype) + low = -128 + high = 127 + if kernel_dtype == "uint8": + low = 0 + high = 255 + golden_weight = 
np.random.randint(low=low, high=high, size=kernel_shape).astype( + kernel_dtype + ) + return (golden_data, golden_weight) + + def get_output(func, golden_inputs): + with tvm.transform.PassContext(opt_level=2): + golden_data, golden_weight = golden_inputs + params = {"kernel": golden_weight} + graph, lib, params = relay.build(func, "llvm", params=params) + mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod.set_input("data", golden_data) + mod.set_input(**params) + mod.run() + res = mod.get_output(0).asnumpy() + return res + + golden_inputs = get_inputs(data_shape, data_dtype, kernel_shape, kernel_dtype) + golden_output = get_output(ref_func, golden_inputs) + qnn_output = get_output(qnn_func, golden_inputs) + np.testing.assert_equal(qnn_output, golden_output) + + +def test_no_zero_point(): + # uint8 input + data_shape = (2, 1, 2, 4) + data_dtype = "uint8" + kernel_shape = (1, 3, 2, 2) + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=0, + kernel_zero_point=0, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + # int8 input + data_shape = (2, 1, 2, 4) + data_dtype = "int8" + kernel_shape = (1, 3, 2, 2) + kernel_dtype = "int8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=0, + kernel_zero_point=0, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + +def test_kernel_zero_point(): + # uint8 input + data_shape = (2, 4, 2, 4) + data_dtype = "uint8" + kernel_shape = (4, 3, 2, 2) + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=0, + kernel_zero_point=1, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + # int8 input + data_shape = (2, 1, 2, 4) + data_dtype = "int8" + kernel_shape = (1, 3, 2, 2) + kernel_dtype = "int8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=0, + kernel_zero_point=5, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + +def test_input_zero_point(): + # uint8 input + data_shape = (2, 4, 2, 4) + data_dtype = "uint8" + kernel_shape = (4, 3, 2, 2) + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=5, + kernel_zero_point=0, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), 
+ padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + # int8 input + data_shape = (2, 4, 2, 4) + data_dtype = "int8" + kernel_shape = (4, 3, 2, 2) + kernel_dtype = "int8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=5, + kernel_zero_point=0, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + +def test_both_zero_point(): + # uint8 input + data_shape = (2, 4, 2, 4) + data_dtype = "uint8" + kernel_shape = (4, 3, 2, 2) + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=5, + kernel_zero_point=3, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + # int8 input + data_shape = (2, 4, 2, 4) + data_dtype = "int8" + kernel_shape = (4, 3, 2, 2) + kernel_dtype = "int8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=5, + kernel_zero_point=3, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + +def test_layout(): + # uint8 input + data_shape = (2, 2, 4, 4) # NHWC + data_dtype = "uint8" + kernel_shape = (2, 2, 3, 4) # HWIO + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=5, + kernel_zero_point=3, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + data_shape = (2, 2, 4, 3) # NHWC + data_dtype = "uint8" + kernel_shape = (2, 2, 1, 3) # HWIO + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=5, + kernel_zero_point=3, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + +def test_padding(): + # uint8 input + data_shape = (1, 4, 2, 2) + data_dtype = "uint8" + kernel_shape = (4, 3, 2, 2) + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=8, + kernel_zero_point=5, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 
2), + padding=(1, 1), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + # Try different layout + data_shape = (2, 2, 4, 4) # NHWC + data_dtype = "uint8" + kernel_shape = (2, 2, 3, 4) # HWIO + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=8, + kernel_zero_point=3, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(1, 1), + strides=(1, 1), + dilation=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + # Try asymmetric padding + data_shape = (2, 8, 6, 4) # NHWC + data_dtype = "uint8" + kernel_shape = (2, 2, 3, 4) # HWIO + kernel_dtype = "uint8" + ref_func, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=8, + kernel_zero_point=3, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(2, 2), + padding=(1, 1, 2, 2), + strides=(1, 1), + dilation=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="int32", + ) + verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) + + +def test_const_folding(): + data_shape = (2, 4, 2, 4) + data_dtype = "uint8" + kernel_shape = (4, 3, 2, 2) + kernel_dtype = "uint8" + + golden_weight = np.random.randint(low=0, high=255, size=kernel_shape).astype(kernel_dtype) + data = relay.var("data", shape=data_shape, dtype=data_dtype) + kernel = relay.const(golden_weight) + qnn_func = get_qnn_func( + data, + kernel, + input_zero_point=8, + kernel_zero_point=3, + kernel_size=(2, 2), + input_scale=1.0, + kernel_scale=1.0, + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + channels=kernel_shape[1], + groups=1, + ) + folded_mod = transform.FoldConstant()(qnn_func) + folded_func = folded_mod["main"] + assert "reshape" not in folded_func.astext() + + +def test_broadcast_layout(): + # Test broadcast support for NHWC layout. 
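Aside (not part of the patch): a NumPy illustration of the two broadcast forms this test exercises on an NHWC result. A rank-1 bias of length 64 broadcasts against the trailing channel axis from either side of the add, while a full-rank bias adds elementwise:

    import numpy as np

    out = np.zeros((1, 233, 233, 64), dtype="int32")   # conv output, NHWC
    bias = np.ones((64,), dtype="int32")               # broadcasts over N, H, W
    bias2 = np.ones((1, 233, 233, 64), dtype="int32")  # elementwise
    res = bias + (bias2 + (out + bias2)) + bias        # lhs and rhs positions
    assert res.shape == (1, 233, 233, 64)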
+ data_shape = (1, 229, 229, 3) # NHWC + data_dtype = "uint8" + kernel_shape = (7, 7, 64, 3) # HWIO + kernel_dtype = "int8" + _, qnn_func = get_funcs( + data_shape=data_shape, + data_dtype=data_dtype, + kernel_shape=kernel_shape, + kernel_dtype=kernel_dtype, + input_zero_point=8, + kernel_zero_point=3, + input_scale=1.0, + kernel_scale=1.0, + kernel_size=(7, 7), + padding=(1, 1), + strides=(1, 1), + dilation=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="int32", + ) + func = qnn_func["main"].body + bias = relay.var("bias", shape=(64,), dtype="int32") + bias2 = relay.var("bias2", shape=(1, 233, 233, 64), dtype="int32") + + # Check broadcast support on both lhs and rhs + func = relay.add(func, bias2) + func = relay.add(bias2, func) + func = relay.add(bias, func) + func = relay.add(func, bias) + func = relay.Function(relay.analysis.free_vars(func), func) + mod = tvm.IRModule.from_expr(func) + with tvm.transform.PassContext(opt_level=3): + graph, lib, params = relay.build(mod, "llvm -mcpu=skylake-avx512") + + +def test_per_channel_kernel_scale(): + data_shape = (2, 1, 2, 4) + data_dtype = "uint8" + kernel_shape = (1, 3, 2, 2) + kernel_dtype = "uint8" + data = relay.var("data", shape=data_shape, dtype=data_dtype) + kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype) + kernel_scales = [2, 2, 2] + kernel_scales = relay.const(np.array(kernel_scales).astype("float32")) + func = relay.qnn.op.conv2d_transpose( + data, + kernel, + input_zero_point=relay.const(0, "int32"), + kernel_zero_point=relay.const(0, "int32"), + input_scale=relay.const(2.0, "float32"), + kernel_scale=kernel_scales, + kernel_size=(2, 2), + channels=kernel_shape[0], + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + + mod = relay.Function(relay.analysis.free_vars(func), func) + mod = tvm.IRModule.from_expr(mod) + + +if __name__ == "__main__": + test_no_zero_point() + test_input_zero_point() + test_kernel_zero_point() + test_both_zero_point() + test_layout() + test_padding() + test_const_folding() + test_broadcast_layout() + test_per_channel_kernel_scale() From 39d359ffd30b527a7a0b43c47a906f2966094bab Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 28 Nov 2020 04:06:42 -0800 Subject: [PATCH 217/258] [AutoScheduler] Accelerate feature extraction for winograd (#6981) * [AutoScheduler] Accelerate feature extraction for winograd * fix an overflow in feature.cc * address comments * address comments * Update include/tvm/te/schedule.h Co-authored-by: Cody Yu * Use a smaller min_repeat_ms * Use a smaller min_repeat_ms Co-authored-by: Cody Yu --- include/tvm/te/schedule.h | 12 ++++++++++ python/tvm/topi/utils.py | 2 +- src/auto_scheduler/compute_dag.cc | 2 +- src/auto_scheduler/feature.cc | 8 +++---- src/te/schedule/schedule_dataflow_rewrite.cc | 22 ++++++++++++++++--- .../auto_scheduler/tune_conv2d_layer_cuda.py | 2 +- tutorials/auto_scheduler/tune_network_cuda.py | 4 ++-- 7 files changed, 40 insertions(+), 12 deletions(-) diff --git a/include/tvm/te/schedule.h b/include/tvm/te/schedule.h index ee4fb33349f7..6f26d07dc8a5 100644 --- a/include/tvm/te/schedule.h +++ b/include/tvm/te/schedule.h @@ -378,6 +378,18 @@ class Schedule : public ObjectRef { * \return A normalized schedule, can be same as current one. */ Schedule normalize(); + + /*! + * \brief Normalize the schedule for feature extraction in auto-scheduler. 
+ * This is similar to `Schedule::normalize`, but we do aggressive simplification + * to the TE compute with const_matrix=True for faster compilation and feature extraction. + * The resulted schedule may be wrong, but it is good enough for feature extraction + * purposes. + * + * \return A normalized schedule, can be same as current one. + */ + Schedule normalize_for_feature_extraction(); + /*! * \brief access the internal node container * \return the pointer to the internal node container diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py index a5df788d38cb..c3e14eff3919 100644 --- a/python/tvm/topi/utils.py +++ b/python/tvm/topi/utils.py @@ -337,7 +337,7 @@ def select_array(i, j): ) return now - return te.compute(matrix.shape, select_array, name=name) + return te.compute(matrix.shape, select_array, name=name, attrs={"const_matrix": True}) def get_max_power2_factor(n, max_value=None): diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc index 27a30127ba65..e57fc8c9c2d9 100755 --- a/src/auto_scheduler/compute_dag.cc +++ b/src/auto_scheduler/compute_dag.cc @@ -1235,7 +1235,7 @@ State ComputeDAG::InferBound(const State& state) const { Array tensors; // Replay steps to tvm::Schedule std::tie(sch, tensors) = ApplySteps(pstate->transform_steps, &stages, &stage_to_axes); - sch = sch.normalize(); + sch = sch.normalize_for_feature_extraction(); // Get bound information from TVM schedule Map bounds = te::InferBound(sch); diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index a60c87cc600d..0df69b967d3b 100755 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -669,7 +669,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { math_op_counter(node->value); std::vector mem_bytes_list; std::vector compute_ops_list; - int cur_compute_ops; + double cur_compute_ops; // Group 1: Computation related features ExtractComputationFeature(node, math_op_counter); @@ -768,7 +768,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { // Extract buffer access related features (group 2) void ExtractBufferAccessFeature(const BufferStoreNode* node, const MathOpCounter& math_op_counter, - int* cur_compute_ops, std::vector* compute_ops_list, + double* cur_compute_ops, std::vector* compute_ops_list, std::vector* mem_bytes_list) { FeatureSet& fea = buffer_features[node->buffer]; @@ -920,7 +920,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { } // Extract arithmetic intensity related feature (group 3) - void ExtractArithmeticIntensityFeature(const BufferStoreNode* node, int cur_compute_ops, + void ExtractArithmeticIntensityFeature(const BufferStoreNode* node, double cur_compute_ops, const std::vector& compute_ops_list, const std::vector& mem_bytes_list) { FeatureSet& fea = buffer_features[node->buffer]; @@ -1267,7 +1267,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i Array tensors; std::tie(sch, tensors) = task->compute_dag.ApplySteps(state->transform_steps); - sch = sch.normalize(); + sch = sch.normalize_for_feature_extraction(); auto bounds = te::InferBound(sch); try { diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc index 6aac3b769a47..bae8e069bcdb 100644 --- a/src/te/schedule/schedule_dataflow_rewrite.cc +++ b/src/te/schedule/schedule_dataflow_rewrite.cc @@ -502,7 +502,7 @@ void RebaseNonZeroMinLoop(ScheduleNode* sch) { } } -void InjectInline(ScheduleNode* sch) { +void InjectInline(ScheduleNode* sch, bool 
feature_extraction_mode) { sch->InvalidateCache(); std::vector > new_body(sch->stages.size()); @@ -524,7 +524,15 @@ void InjectInline(ScheduleNode* sch) { args.push_back(iv->var); } ICHECK_EQ(compute->body.size(), 1U) << "can only inline compute op with 1 output"; - body = compute->body[0]; + + if (feature_extraction_mode && compute->attrs.count("const_matrix")) { + // Use constant value to replace access of const matrices. + // This produces wrong IR but is good enough for feature extraction purposes. + // This simplification can accelerate the feature extration and evolutionary search. + body = make_const(compute->output_dtype(0), 1.0f); + } else { + body = compute->body[0]; + } } for (size_t j = i; j < sch->stages.size(); ++j) { Stage s = sch->stages[j]; @@ -700,7 +708,15 @@ void LegalizeInvalidAttach(ScheduleNode* sch) { Schedule Schedule::normalize() { Schedule sn = copy(); - InjectInline(sn.operator->()); + InjectInline(sn.operator->(), false); + RebaseNonZeroMinLoop(sn.operator->()); + LegalizeInvalidAttach(sn.operator->()); + return sn; +} + +Schedule Schedule::normalize_for_feature_extraction() { + Schedule sn = copy(); + InjectInline(sn.operator->(), true); RebaseNonZeroMinLoop(sn.operator->()); LegalizeInvalidAttach(sn.operator->()); return sn; diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index a28e98b8792a..9aeea8487444 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -85,7 +85,7 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): # during measurement and avoid other runtime conflicts. # * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. # This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value > 300 ms. +# Typically, we recommend a value >= 300 ms. # * :code:`num_measure_trials` is the number of measurement trials we can use during the search. # We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a # good value for the search to converge. You can do more trials according to your time budget. diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 723b8d15ea88..8f8cf7f1e99a 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -167,7 +167,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # during measurement and avoid other runtime conflicts. # * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. # This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value > 300 ms. +# Typically, we recommend a value >= 300 ms. # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. # You can set it to a small number (e.g., 200) for a fast demonstrative run. 
# In practice, we recommend setting it around :code:`1000 * len(tasks)`, @@ -184,7 +184,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): def run_tuning(): print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400, timeout=10) + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( From 874911faf92cc060f5fa63a1858d1f3b06b6c003 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sat, 28 Nov 2020 16:38:08 -0800 Subject: [PATCH 218/258] Fix GraphRuntime with -link-params over RPC (#6985) * Fix GraphRuntime with remotely-linked params. * Previous test did not exercise this correctly. * fix incorrect function name --- python/tvm/micro/session.py | 2 +- tests/python/unittest/test_link_params.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index fba612b84d1f..1f91cdda10f6 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -187,7 +187,7 @@ def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): return None return get_global_func("tvm.rpc.NDArrayFromRemoteOpaqueHandle")( - mod, remote_data, template_tensor, ctx, lambda: None + mod, remote_data, template_tensor, ctx, None ) diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index 7b6910b0ea57..c3c2232c2188 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -378,8 +378,9 @@ def test_crt_link_params(): } flasher = compiler.flasher(**flasher_kw) with tvm.micro.Session(binary=micro_binary, flasher=flasher) as sess: - rpc_lib = sess.get_system_lib() - graph_rt = tvm.contrib.graph_runtime.create(graph_json, rpc_lib, sess.context) + graph_rt = tvm.micro.session.create_local_graph_runtime( + graph_json, sess.get_system_lib(), sess.context + ) # NOTE: not setting params here. 
graph_rt.set_input("rand_input", rand_input) From e96cae92f3abfa821f10a4783b156feff82a7548 Mon Sep 17 00:00:00 2001 From: Luis Vega Date: Sat, 28 Nov 2020 20:01:16 -0800 Subject: [PATCH 219/258] [Hardware][Verilator] Integrating and simulating hardware accelerators in TVM (#6971) * add files * update interface * update * fix comment * fix fmt * remove widget repo * remove --- CMakeLists.txt | 1 + cmake/config.cmake | 3 + cmake/modules/contrib/Verilator.cmake | 27 ++++ .../backend/contrib/verilator/codegen.cc | 100 +++++++++++++++ .../contrib/verilator/verilator_device.h | 57 +++++++++ .../contrib/verilator/verilator_kernel.h | 42 +++++++ .../contrib/verilator/verilator_runtime.cc | 116 ++++++++++++++++++ 7 files changed, 346 insertions(+) create mode 100644 cmake/modules/contrib/Verilator.cmake create mode 100644 src/relay/backend/contrib/verilator/codegen.cc create mode 100644 src/runtime/contrib/verilator/verilator_device.h create mode 100644 src/runtime/contrib/verilator/verilator_kernel.h create mode 100644 src/runtime/contrib/verilator/verilator_runtime.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c1ff7035d62..8fe416e9de93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -369,6 +369,7 @@ include(cmake/modules/contrib/ONNX.cmake) include(cmake/modules/contrib/ArmComputeLib.cmake) include(cmake/modules/contrib/TensorRT.cmake) include(cmake/modules/contrib/VitisAI.cmake) +include(cmake/modules/contrib/Verilator.cmake) include(cmake/modules/Git.cmake) include(cmake/modules/LibInfo.cmake) include(cmake/modules/RustExt.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 8ed06b26de5e..0080e239f16d 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -236,6 +236,9 @@ set(USE_TENSORRT_RUNTIME OFF) # Whether use VITIS-AI codegen set(USE_VITIS_AI OFF) +# Build Verilator codegen and runtime, example located in 3rdparty/hw-widgets +set(USE_VERILATOR_HW OFF) + # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) diff --git a/cmake/modules/contrib/Verilator.cmake b/cmake/modules/contrib/Verilator.cmake new file mode 100644 index 000000000000..907944706a82 --- /dev/null +++ b/cmake/modules/contrib/Verilator.cmake @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +if(USE_VERILATOR_HW STREQUAL "ON") + file(GLOB VERILATOR_RELAY_CONTRIB_SRC src/relay/backend/contrib/verilator/codegen.cc) + list(APPEND COMPILER_SRCS ${VERILATOR_RELAY_CONTRIB_SRC}) + list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC}) + find_library(EXTERN_LIBRARY_VERILATOR NAMES verilator PATHS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/hw-widgets) + list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_LIBRARY_VERILATOR}) + file(GLOB VERILATOR_CONTRIB_SRC src/runtime/contrib/verilator/verilator_runtime.cc) + list(APPEND RUNTIME_SRCS ${VERILATOR_CONTRIB_SRC}) +endif() + diff --git a/src/relay/backend/contrib/verilator/codegen.cc b/src/relay/backend/contrib/verilator/codegen.cc new file mode 100644 index 000000000000..4124fa2459d6 --- /dev/null +++ b/src/relay/backend/contrib/verilator/codegen.cc @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/verilator/codegen.cc + * \brief Implementation of Verilator codegen APIs. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../../../../runtime/contrib/json/json_node.h" +#include "../../utils.h" +#include "../codegen_json/codegen_json.h" + +namespace tvm { +namespace relay { +namespace contrib { + +using namespace backend; + +class VerilatorJSONSerializer : public backend::contrib::JSONSerializer { + using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; + + public: + VerilatorJSONSerializer(const std::string& symbol, const Expr& expr) + : JSONSerializer(symbol, expr) {} + + std::vector VisitExpr_(const CallNode* cn) override { + Expr expr = GetRef(cn); + std::string name; + const CallNode* call = cn; + if (const auto* op_node = cn->op.as()) { + name = op_node->name; + } else { + LOG(FATAL) << "Verilator JSON runtime does not support calls to " << cn->op->GetTypeKey(); + } + + std::vector inputs; + for (const auto& arg : cn->args) { + auto res = VisitExpr(arg); + inputs.insert(inputs.end(), res.begin(), res.end()); + } + auto node = std::make_shared(name, /* name_ */ + "kernel", /* op_type_ */ + inputs, 1 /* num_outputs_ */); + SetCallNodeAttribute(node, call); + return AddNode(node, GetRef(cn)); + } +}; + +/*! + * \brief The external compiler/codegen tool. It takes a Relay expression/module and + * compile it into a runtime module. 
+ */
+runtime::Module VerilatorCompiler(const ObjectRef& ref) {
+  CHECK(ref->IsInstance<FunctionNode>());
+  auto func = Downcast<Function>(ref);
+  auto func_name = GetExtSymbol(func);
+  VerilatorJSONSerializer serializer(func_name, func);
+  serializer.serialize();
+  std::string graph_json = serializer.GetJSON();
+  auto params = serializer.GetParams();
+
+  const auto* pf = runtime::Registry::Get("runtime.VerilatorJSONRuntimeCreate");
+  CHECK(pf != nullptr) << "Cannot find JSON runtime module to create";
+  auto mod = (*pf)(func_name, graph_json, params);
+  return mod;
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.verilator").set_body_typed(VerilatorCompiler);
+
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/runtime/contrib/verilator/verilator_device.h b/src/runtime/contrib/verilator/verilator_device.h
new file mode 100644
index 000000000000..acd91a53bcff
--- /dev/null
+++ b/src/runtime/contrib/verilator/verilator_device.h
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/contrib/verilator/verilator_device.h
+ * \brief Use external verilator device.
+ */
+
+#ifndef TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_DEVICE_H_
+#define TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_DEVICE_H_
+
+#include <tvm/runtime/c_runtime_api.h>
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+typedef void* VerilatorHandle;
+
+/* allocate Verilator object */
+extern "C" TVM_DLL VerilatorHandle VerilatorAlloc();
+
+/* deallocate Verilator object */
+extern "C" TVM_DLL void VerilatorDealloc(VerilatorHandle handle);
+
+/* read Verilator register or memory */
+extern "C" TVM_DLL int VerilatorRead(VerilatorHandle handle, int id, int addr);
+
+/* write Verilator register or memory */
+extern "C" TVM_DLL void VerilatorWrite(VerilatorHandle handle, int id, int addr, int value);
+
+/* reset Verilator for n clock cycles */
+extern "C" TVM_DLL void VerilatorReset(VerilatorHandle handle, int n);
+
+/* run Verilator for n clock cycles */
+extern "C" TVM_DLL void VerilatorRun(VerilatorHandle handle, int n);
+
+}  // namespace contrib
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_DEVICE_H_
diff --git a/src/runtime/contrib/verilator/verilator_kernel.h b/src/runtime/contrib/verilator/verilator_kernel.h
new file mode 100644
index 000000000000..f62097c0d795
--- /dev/null
+++ b/src/runtime/contrib/verilator/verilator_kernel.h
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/contrib/verilator/verilator_kernel.h
+ * \brief Use external verilator library kernels.
+ */
+
+#ifndef TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_KERNEL_H_
+#define TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_KERNEL_H_
+
+#include <tvm/runtime/c_runtime_api.h>
+
+#include "verilator_device.h"
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+extern "C" TVM_DLL void verilator_add(VerilatorHandle handle, int* data, int* weight, int* out,
+                                      int p_h_, int p_w_);
+
+}  // namespace contrib
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_KERNEL_H_
diff --git a/src/runtime/contrib/verilator/verilator_runtime.cc b/src/runtime/contrib/verilator/verilator_runtime.cc
new file mode 100644
index 000000000000..a44faf6d3274
--- /dev/null
+++ b/src/runtime/contrib/verilator/verilator_runtime.cc
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/contrib/verilator/verilator_runtime.cc
+ * \brief A simple JSON runtime for Verilator.
+ */
+
+#include <dlpack/dlpack.h>
+#include <tvm/runtime/ndarray.h>
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#include "../json/json_node.h"
+#include "../json/json_runtime.h"
+#include "verilator_device.h"
+#include "verilator_kernel.h"
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::json;
+
+class VerilatorJSONRuntime : public JSONRuntimeBase {
+ public:
+  VerilatorJSONRuntime(const std::string& symbol_name, const std::string& graph_json,
+                       const Array<String> const_names)
+      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}
+
+  const char* type_key() const { return "verilator_json"; }
+
+  void Init(const Array<NDArray>& consts) override {
+    BuildEngine();
+
+    CHECK_EQ(consts.size(), const_idx_.size())
+        << "The number of input constants must match the number required.";
+
+    // Setup constants entries for weights.
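+    // SetupConstants (inherited from JSONRuntimeBase) copies each entry of
+    // `consts` into its corresponding constant data entry, so Run can treat
+    // weights like any other bound input.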
+    SetupConstants(consts);
+  }
+
+  void Run() override {
+    std::vector<int*> in_ptr;
+    std::vector<int*> out_ptr;
+    for (size_t i = 0; i < input_nodes_.size(); ++i) {
+      uint32_t eid = EntryID(input_nodes_[i], 0);
+      int* data = static_cast<int*>(data_entry_[eid]->data);
+      in_ptr.push_back(data);
+    }
+    for (size_t i = 0; i < outputs_.size(); ++i) {
+      uint32_t eid = EntryID(outputs_[i]);
+      int* data = static_cast<int*>(data_entry_[eid]->data);
+      out_ptr.push_back(data);
+    }
+    for (size_t nid = 0; nid < nodes_.size(); ++nid) {
+      const auto& node = nodes_[nid];
+      if (node.GetOpType() == "kernel") {
+        CHECK_EQ(node.GetOpType(), "kernel");
+        auto op_name = node.GetOpName();
+        if ("add" == op_name) {
+          auto entry = node.GetInputs()[0];
+          auto shape = nodes_[entry.id_].GetOpShape()[entry.index_];
+          verilator_add(device_, in_ptr[0], in_ptr[1], out_ptr[0], shape[0], shape[1]);
+        } else {
+          LOG(FATAL) << "Unsupported op: " << op_name;
+        }
+      }
+    }
+  }
+
+ private:
+  void BuildEngine() {
+    device_ = VerilatorAlloc();
+    // reset for 10 cycles
+    VerilatorReset(device_, 10);
+  }
+
+  /* The verilator handle. */
+  VerilatorHandle device_{nullptr};
+};
+
+runtime::Module VerilatorJSONRuntimeCreate(String symbol_name, String graph_json,
+                                           const Array<String>& const_names) {
+  auto n = make_object<VerilatorJSONRuntime>(symbol_name, graph_json, const_names);
+  return runtime::Module(n);
+}
+
+TVM_REGISTER_GLOBAL("runtime.VerilatorJSONRuntimeCreate")
+    .set_body_typed(VerilatorJSONRuntimeCreate);
+
+TVM_REGISTER_GLOBAL("runtime.module.loadbinary_verilator_json")
+    .set_body_typed(JSONRuntimeBase::LoadFromBinary<VerilatorJSONRuntime>);
+
+}  // namespace contrib
+}  // namespace runtime
+}  // namespace tvm
From 0900113f6df9258892387bf6d7965ef1355766d8 Mon Sep 17 00:00:00 2001
From: Andrew Reusch
Date: Sun, 29 Nov 2020 07:05:52 -0800
Subject: [PATCH 220/258] Fix C runtime NDArray allocation bug (#6991)

---
 src/runtime/crt/common/ndarray.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/runtime/crt/common/ndarray.c b/src/runtime/crt/common/ndarray.c
index 4bae6de7da39..b7d4999254e6 100644
--- a/src/runtime/crt/common/ndarray.c
+++ b/src/runtime/crt/common/ndarray.c
@@ -51,7 +51,8 @@ TVMNDArray TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType d
   for (idx = 0; idx < ret.dl_tensor.ndim; ++idx) {
     num_elems *= shape[idx];
   }
-  ret.dl_tensor.data = TVMBackendAllocWorkspace(kDLCPU, 0, num_elems, dtype.code, dtype.bits);
+  ret.dl_tensor.data =
+      TVMBackendAllocWorkspace(kDLCPU, 0, num_elems * dtype.bits / 8, dtype.code, dtype.bits);
   memset(ret.dl_tensor.data, 0, num_elems * elem_bytes);
   return ret;
 }
From aac5468eb7eda29035c3e6f78f6dd29dea51c10f Mon Sep 17 00:00:00 2001
From: Andrew Reusch
Date: Sun, 29 Nov 2020 07:06:01 -0800
Subject: [PATCH 221/258] Demote session traffic logs to DEBUG log level
 (#6989)

---
 python/tvm/micro/session.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py
index 1f91cdda10f6..0f2f09a83652 100644
--- a/python/tvm/micro/session.py
+++ b/python/tvm/micro/session.py
@@ -124,7 +124,7 @@ def __enter__(self):
             self.transport_context_manager = self.flasher.flash(self.binary)
 
         self.transport = TransportLogger(
-            self.session_name, self.transport_context_manager, level=logging.INFO
+            self.session_name, self.transport_context_manager, level=logging.DEBUG
         ).__enter__()
 
         try:
From ef2e9dd4bb873bbb44b8375a74782803581b2c12 Mon Sep 17 00:00:00 2001
From: Andrew Reusch
Date: Sun, 29 Nov 2020 07:06:25 -0800
Subject: [PATCH 222/258] Include required CMSIS headers in Cortex-M micro
 kernel. (#6988)

* The existing kernels referenced CMSIS functions presuming that those
  functions were defined by user code. This was the case with the old blog
  post build flow. Add #include, since it's impossible to compile the
  kernels without it.

* TODO: port those functions to the micro kernels and remove external
  dependency

---
 python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
index 943aee0227d0..fb6f7a589525 100644
--- a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
+++ b/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
@@ -129,6 +129,9 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
 #ifdef __cplusplus
 extern "C"
 #endif
+#include <stdint.h>
+#include <arm_nnsupportfunctions.h>
+
 __STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}(
     int8_t *aa, int8_t *bb, int32_t *cc,
     int A_stride, int B_stride, int C_stride) {{
From 5eb0252b2915bf449100f159afdc2421a3051028 Mon Sep 17 00:00:00 2001
From: isong
Date: Sun, 29 Nov 2020 07:06:46 -0800
Subject: [PATCH 223/258] Fix the shape check for vta dense strategy (#6983)

---
 vta/python/vta/top/op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py
index f3b808a6d1a0..a217104a9ae7 100644
--- a/vta/python/vta/top/op.py
+++ b/vta/python/vta/top/op.py
@@ -123,7 +123,7 @@ def conv2d_transpose_strategy_vta(attrs, inputs, out_type, target):
 @_strategy.dense_strategy.register("vta")
 def dense_strategy_vta(attrs, inputs, out_type, target):
     """dense vta strategy"""
-    if inputs[0].shape == 4:  # this implies the layout is packed
+    if len(inputs[0].shape) == 4:  # this implies the layout is packed
         strategy = OpStrategy()
         strategy.add_implementation(
             _strategy.wrap_compute_dense(dense_packed),
From 93679ac9edb0f4a9009a0c3cad8e0c9e578ba686 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sun, 29 Nov 2020 13:36:56 -0800
Subject: [PATCH 224/258] [AutoScheduler] Skip useless calls to RewriteLayout
 (#6993)

* [AutoScheduler] Skip useless calls of RewriteLayout

* fix lint

* fix lint

---
 src/auto_scheduler/compute_dag.cc | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
index e57fc8c9c2d9..caaed6f4d667 100755
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -1121,10 +1121,25 @@ ComputeDAG ComputeDAG::RewriteLayout(Array<Step>* transform_steps,
   return new_dag;
 }
 
+// Return whether a DAG has placeholders that are marked as "layout free".
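+// Checking this up front lets ApplySteps skip RewriteLayout entirely when no
+// placeholder is layout free, in which case the rewrite would be a no-op.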
+bool HasLayoutFreeTensors(const ComputeDAG& dag) {
+  for (const auto& op : dag->ops) {
+    if (!op->IsInstance<te::ComputeOpNode>()) {
+      continue;
+    }
+    if (op->attrs.count(ComputeDAG::layout_free_placeholders_key)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 std::pair<te::Schedule, Array<te::Tensor>> ComputeDAG::ApplySteps(
     const Array<Step>& transform_steps, Array<te::Stage>* stages,
     StageToAxesMap* stage_to_axes, LayoutRewriteOption layout_rewrite) const {
-  if (layout_rewrite != LayoutRewriteOption::NoRewrite && !transform_steps.empty()) {
+  if (layout_rewrite != LayoutRewriteOption::NoRewrite && HasLayoutFreeTensors(*this) &&
+      !transform_steps.empty()) {
     Array<Step> steps = transform_steps;
     const auto& dag = RewriteLayout(&steps, layout_rewrite);
     return dag.ApplySteps(steps);
From b3f1bb844d9b17d58aba370523adf125ccb0e6bd Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sun, 29 Nov 2020 15:19:38 -0800
Subject: [PATCH 225/258] [AutoScheduler] Use a smaller retry number (#6996)

---
 python/tvm/auto_scheduler/search_policy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py
index ecf6af32cf78..370de8f14e72 100644
--- a/python/tvm/auto_scheduler/search_policy.py
+++ b/python/tvm/auto_scheduler/search_policy.py
@@ -147,7 +147,7 @@ class SketchPolicy(SearchPolicy):
 
     DEFAULT_PARAMS = {
         "eps_greedy": 0.05,
-        "retry_search_one_round_on_empty": 10,
+        "retry_search_one_round_on_empty": 1,
         "sample_init_min_population": 50,
         "sample_init_use_measured_ratio": 0.2,
         "evolutionary_search_population": 2048,
From 5ed1deadd47f6e8f8986c8c4cb07ae4467df4bcf Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sun, 29 Nov 2020 19:51:40 -0800
Subject: [PATCH 226/258] [AutoScheduler] Use a smaller iteration number for
 GA to accelerate the search (#6994)

* [AutoScheduler] Use a smaller GA iteration number

* fix

* fix

* add a new argument to control the search policy from task scheduler

---
 python/tvm/auto_scheduler/search_policy.py  |  2 +-
 python/tvm/auto_scheduler/task_scheduler.py | 29 ++++++++++++++-----
 .../search_policy/sketch_policy.cc          |  4 ++--
 3 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py
index 370de8f14e72..35429552dc74 100644
--- a/python/tvm/auto_scheduler/search_policy.py
+++ b/python/tvm/auto_scheduler/search_policy.py
@@ -151,7 +151,7 @@ class SketchPolicy(SearchPolicy):
         "sample_init_min_population": 50,
         "sample_init_use_measured_ratio": 0.2,
         "evolutionary_search_population": 2048,
-        "evolutionary_search_num_iters": 10,
+        "evolutionary_search_num_iters": 3,
         "evolutionary_search_mutation_prob": 0.85,
         "cpu_multi_level_tiling_structure": "SSRSRS",
         "gpu_multi_level_tiling_structure": "SSSRRSRS",
diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py
index de11fc1b5b11..26bfa2e376b4 100644
--- a/python/tvm/auto_scheduler/task_scheduler.py
+++ b/python/tvm/auto_scheduler/task_scheduler.py
@@ -40,7 +40,13 @@
 
 
 def make_search_policies(
-    search_policy, tasks, num_measures_per_round, verbose, load_model_file=None, load_log_file=None
+    search_policy,
+    search_policy_params,
+    tasks,
+    num_measures_per_round,
+    verbose,
+    load_model_file=None,
+    load_log_file=None,
 ):
     """Make a list of search policies for a list of search tasks.
     It creates one policy per task.
@@ -49,6 +55,8 @@ def make_search_policies(
     ----------
     search_policy: Union[str, List[SearchPolicy]]
        The name of search policy.
+    search_policy_params: Dict[str, Any]
+        The parameters of the search policy.
     tasks: List[SearchTask]
         The list of all tasks
     num_measures_per_round: int
@@ -86,7 +94,10 @@ def make_search_policies(
             raise ValueError("Invalid search policy: " + search_policy)
 
         if policy_type == "sketch":
-            search_policies = [SketchPolicy(task, cost_model, verbose=verbose) for task in tasks]
+            search_policies = [
+                SketchPolicy(task, cost_model, params=search_policy_params, verbose=verbose)
+                for task in tasks
+            ]
         else:
             raise ValueError("Invalid search policy: " + search_policy)
     else:
@@ -240,18 +251,21 @@ def __init__(
             self.group_task_ids.append([])
             self.group_task_ids[self.tag_to_group_id[tag]].append(i)
 
-    def tune(self, tune_option, search_policy="default"):
+    def tune(self, tune_option, search_policy="default", search_policy_params=None):
         """Tune a batch of tasks together.
 
         Parameters
         ----------
        tune_option: TuningOptions
            The options of tuning
-        search_policy: : Union[str, List[SearchPolicy]]
+        search_policy: Union[str, List[SearchPolicy]] = "default"
            The list of search policies.
-            If it is str.
-            "sketch.xgb" for SketchPolicy + XGBModel
-            "sketch.random" for SketchPolicy + RandomModel
+            If it is str,
+            "default" for the default policy (SketchPolicy + XGBModel),
+            "sketch.xgb" for SketchPolicy + XGBModel,
+            "sketch.random" for SketchPolicy + RandomModel.
+        search_policy_params : Optional[Dict[str, Any]]
+            The parameters of the search policy
         """
         # init members
         self.tune_option = tune_option
@@ -280,6 +294,7 @@ def tune(self, tune_option, search_policy="default"):
         # make one search policy for one task
         self.search_policies = make_search_policies(
             search_policy,
+            search_policy_params,
             self.tasks,
             self.num_measures_per_round,
             tune_option.verbose,
diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
index 07d2837ab994..e81e824626d6 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -482,8 +482,8 @@ Array<State> SketchPolicyNode::EvolutionarySearch(const Array<State>& init_popul
   int num_iters = GetIntParam(params, SketchParamKey::EvolutionarySearch::num_iters);
 
   bool is_cost_model_reasonable = !program_cost_model->IsInstance<RandomModelNode>();
-  if (!is_cost_model_reasonable && num_iters > 3) {
-    num_iters = 3;
+  if (!is_cost_model_reasonable && num_iters > 2) {
+    num_iters = 2;
     StdCout(verbose) << "GA iteration number has been adjusted to " << num_iters
                      << " due to random cost model" << std::endl;
   }
From f689f467caf5c2eb53991907b8096dcf0104224e Mon Sep 17 00:00:00 2001
From: Luis Vega
Date: Sun, 29 Nov 2020 20:18:49 -0800
Subject: [PATCH 227/258] [Backend][Verilator] Multiple fixes (#6995)

* bump vta-hw submodule version

* fix cmake related stuff

---
 3rdparty/vta-hw                       | 2 +-
 cmake/config.cmake                    | 2 +-
 cmake/modules/contrib/Verilator.cmake | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
index 519263cc45de..12fb486a491b 160000
--- a/3rdparty/vta-hw
+++ b/3rdparty/vta-hw
@@ -1 +1 @@
-Subproject commit 519263cc45de40449702fd7942fa0a745297180f
+Subproject commit 12fb486a491b75d70ec4c5e0a0cd112ab49a95bc
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 0080e239f16d..4a010d3ef099 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -236,7 +236,7 @@ set(USE_TENSORRT_RUNTIME OFF)
 # Whether use VITIS-AI codegen
 set(USE_VITIS_AI OFF)
 
-# Build Verilator codegen and runtime, example located in 3rdparty/hw-widgets
+# Build Verilator codegen and runtime, 
example located in 3rdparty/vta-hw/apps/verilator set(USE_VERILATOR_HW OFF) # Build ANTLR parser for Relay text format diff --git a/cmake/modules/contrib/Verilator.cmake b/cmake/modules/contrib/Verilator.cmake index 907944706a82..d3c1a7161182 100644 --- a/cmake/modules/contrib/Verilator.cmake +++ b/cmake/modules/contrib/Verilator.cmake @@ -16,10 +16,11 @@ # under the License. if(USE_VERILATOR_HW STREQUAL "ON") + execute_process(COMMAND make --directory ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw/apps/verilator) file(GLOB VERILATOR_RELAY_CONTRIB_SRC src/relay/backend/contrib/verilator/codegen.cc) list(APPEND COMPILER_SRCS ${VERILATOR_RELAY_CONTRIB_SRC}) list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC}) - find_library(EXTERN_LIBRARY_VERILATOR NAMES verilator PATHS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/hw-widgets) + find_library(EXTERN_LIBRARY_VERILATOR NAMES verilator PATHS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw/apps/verilator) list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_LIBRARY_VERILATOR}) file(GLOB VERILATOR_CONTRIB_SRC src/runtime/contrib/verilator/verilator_runtime.cc) list(APPEND RUNTIME_SRCS ${VERILATOR_CONTRIB_SRC}) From 9b20aa10df26a040039e897232e39577a06f74ca Mon Sep 17 00:00:00 2001 From: Luis Vega Date: Sun, 29 Nov 2020 20:19:31 -0800 Subject: [PATCH 228/258] add files (#6986) --- docker/Dockerfile.ci_cpu | 8 +++-- docker/Dockerfile.ci_i386 | 8 +++-- ...nstall_chisel.sh => ubuntu_install_sbt.sh} | 12 ++----- docker/install/ubuntu_install_verilator.sh | 36 +++++++++++++++++++ 4 files changed, 50 insertions(+), 14 deletions(-) rename docker/install/{ubuntu_install_chisel.sh => ubuntu_install_sbt.sh} (80%) create mode 100644 docker/install/ubuntu_install_verilator.sh diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu index b29c93b66707..a3805660b2b1 100644 --- a/docker/Dockerfile.ci_cpu +++ b/docker/Dockerfile.ci_cpu @@ -69,8 +69,12 @@ COPY install/ubuntu_install_universal.sh /install/ubuntu_install_universal.sh RUN bash /install/ubuntu_install_universal.sh # Chisel deps for TSIM -COPY install/ubuntu_install_chisel.sh /install/ubuntu_install_chisel.sh -RUN bash /install/ubuntu_install_chisel.sh +COPY install/ubuntu_install_sbt.sh /install/ubuntu_install_sbt.sh +RUN bash /install/ubuntu_install_sbt.sh + +# Verilator deps +COPY install/ubuntu_install_verilator.sh /install/ubuntu_install_verilator.sh +RUN bash /install/ubuntu_install_verilator.sh # TFLite deps COPY install/ubuntu_install_tflite.sh /install/ubuntu_install_tflite.sh diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386 index 73ca50e3eb1e..2cdf10c4369e 100644 --- a/docker/Dockerfile.ci_i386 +++ b/docker/Dockerfile.ci_i386 @@ -43,5 +43,9 @@ COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh RUN bash /install/ubuntu_install_redis.sh # Chisel deps for TSIM -COPY install/ubuntu_install_chisel.sh /install/ubuntu_install_chisel.sh -RUN bash /install/ubuntu_install_chisel.sh +COPY install/ubuntu_install_sbt.sh /install/ubuntu_install_sbt.sh +RUN bash /install/ubuntu_install_sbt.sh + +# Verilator deps +COPY install/ubuntu_install_verilator.sh /install/ubuntu_install_verilator.sh +RUN bash /install/ubuntu_install_verilator.sh diff --git a/docker/install/ubuntu_install_chisel.sh b/docker/install/ubuntu_install_sbt.sh similarity index 80% rename from docker/install/ubuntu_install_chisel.sh rename to docker/install/ubuntu_install_sbt.sh index d6776634ffe0..b02186e3263a 100755 --- a/docker/install/ubuntu_install_chisel.sh +++ b/docker/install/ubuntu_install_sbt.sh @@ -22,20 +22,12 
@@ set -o pipefail
 
 # The https:// source added below required an apt https transport
 # support.
-apt-get update && apt-get install -y apt-transport-https flex bison
+apt-get update && apt-get install -y apt-transport-https
 
-# Install the necessary dependencies for Chisel
+# Install the necessary dependencies for sbt
 echo "deb https://dl.bintray.com/sbt/debian /" | tee -a /etc/apt/sources.list.d/sbt.list
 apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
 
 # Note: The settings in vta/hardware/chisel/project/build.properties
 # file determines required sbt version.
 apt-get update && apt-get install -y sbt=1.1.1
-
-# Install the Verilator with major version 4.0
-wget https://www.veripool.org/ftp/verilator-4.010.tgz
-tar xf verilator-4.010.tgz
-cd verilator-4.010/
-./configure
-make -j4
-make install
diff --git a/docker/install/ubuntu_install_verilator.sh b/docker/install/ubuntu_install_verilator.sh
new file mode 100644
index 000000000000..1c5193c053c1
--- /dev/null
+++ b/docker/install/ubuntu_install_verilator.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+# Verilator version
+version="4.104"
+
+# Install dependencies
+apt-get update && apt-get install -y autoconf g++ flex bison
+
+# Install Verilator
+wget "https://github.com/verilator/verilator/archive/v$version.tar.gz"
+tar xf "v$version.tar.gz"
+cd "verilator-$version"
+autoconf
+./configure
+make -j4
+make install
From b2968b85d076b51dd8aeea54c593ea3cd7115e02 Mon Sep 17 00:00:00 2001
From: Luis Vega
Date: Mon, 30 Nov 2020 18:19:12 -0800
Subject: [PATCH 229/258] fix docker image when installing rust (#7004)

---
 docker/install/ubuntu_install_rust.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh
index 6b5b4379cc9c..5716b11db6c4 100755
--- a/docker/install/ubuntu_install_rust.sh
+++ b/docker/install/ubuntu_install_rust.sh
@@ -26,7 +26,7 @@ export RUSTUP_HOME=/opt/rust
 export CARGO_HOME=/opt/rust
 # this rustc is one supported by the installed version of rust-sgx-sdk
 curl -s -S -L https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain stable
-. 
$CARGO_HOME/env +export PATH=$CARGO_HOME/bin:$PATH rustup component add rustfmt # install wasmtime From c9f303abeae90e15acdcef2deb1a6d62a3f76d73 Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 1 Dec 2020 13:24:48 +0900 Subject: [PATCH 230/258] [TVMC] use target_host when it is set (#6855) * [TVMC] add cl support in tvmc runner * [TVMC] use target_host when it is set * Cleanup comment and asssert device type in else case * add a test for tvmc compiler * remove unused func --- python/tvm/driver/tvmc/compiler.py | 6 +++--- tests/python/driver/tvmc/conftest.py | 13 +++++++++++++ tests/python/driver/tvmc/test_compiler.py | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index e1a4a7481f6a..57071476b073 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -178,18 +178,18 @@ def compile_model( mod = common.convert_graph_layout(mod, alter_layout) tvm_target = common.target_from_cli(target) - target_host = target_host or "" + target_host = tvm_target if not target_host else target_host if tuning_records and os.path.exists(tuning_records): logger.debug("tuning records file provided: %s", tuning_records) with autotvm.apply_history_best(tuning_records): with tvm.transform.PassContext(opt_level=3): logger.debug("building relay graph with tuning records") - graph_module = relay.build(mod, tvm_target, params=params, target_host=tvm_target) + graph_module = relay.build(mod, tvm_target, params=params, target_host=target_host) else: with tvm.transform.PassContext(opt_level=3): logger.debug("building relay graph (no tuning records provided)") - graph_module = relay.build(mod, tvm_target, params=params, target_host=tvm_target) + graph_module = relay.build(mod, tvm_target, params=params, target_host=target_host) # Generate output dump files with sources dump_code = dump_code or [] diff --git a/tests/python/driver/tvmc/conftest.py b/tests/python/driver/tvmc/conftest.py index 62af34ee7758..882d793ccebd 100644 --- a/tests/python/driver/tvmc/conftest.py +++ b/tests/python/driver/tvmc/conftest.py @@ -148,3 +148,16 @@ def imagenet_cat(tmpdir_factory): np.savez(cat_file_full_path, input=image_data) return cat_file_full_path + + +@pytest.fixture(scope="session") +def tflite_mobilenet_v1_0_25_128(tmpdir_factory): + base_url = "https://storage.googleapis.com/download.tensorflow.org/models" + model_url = "mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz" + model_file = download_and_untar( + "{}/{}".format(base_url, model_url), + "mobilenet_v1_0.25_128.tflite", + temp_dir=tmpdir_factory.mktemp("data"), + ) + + return model_file diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index 28a60b19b28e..4bbb6fbf2cf8 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -150,3 +150,21 @@ def test_cross_compile_aarch64_onnx_module(onnx_resnet50): assert type(params) is dict assert type(dumps) is dict assert "asm" in dumps.keys() + + +@tvm.testing.requires_opencl +def test_compile_opencl(tflite_mobilenet_v1_0_25_128): + pytest.importorskip("tflite") + + graph, lib, params, dumps = tvmc.compiler.compile_model( + tflite_mobilenet_v1_0_25_128, + target="opencl", + target_host="llvm", + alter_layout="NCHW", + ) + + # check for output types + assert type(graph) is str + assert type(lib) is tvm.runtime.module.Module + assert type(params) is dict + assert type(dumps) is dict From 
e6b06d5d6f67b6ccf6474225c551eda8447cf1e3 Mon Sep 17 00:00:00 2001 From: Ritwik Das Date: Tue, 1 Dec 2020 09:01:56 -0800 Subject: [PATCH 231/258] Dynamic Batch Support for TRT (#6955) * add_annotate_fn * Reshape_ann_fn * Prune Subgraph * Dynamic Shape * Make PT Mask RCNN Work * Cleanup * Remove comments * Remove COmments * GetBatchSizeFix * Fix Remove Droupout * Fix Remove Droupout * TRT Runtime * Add MaskrCNN R50 * New Testing code * Fix black * Test Maskrcnn r50 done * Test MR50 * Space typo * Change Log to Dlog * Move test to tensorrt.py * Remove imports * Remove function * Add it to trt * import error * Imports * Add torch to CI * trt_test * Check test * Revert Pytorch install * Fix * test dynamic batch * TRT * Resolve PR comments * Zero batch size add Co-authored-by: Ubuntu --- python/tvm/relay/op/contrib/tensorrt.py | 117 +++++++++-- src/relay/backend/utils.h | 3 +- .../contrib/tensorrt/tensorrt_runtime.cc | 40 ++-- tests/python/contrib/test_tensorrt.py | 185 ++++++++++++++++++ 4 files changed, 318 insertions(+), 27 deletions(-) diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index 44336073d842..acd4f4740b2d 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -23,7 +23,7 @@ from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name from tvm.relay.expr import Call, Constant, Tuple, GlobalVar, Var, TupleGetItem -from tvm.relay.expr_functor import ExprMutator +from tvm.relay.expr_functor import ExprMutator, ExprVisitor logger = logging.getLogger("TensorRT") @@ -173,7 +173,7 @@ def check_dynamism(args, op_name): """ for arg in args: if isinstance(arg, (Call, Var, Constant, TupleGetItem)): - for dim_shape in arg.checked_type.shape: + for dim_shape in arg.checked_type.shape[1:]: if isinstance(dim_shape, tvm.tir.expr.Any): return True elif isinstance(arg, Tuple): @@ -198,6 +198,21 @@ def _func_wrapper(expr): if any([x.checked_type.dtype != "float32" for x in args]): logger.info("Only float32 inputs are supported for TensorRT.") return False + if op_name == "multiply": + shapes = [ + [ + int(x) if not isinstance(x, tvm.tir.expr.Any) else -1 + for x in arg.checked_type.shape + ] + for arg in args + ] + # Batched multiply operations don't work in implicit batch mode. The following shapes + # have been excluded because they occur in PT MaskRCNN model. The long term solution is + # to switch to explicit batch mode after performance regressions are solved. + if all( + [list(map(int, shape)) in [[300, 64, 7, 7], [300, 1, 1, 1]] for shape in shapes] + ): + return False return checker(attrs, args, op_name) return _func_wrapper @@ -292,19 +307,26 @@ def add_annotate_fn(expr): # pylint: disable=unused-variable """Check if add is supported by TensorRT.""" args = expr.args + + shapes = [ + [int(x) if not isinstance(x, tvm.tir.expr.Any) else -1 for x in arg.checked_type.shape] + for arg in args + ] + # RelayVM + TRT doesn't support scalar addition yet. 
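     # (A scalar here means a rank-0 tensor, e.g. relay.const(1.0), whose
     # checked_type.shape is empty; the check below rejects those inputs.)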
-    for arg in args:
-        if not arg.checked_type.shape:
+    for shape in shapes:
+        if len(shape) < 1:
             return False
+
     if any([x.checked_type.dtype != "float32" for x in args]):
         logger.info("Only float32 inputs are supported for TensorRT.")
         return False
     if (
         not get_tensorrt_use_implicit_batch_mode()
         and (isinstance(args[0], Constant) or isinstance(args[1], Constant))
-        and args[0].checked_type.shape[0] == args[1].checked_type.shape[0]
-        and args[0].checked_type.shape[0] != 1
-        and (len(args[0].checked_type.shape) > 3 or len(args[1].checked_type.shape) > 3)
+        and shapes[0][0] == shapes[1][0]
+        and shapes[0][0] != 1
+        and (len(shapes[0]) > 3 or len(shapes[1]) > 3)
     ):
         logger.info("add: bug in TRT with adding batched constants.")
         return False
@@ -592,11 +614,35 @@ def reshape_annotate_fn(expr):  # pylint: disable=unused-variable
             logger.info("reshape: new shape dims must be explicit.")
             return False
         if get_tensorrt_use_implicit_batch_mode():
-            shape = list(map(int, args[0].checked_type.shape))
-            new_shape = list(map(int, attrs.newshape))
+            shape = args[0].checked_type.shape
+            new_shape = attrs.newshape
             if len(new_shape) == 0 or len(shape) == 0:
                 logger.info("reshape: Can't reshape to or from scalar.")
                 return False
+
+            dynamic_reshape = any([isinstance(x, tvm.tir.expr.Any) for x in shape])
+
+            if dynamic_reshape:
+                # Make sure that the batch dim is unmodified.
+                if int(new_shape[0]) < 0:
+                    for shape_val, new_shape_val in zip(shape[1:], new_shape[1:]):
+                        if not (
+                            isinstance(shape_val, int)
+                            and isinstance(new_shape_val, int)
+                            and int(shape_val) == int(new_shape_val)
+                        ):
+                            return False
+                elif int(new_shape[0]) > 0:
+                    if not (
+                        isinstance(shape[0], int)
+                        and isinstance(new_shape[0], int)
+                        and int(shape[0]) == int(new_shape[0])
+                    ):
+                        return False
+                return True
+
+            shape = list(map(int, shape))
+            new_shape = list(map(int, new_shape))
+
             # TRT cannot modify batch dimension.
             original_volume = np.prod(shape)
             # First, resolve 0.
@@ -607,6 +653,7 @@ def reshape_annotate_fn(expr):  # pylint: disable=unused-variable
             for i, value in enumerate(new_shape):
                 if value == -1:
                     new_shape[i] = original_volume // np.prod([x for x in new_shape if x != -1])
+            # Remove batch dimension and see if volumes match
             if shape[0] != new_shape[0]:
                 logger.info("reshape: can't modify batch dimension.")
                 return False
@@ -795,6 +842,41 @@ def conv3d_transpose_annotate_fn(expr):  # pylint: disable=unused-variable
     return True
 
 
+class IsComputeIntensiveGraph(ExprVisitor):
+    """
+    Visits the Graph recursively and checks if it contains compute heavy ops like convolutions and
+    its transpose, dense and batch mat-mul.
+    """
+
+    def __init__(self):
+        ExprVisitor.__init__(self)
+        self.is_compute_intensive = False
+
+    def visit_call(self, call):
+        compute_intensive_ops = set(
+            [
+                "nn.conv2d",
+                "nn.conv2d_transpose",
+                "nn.conv3d",
+                "nn.conv3d_transpose",
+                "nn.dense",
+                "nn.batch_matmul",
+            ]
+        )
+        if isinstance(call.op, tvm.tir.op.Op):
+            if str(call.op) in compute_intensive_ops:
+                self.is_compute_intensive = True
+
+        return super().visit_call(call)
+
+    def is_graph_compute_intensive(self, subgraph) -> bool:
+        """
+        This function recursively visits the graph and checks if it's compute intensive.
+        """
+        self.visit(subgraph)
+        return self.is_compute_intensive
+
+
 def is_valid_subgraph(params, body):
     """Final check on whether the subgraph is valid and should be offloaded to TensorRT."""
     # Remove invalid subgraphs for implicit batch mode.
@@ -802,24 +884,31 @@ def is_valid_subgraph(params, body):
     input_batch_sizes = []
     for var in params:
         # In implicit batch mode, all inputs must have same batch size
+        # TODO: (codeislife99) : Fix different dynamic batch size inputs
+
         if isinstance(var.checked_type, relay.TupleType):
             for tupe_type in var.checked_type.fields:
                 # Scalar inputs not allowed
                 if len(tupe_type.shape) == 0:
                     logger.info("tensorrt: scalar inputs not supported")
                     return False
-                input_batch_sizes.append(int(tupe_type.shape[0]))
+
+                if not isinstance(tupe_type.shape[0], tvm.tir.expr.Any):
+                    input_batch_sizes.append(int(tupe_type.shape[0]))
         else:
             # Scalar inputs not allowed
             if len(var.checked_type.shape) == 0:
                 logger.info("tensorrt: scalar inputs not supported")
                 return False
-            input_batch_sizes.append(int(var.checked_type.shape[0]))
+            if not isinstance(var.checked_type.shape[0], tvm.tir.expr.Any):
+                input_batch_sizes.append(int(var.checked_type.shape[0]))
     if len(input_batch_sizes) > 1 and len(set(input_batch_sizes)) != 1:
         logger.info("tensorrt: inputs have different batch sizes")
         return False
-    # Remove subgraphs with no multiply-accumulates
-    if get_tensorrt_remove_no_mac_subgraphs() and relay.analysis.get_total_mac_number(body) == 0:
+    if (
+        get_tensorrt_remove_no_mac_subgraphs()
+        and not IsComputeIntensiveGraph().is_graph_compute_intensive(body)
+    ):
         return False
     return True
@@ -880,6 +969,8 @@ class RemoveDropout(ExprMutator):
 
     def visit_tuple_getitem(self, op):
         visit = super().visit_tuple_getitem(op)
+        if visit.index != 0:
+            return visit
         if (
             isinstance(visit.tuple_value, Call)
             and visit.tuple_value.op.name == "nn.dropout"
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index 4426642e8e18..ccb8611b7a3c 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -160,8 +160,7 @@ inline std::vector<int64_t> GetIntShape(const Array<IndexExpr>& shape) {
   std::vector<int64_t> ret;
   for (const auto& dim : shape) {
     const int64_t* pval = tir::as_const_int(dim);
-    ICHECK(pval) << "Expect integer, but received: " << dim->GetTypeKey();
-    ret.push_back(*pval);
+    ret.push_back(pval ? *pval : -1);
   }
   return ret;
 }
diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
index 445010321668..3f87f8d00ee6 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
@@ -41,6 +41,13 @@ namespace tvm {
 namespace runtime {
 namespace contrib {
 
+struct PairHash {
+  template <class T1, class T2>
+  std::size_t operator()(const std::pair<T1, T2>& pair) const {
+    return std::hash<T1>()(pair.first) ^ std::hash<T2>()(pair.second);
+  }
+};
+
 using namespace tvm::runtime::json;
 
 class TensorRTRuntime : public JSONRuntimeBase {
@@ -105,12 +112,13 @@ class TensorRTRuntime : public JSONRuntimeBase {
   /*! \brief Run inference using built engine. */
   void Run() override {
     BuildEngine();
-    auto& engine_and_context = trt_engine_cache_.at(symbol_name_);
+    batch_size_ = data_entry_[input_var_eid_[0]]->shape[0];
+    if (batch_size_ == 0) return;
+    auto& engine_and_context = trt_engine_cache_.at(std::make_pair(symbol_name_, batch_size_));
     auto engine = engine_and_context.engine;
     auto context = engine_and_context.context;
     auto& device_buffers = engine_and_context.device_buffers;
     std::vector<void*> bindings(engine->getNbBindings(), nullptr);
-
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
       auto nid = input_nodes_[i];
       if (nodes_[nid].GetOpType() == "input") {
@@ -169,10 +177,11 @@ class TensorRTRuntime : public JSONRuntimeBase {
    * do nothing.
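   * The cache is keyed by (symbol name, batch size), so one engine is built
   * and reused for each distinct batch size seen at runtime.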
   */
   void BuildEngine() {
-    if (trt_engine_cache_.count(symbol_name_)) return;
-    DLOG(INFO) << "Building new TensorRT engine for subgraph " << symbol_name_;
+    batch_size_ = data_entry_[input_var_eid_[0]]->shape[0];
+    if (trt_engine_cache_.count(std::make_pair(symbol_name_, batch_size_))) return;
+    DLOG(INFO) << "Building new TensorRT engine for subgraph " << symbol_name_
+               << " with batch size " << batch_size_;
     const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
-    batch_size_ = GetBatchSize();
     TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_,
                             use_fp16, batch_size_);
@@ -203,8 +212,9 @@ class TensorRTRuntime : public JSONRuntimeBase {
     }
 
     // Build engine.
-    trt_engine_cache_[symbol_name_] = builder.BuildEngine();
-    DLOG(INFO) << "Finished building TensorRT engine for subgraph " << symbol_name_;
+    trt_engine_cache_[std::make_pair(symbol_name_, batch_size_)] = builder.BuildEngine();
+    DLOG(INFO) << "Finished building TensorRT engine for subgraph " << symbol_name_
+               << " with batch size " << batch_size_;
     CacheEngineToDisk();
   }
 
@@ -240,7 +250,8 @@ class TensorRTRuntime : public JSONRuntimeBase {
     helper.DeclareField("inputs", &engine_and_context.inputs);
     helper.DeclareField("outputs", &engine_and_context.outputs);
     helper.ReadAllFields(&reader);
-    trt_engine_cache_[symbol_name_] = engine_and_context;
+    const int batch_size = 1;
+    trt_engine_cache_[std::make_pair(symbol_name_, batch_size)] = engine_and_context;
     return true;
   }
 
@@ -248,13 +259,15 @@ class TensorRTRuntime : public JSONRuntimeBase {
    * directory so it can be loaded later.
    */
   void CacheEngineToDisk() {
+    batch_size_ = data_entry_[input_var_eid_[0]]->shape[0];
     std::string cache_dir = dmlc::GetEnv("TVM_TENSORRT_CACHE_DIR", std::string(""));
     if (cache_dir.empty()) return;
     std::string key = GetSubgraphKey();
     std::string path = cache_dir + "/" + key + ".plan";
     DLOG(INFO) << "Caching TensorRT engine to " << path;
     // Serialize engine to disk
-    nvinfer1::IHostMemory* serialized_engine = trt_engine_cache_[symbol_name_].engine->serialize();
+    nvinfer1::IHostMemory* serialized_engine =
+        trt_engine_cache_[std::make_pair(symbol_name_, batch_size_)].engine->serialize();
     SaveBinaryToFile(path, std::string(static_cast<const char*>(serialized_engine->data()),
                                        serialized_engine->size()));
     serialized_engine->destroy();
@@ -262,8 +275,10 @@ class TensorRTRuntime : public JSONRuntimeBase {
     std::ostringstream os;
     dmlc::JSONWriter writer(&os);
     writer.BeginObject();
-    writer.WriteObjectKeyValue("inputs", trt_engine_cache_[symbol_name_].inputs);
-    writer.WriteObjectKeyValue("outputs", trt_engine_cache_[symbol_name_].outputs);
+    writer.WriteObjectKeyValue("inputs",
+                               trt_engine_cache_[std::make_pair(symbol_name_, batch_size_)].inputs);
+    writer.WriteObjectKeyValue(
+        "outputs", trt_engine_cache_[std::make_pair(symbol_name_, batch_size_)].outputs);
     writer.EndObject();
     std::string meta_path = cache_dir + "/" + key + ".meta";
     SaveBinaryToFile(meta_path, os.str());
@@ -290,7 +305,8 @@ class TensorRTRuntime : public JSONRuntimeBase {
   }
 
   /*! \brief Map of function name to TRT engine if built already. */
-  std::unordered_map<std::string, TensorRTEngineAndContext> trt_engine_cache_;
+  std::unordered_map<std::pair<std::string, int>, TensorRTEngineAndContext, PairHash>
+      trt_engine_cache_;
 
   /*! \brief TensorRT logger. 
*/ TensorRTLogger logger_; diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 8b61323a71ad..10c311a6d363 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -21,11 +21,15 @@ import tvm import tvm.relay.testing + from tvm import relay from tvm.relay.op.contrib import tensorrt from tvm.contrib import graph_runtime, utils from tvm.runtime.vm import VirtualMachine from tvm.relay import Any, GlobalVar, transform +from typing import Dict, Tuple, Union +from tvm.contrib.download import download +from tvm.relay.op.contrib import tensorrt def skip_codegen_test(): @@ -1034,5 +1038,186 @@ def set_func_attr(func, compile_name, symbol_name): tvm.ir.assert_structural_equal(mod_trt, mod_exp, map_free_vars=True) +def test_tensorrt_dynamic_batch(): + if skip_codegen_test(): + return + + batches_to_test = [1, 1, 0, 2, 3, 0, 1, 3, 2] + x_shape = (relay.Any(), 1, 8, 8) + x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32") + result_dict = {} + for use_trt in [True, False]: + x = relay.var("x", shape=x_shape, dtype="float32") + out = relay.nn.relu(x) + f = relay.Function([x], out) + mod = tvm.IRModule() + mod["main"] = f + if use_trt: + mod = relay.tensorrt.EnableTrt(mod) + + if not skip_runtime_test(): + with relay.build_config(opt_level=3): + relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") + + for i, batch_size in enumerate(batches_to_test): + result_dict[(i, use_trt)] = relay_exec.evaluate()(x_data[:batch_size, ...]) + + if not skip_runtime_test(): + for i in range(len(batches_to_test)): + assert_result_matches(result_dict[(i, True)], result_dict[(i, False)]) + + +def test_tensorrt_dynamic_batch_conv(): + if skip_codegen_test(): + return + batches_to_test = [1, 1, 0, 2, 3, 0, 1, 3, 2] + x_shape = (relay.Any(), 32, 8, 8) + x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32") + k_shape = (16, 32, 3, 3) + params = {"kernel": np.random.uniform(-1, 1, k_shape).astype("float32")} + result_dict = {} + for use_trt in [True, False]: + x = relay.var("x", shape=x_shape, dtype="float32") + kernel = relay.var("kernel", shape=k_shape, dtype="float32") + out = relay.nn.conv2d(x, kernel, channels=16, kernel_size=(3, 3), groups=1) + f = relay.Function([x, kernel], out) + mod = tvm.IRModule() + mod["main"] = f + if use_trt: + mod = tensorrt.partition_for_tensorrt(mod, params) + + if not skip_runtime_test(): + with relay.build_config(opt_level=3): + relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") + + for i, batch_size in enumerate(batches_to_test): + result_dict[(i, use_trt)] = relay_exec.evaluate()( + x=x_data[:batch_size, ...], **params + ) + + if not skip_runtime_test(): + for i in range(len(batches_to_test)): + assert_result_matches(result_dict[(i, True)], result_dict[(i, False)]) + + +def test_maskrcnn_resnet50() -> None: + """ + This function tests the working of pytorch maskrcnn with resnet50 as backbone with + VM and VM + TRT. Since the order of compiled model outputs is a bit different from + original pytorch model, it uses a custom logic for comparison check. + """ + if skip_codegen_test(): + return + + import torch + import torchvision + + def convert_traced_model_to_vm_trt( + traced_module: torch.jit.TopLevelTracedModule, np_sample_input: np.ndarray, target: str + ) -> tvm.runtime.vm.Executable: + """ + This function converts a traced pytorch model to VM + TRT. 
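+        It partitions the graph with tensorrt.partition_for_tensorrt and then
+        compiles the result with relay.vm.compile at opt_level=3 with
+        FoldScaleAxis disabled.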
+ """ + input_shape = np_sample_input.shape + input_name = "input0" + shape_list = [(input_name, input_shape)] + mod, params = relay.frontend.from_pytorch(traced_module, shape_list) + mod, config = tensorrt.partition_for_tensorrt(mod, params, remove_no_mac_subgraphs=True) + with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]): + vm_trt_exec = relay.vm.compile(mod, target=target, params=params) + + return vm_trt_exec + + class TraceWrapper(torch.nn.Module): + """ + This class is a wrapper over the torch module to convert the outputs into traceable form + """ + + def __init__(self, model: torch.nn.Module) -> None: + super().__init__() + self.model = model + + def forward( + self, inp: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + out = self.model(inp) + return out[0]["boxes"], out[0]["scores"], out[0]["labels"], out[0]["masks"] + + def get_traced_maskrcnn_model(np_sample_input: np.ndarray) -> torch.jit.TopLevelTracedModule: + """ + This function takes a sample input and returns the traced maskrcnn model + """ + model_func = torchvision.models.detection.maskrcnn_resnet50_fpn + model = TraceWrapper(model_func(pretrained=True)) + model.eval() + inp = torch.Tensor(np.random.uniform(0.0, 250.0, size=np_sample_input.shape)) + + with torch.no_grad(): + out = model(inp) + traced_module = torch.jit.trace(model, inp) + traced_module.eval() + + return traced_module + + def get_maskrcnn_input(in_size: int) -> np.ndarray: + """ + This function gets a real image with multiple objects of interest and returns it. + """ + input_shape = (1, 3, in_size, in_size) + img_path = "test_street_small.jpg" + img_url = ( + "https://raw.githubusercontent.com/dmlc/web-data/" + "master/gluoncv/detection/street_small.jpg" + ) + download(img_url, img_path) + import cv2 + + img = cv2.imread(img_path).astype("float32") + img = cv2.resize(img, (in_size, in_size)) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = np.transpose(img / 255.0, [2, 0, 1]) + img = np.expand_dims(img, axis=0) + + return img + + in_size = 300 + np_sample_input = get_maskrcnn_input(in_size) + traced_module = get_traced_maskrcnn_model(np_sample_input) + vm_trt_exec = convert_traced_model_to_vm_trt(traced_module, np_sample_input, target="llvm") + + if skip_runtime_test(): + return + + ctx = tvm.cpu() + vm = tvm.runtime.vm.VirtualMachine(vm_trt_exec, ctx) + vm.set_input("main", **{"input0": np_sample_input}) + tvm_res = vm.run() + + # Descending sort by scores and get the high confidence indices. In this example 9 is chosen, + # because this image has 9 boxes over 0.9 confidence + num_high_confidence_boxes = 9 + tvm_indices = np.argsort(-1 * tvm_res[1].asnumpy())[:num_high_confidence_boxes] + + with torch.no_grad(): + out = traced_module(torch.Tensor(np_sample_input)) + # Descending sort by scores and get the high confidence indices + pt_indices = np.argsort(-1 * out[1].numpy())[:num_high_confidence_boxes] + + tol = [1e-1, 5e-3, 1e-5, 4e-1] # [Box Tol, Score Tol, Label Tol, Mask Tol] + # Because of certain ops, there are certain minor differences in TVM outputs and PT outputs, + # This means that the tolerance can't be 1e-4 or 1e-5 throughout. The ideal way to get around + # this is to test it on an entire dataset and compare mAP with the original model. + # However, since that is not practically possible on CI, the following compromise is made. 
+    # These tolerances are chosen based on their impact or lack thereof to the mAP score, e.g.:
+    # 0.1 pixel difference of a box in a 300X300 image won't make any change.
+    for i, tol_val in zip(range(4), tol):
+        np.testing.assert_allclose(
+            tvm_res[i].asnumpy()[tvm_indices],
+            out[i].numpy()[pt_indices],
+            rtol=tol_val,
+            atol=tol_val,
+        )
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
From 4ef456c623b514ffd857c5f85f7d1d1dd89415fe Mon Sep 17 00:00:00 2001
From: Tristan Konolige
Date: Tue, 1 Dec 2020 11:20:09 -0800
Subject: [PATCH 232/258] [RELAY,TOPI] Add scatter_nd op (#6854)

* [RELAY,TOPI] Add scatter_nd op

Scatter_nd is the inverse of gather_nd and also happens to be its
gradient. The implementation here is not optimized. There are no cpu or
gpu specific implementations.

* formatting

* Fix tests

* formatting

* specify types on test

* Fix grad test

* scatter_nd cuda impl

* cuda impl

* x86 impl

* formatting

* fix shape rel

* fix tests

* formatting

---
 include/tvm/relay/attrs/transform.h           |   8 ++
 python/tvm/relay/backend/compile_engine.py    |   5 +-
 python/tvm/relay/op/_tensor_grad.py           |   7 +
 python/tvm/relay/op/_transform.py             |   9 ++
 python/tvm/relay/op/strategy/cuda.py          |  13 ++
 python/tvm/relay/op/strategy/generic.py       |  22 ++++
 python/tvm/relay/op/strategy/x86.py           |  13 ++
 python/tvm/relay/op/transform.py              |  24 ++++
 python/tvm/relay/testing/__init__.py          |   2 +
 python/tvm/te/operation.py                    |   6 +-
 python/tvm/topi/cuda/scatter.py               | 106 ++++++++++++++++
 python/tvm/topi/scatter.py                    | 120 +++++++++++++++++-
 python/tvm/topi/testing/__init__.py           |   1 +
 python/tvm/topi/testing/common.py             |  31 +++++
 python/tvm/topi/x86/__init__.py               |   1 +
 python/tvm/topi/x86/scatter.py                | 109 ++++++++++++++++
 src/relay/analysis/type_solver.cc             |   9 +-
 src/relay/op/tensor/transform.cc              |  68 ++++++++++
 tests/python/relay/test_any.py                |   5 +-
 tests/python/relay/test_op_grad_level3.py     |   9 ++
 tests/python/topi/python/test_topi_scatter.py |  67 ++++++++++
 21 files changed, 627 insertions(+), 8 deletions(-)
 create mode 100644 python/tvm/topi/x86/scatter.py
 create mode 100644 tests/python/topi/python/test_topi_scatter.py

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 262f41edad67..5c0f6f001d2b 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -129,6 +129,14 @@ struct ScatterAddAttrs : public tvm::AttrsNode<ScatterAddAttrs> {
   }
 };
 
+struct ScatterNDAttrs : public tvm::AttrsNode<ScatterNDAttrs> {
+  Array<Integer> out_shape;
+
+  TVM_DECLARE_ATTRS(ScatterNDAttrs, "relay.attrs.ScatterNDAttrs") {
+    TVM_ATTR_FIELD(out_shape).describe("Output shape of the scatter.");
+  }
+};
+
 struct GatherAttrs : public tvm::AttrsNode<GatherAttrs> {
   Integer axis;
 
diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py
index 32affe73395c..a39f72e2e61f 100644
--- a/python/tvm/relay/backend/compile_engine.py
+++ b/python/tvm/relay/backend/compile_engine.py
@@ -122,7 +122,10 @@ def get_valid_implementations(op, attrs, inputs, out_type, target):
         The list of all valid op implementations.
     """
     fstrategy = op.get_attr("FTVMStrategy")
-    assert fstrategy is not None, "%s doesn't have FTVMStrategy registered" % op.name
+    assert fstrategy is not None, (
+        "%s doesn't have an FTVMStrategy registered. You can register "
+        "one in python with `tvm.relay.op.register_strategy`." 
% op.name + ) with target: strategy = fstrategy(attrs, inputs, out_type, target) analyzer = tvm.arith.Analyzer() diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index b070d9f5b3ff..9c84411352f2 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -62,6 +62,7 @@ squeeze, strided_set, arange, + scatter_nd, ) @@ -803,3 +804,9 @@ def arange_grad(orig, grad): grad_step = cast_like(_sum(grad_step), step) return [grad_start, grad_stop, grad_step] + + +@register_gradient("gather_nd") +def gather_nd_grad(orig, grad): + data, indices = orig.args + return [scatter_nd(grad, indices, data.checked_type.concrete_shape), zeros_like(indices)] diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index a06aff11855b..10c34ea8a72f 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -124,6 +124,15 @@ def compute_interpolate(attrs, inputs, output_type): _reg.register_schedule("interpolate", strategy.schedule_interpolate) +# scatter +@_reg.register_compute("scatter_nd") +def compute_scatter_nd(attrs, inputs, output_type): + """Compute definition of scatter_nd""" + return [topi.scatter_nd(inputs[0], inputs[1], attrs.out_shape)] + + +_reg.register_strategy("scatter_nd", strategy.scatter_nd_strategy) + ##################### # Shape functions # ##################### diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index f37fc2a96cd5..bd96cad8ed02 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -776,6 +776,19 @@ def scatter_add_cuda(attrs, inputs, out_type, target): return strategy +@scatter_nd_strategy.register(["cuda", "gpu"]) +def scatter_nd_cuda(attrs, inputs, out_type, target): + """scatter_nd cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scatter_nd(topi.cuda.scatter_nd), + wrap_topi_schedule(topi.generic.schedule_extern), + name="scatter_nd.cuda", + plevel=10, + ) + return strategy + + @argsort_strategy.register(["cuda", "gpu"]) def argsort_strategy_cuda(attrs, inputs, out_type, target): """argsort cuda strategy""" diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 59b2b3489783..756d5f0cd2ea 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1071,6 +1071,28 @@ def schedule_interpolate(attrs, outs, target): return topi.generic.schedule_interpolate(outs) +# scatter_nd +@override_native_generic_func("scatter_nd_strategy") +def scatter_nd_strategy(attrs, inputs, out_type, target): + """scatter_nd generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scatter_nd(topi.scatter_nd), + wrap_topi_schedule(topi.generic.schedule_extern), + name="scatter_nd.generic", + ) + return strategy + + +def wrap_compute_scatter_nd(topi_compute): + """Wrap scatter_nd topi compute""" + + def _compute_scatter_nd(attrs, inputs, _): + return [topi_compute(inputs[0], inputs[1], attrs.out_shape)] + + return _compute_scatter_nd + + # bitserial_conv2d def wrap_compute_bitserial_conv2d(topi_compute): """wrap bitserial_conv2d topi compute""" diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 3c5735b17aa5..3f129c471faf 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -446,3 +446,16 @@ def bitserial_dense_strategy_cpu(attrs, inputs, out_type, 
target): name="bitserial_dense.x86", ) return strategy + + +@scatter_nd_strategy.register("cpu") +def scatter_nd_strategy_cpu(attrs, inputs, out_type, target): + """scatter_nd x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scatter_nd(topi.x86.scatter_nd), + wrap_topi_schedule(topi.generic.schedule_extern), + name="scatter_nd.x86", + plevel=10, + ) + return strategy diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 92ecd34bc359..0ffab12c7e70 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -308,6 +308,30 @@ def scatter_add(data, indices, updates, axis): return _make.scatter_add(data, indices, updates, axis) +def scatter_nd(data, indices, out_shape): + """Scatter values from an array. + + See :py:func:`tvm.topi.scatter` for how data is scattered. + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + indices : relay.Expr + The index locations to update. + + out_shape : relay.Expr + Output shape of the scatter. + + Returns + ------- + ret : relay.Expr + The computed result. + """ + return _make.scatter_nd(data, indices, out_shape) + + def reshape_like(data, shape_like, lhs_begin=0, lhs_end=None, rhs_begin=0, rhs_end=None): """Reshapes the input tensor by the size of another tensor. For an input tensor with shape ``(d0, d1, ..., d(k-1))``, `reshape_like` operation reshapes diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index 9c87f2795e5c..93110e313642 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -143,6 +143,8 @@ def check_grad( break grads = tmp + assert len(grads) > 0, "You must test at least one gradient." + # Get numeric gradients for each dimension of each param, using two-sided approximation. approx_grads = [] for x in test_inputs: diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py index 30d0df382c27..0f3457af0f10 100644 --- a/python/tvm/te/operation.py +++ b/python/tvm/te/operation.py @@ -317,7 +317,11 @@ def extern( if isinstance(body, tvm.tir.PrimExpr): body = tvm.tir.Evaluate(body) if not isinstance(body, tvm.tir.Stmt): - raise ValueError("Function '{}' should return PrimExpr or Stmt".format(fcompute.__name__)) + raise ValueError( + "Function '{}' should return PrimExpr or Stmt, but it returned '{}'".format( + fcompute.__name__, type(body) + ) + ) op = _ffi_api.ExternOp(name, tag, attrs, inputs, input_placeholders, output_placeholders, body) res = [op.output(i) for i in range(len(output_placeholders))] diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 0a3e96f4be30..5e03fafcfb58 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -18,6 +18,7 @@ """Scatter operator """ import tvm from tvm import te +from ..scatter import _verify_scatter_nd_inputs def ceil_div(a, b): @@ -522,3 +523,108 @@ def update_func(dst_ptr, dst_index, update): ) return out + + +def scatter_nd(data, indices, shape): + """Scatter elements from a n-dimension array. + + Given data with shape (Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}), indices with shape + (M, Y_0, ..., Y_{K-1}), and output with shape (X_0, X_1, ..., X_{N-1}), scatter_nd computes + + .. code-block:: + + output[indices[0, y_0, ..., y_{K-1}], + ..., + indices[M-1, y_0, ..., y_{K-1}], + x_M, + ..., + x_{N-1} + ] = data[y_0, ..., y_{K-1}, x_M, ..., x_{N-1}] + + all other entries in the output are 0. Repeated indices are summed. 
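+
+    For concreteness, a small worked example with values borrowed from the
+    unit tests added in this patch (here K = 1, M = 2, and N = 2):
+
+    .. code-block:: python
+
+        data = [2, 3, 0]
+        indices = [[1, 1, 0], [0, 1, 0]]
+        shape = (2, 2)
+        # out[indices[0, y], indices[1, y]] = data[y] for y in 0..2, so
+        # out[1, 0] = 2, out[1, 1] = 3, out[0, 0] = 0, giving
+        output = [[0, 0], [2, 3]]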
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The source array.
+
+    indices : tvm.te.Tensor
+        The indices of the values to extract.
+
+    shape : Sequence[int]
+        The output shape. This must be specified because it cannot be inferred.
+
+    Returns
+    -------
+    ret : tvm.te.Tensor
+    """
+    _verify_scatter_nd_inputs(data, indices, shape)
+
+    def gen_ir(data_ptr, indices_ptr, out_ptr):
+        ib = tvm.tir.ir_builder.create()
+
+        data = ib.buffer_ptr(data_ptr)
+        indices = ib.buffer_ptr(indices_ptr)
+        out = ib.buffer_ptr(out_ptr)
+
+        # We combine all the indices dimensions but the first one into a single
+        # dimension so we can iterate it in a single loop instead of an arbitrary
+        # number of loops. We do the same thing for all the data dimensions.
+        fused_indices_dimension = 1
+        for i in indices_ptr.shape[1:]:
+            fused_indices_dimension *= i
+
+        fused_data_dimension = 1
+        for i in data_ptr.shape[len(indices_ptr.shape) - 1 :]:
+            fused_data_dimension *= i
+
+        fused_shape = 1
+        for i in shape:
+            fused_shape *= i
+
+        # For now we avoid parallelizing over dimensions indexed by `indices` as
+        # there may be repeated indices and handling parallel accumulation can
+        # be hard. So we parallelize over X_M .. X_{N-1} instead. This will
+        # work well when these dimensions are large enough to saturate memory
+        # bandwidth, but performance will be bad when these dimensions are
+        # small.
+        bx = te.thread_axis("blockIdx.x")
+        tx = te.thread_axis("threadIdx.x")
+        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+        tdim = min(max_threads, fused_data_dimension)
+        ib.scope_attr(tx, "thread_extent", tdim)
+        bdim = ceil_div(fused_data_dimension, tdim)
+        ib.scope_attr(bx, "thread_extent", bdim)
+
+        # zero data
+        # TODO(tkonolige): could we use topi.full to zero it instead?
+        with ib.for_range(0, ceil_div(fused_shape, bdim)) as i:
+            index = i * fused_data_dimension + bx * tdim + tx
+            with ib.if_scope(index < fused_shape):
+                out[index] = tvm.tir.Cast(data_ptr.dtype, 0)
+
+        with ib.for_range(0, fused_indices_dimension) as i:
+            j = bx * tdim + tx
+            with ib.if_scope(j < fused_data_dimension):
+                offset = fused_data_dimension
+                index = j  # This is x_M, .. x_{N-1} part of the index into out.
+                # Build up the indices[0, y_0, .. y_{K-1}], .. indices[M-1, y_0, .. y_{K-1}] part
+                # of the index into out.
+                for l in reversed(range(indices_ptr.shape[0].value)):
+                    # indices[i + l * fused_indices_dimension] = indices[l, y_0, ... y_{K-1}]
+                    index += offset * indices[i + l * fused_indices_dimension]
+                    offset *= shape[l]
+                out[index] += data[i * fused_data_dimension + j]
+
+        return ib.get()
+
+    out_buf = tvm.tir.decl_buffer(shape, data.dtype, "out_buf")
+    return te.extern(
+        [shape],
+        [data, indices],
+        lambda ins, outs: gen_ir(ins[0], ins[1], outs[0]),
+        dtype=data.dtype,
+        out_buffers=[out_buf],
+        name="scatter_nd_cuda",
+        tag="scatter_nd_cuda",
+    )
diff --git a/python/tvm/topi/scatter.py b/python/tvm/topi/scatter.py
index f1c307a43a44..a376963aa55a 100644
--- a/python/tvm/topi/scatter.py
+++ b/python/tvm/topi/scatter.py
@@ -16,7 +16,8 @@
 # under the License.
# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks """Scatter operator""" -from tvm.te import hybrid +from ..tir import decl_buffer, ir_builder, Cast, AssertStmt, StringImm, Evaluate +from ..te import extern, hybrid @hybrid.script @@ -196,3 +197,120 @@ def scatter(data, indices, updates, axis=0): if len(data.shape) == 4: return _scatter_4d(data, indices, updates, axis) raise ValueError("scatter only support for 1-4 dimensions") + + +def _verify_scatter_nd_inputs(data, indices, shape): + mdim = int(indices.shape[0]) + assert mdim <= len(shape), ( + f"The first dimension of the indices ({mdim}) must be less than or equal to " + f"the length of the shape of the output ({len(shape)})." + ) + for i in range(len(indices.shape) - 1): + assert indices.shape[i + 1] == data.shape[i], ( + f"Dimension of indices[{i+1}] ({indices.shape[i+1]}) must equal dimension of " + f"data[{i}] ({data.shape[i]})." + ) + for i in range(mdim, len(shape)): + data_ind = i - mdim + len(indices.shape) - 1 + assert data.shape[data_ind] == shape[i], ( + f"Dimension of data[{data_ind}] ({data.shape[data_ind]}) must equal dimension " + f"of out_shape[{i}] ({shape[i]})." + ) + + assert ( + "int" in indices.dtype + ), f"Indices must be a tensor of integers, but its elements are {indices.dtype}." + + +def scatter_nd(data, indices, shape): + """Scatter elements from a n-dimension array. + + Given data with shape (Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}), indices with shape + (M, Y_0, ..., Y_{K-1}), and output with shape (X_0, X_1, ..., X_{N-1}), scatter_nd computes + + .. code-block:: + + output[indices[0, y_0, ..., y_{K-1}], + ..., + indices[M-1, y_0, ..., y_{K-1}], + x_M, + ..., + x_{N-1} + ] = data[y_0, ..., y_{K-1}, x_M, ..., x_{N-1}] + + all other entries in the output are 0. Repeated indices are summed. + + Parameters + ---------- + data : tvm.te.Tensor + The source array. + + indices : tvm.te.Tensor + The indices of the values to extract. + + shape : Sequence[int] + The output shape. This must be specified because it cannot be inferred. + + Returns + ------- + ret : tvm.te.Tensor + """ + _verify_scatter_nd_inputs(data, indices, shape) + + def gen_ir(data_ptr, indices_ptr, out_ptr): + ib = ir_builder.create() + + data = ib.buffer_ptr(data_ptr) + indices = ib.buffer_ptr(indices_ptr) + out = ib.buffer_ptr(out_ptr) + + # zero data + # TODO(tkonolige): could we use topi.full to zero it instead? + fused_shape = 1 + for i in shape: + fused_shape *= i + with ib.for_range(0, fused_shape) as i: + out[i] = Cast(data_ptr.dtype, 0) + + # We combine all the indices dimensions but the first one into a single + # dimension so we can iterate it in single loop instead of an arbitrary + # number of loops. We do the same thing for all the data dimensions. + fused_indices_dimension = 1 + for i in indices_ptr.shape[1:]: + fused_indices_dimension *= i + + fused_data_dimension = 1 + for i in data_ptr.shape[len(indices_ptr.shape) - 1 :]: + fused_data_dimension *= i + + with ib.for_range(0, fused_indices_dimension, name="i") as i: + with ib.for_range(0, fused_data_dimension, name="j") as j: + offset = fused_data_dimension + index = j # This is x_M, .. x_{N-1} part of the index into out. + # Build up the indices[0, y_0, .. y_{K-1}], .. indices[M-1, y_0, .. y_{K-1}] part + # of the index into out. + for l in reversed(range(indices_ptr.shape[0].value)): + # indices[i * l * fused_indices_dimension] = indices[l, y_0, ... 
y_{k-1}] + index += offset * indices[i + l * fused_indices_dimension] + ib.emit( + AssertStmt( + indices[i + l * fused_indices_dimension] < shape[l], + StringImm("index out of bounds"), + Evaluate(0), + ) + ) + offset *= shape[l] + out[index] += data[i * fused_data_dimension + j] + + return ib.get() + + out_buf = decl_buffer(shape, data.dtype, "out_buf") + return extern( + [shape], + [data, indices], + lambda ins, outs: gen_ir(ins[0], ins[1], outs[0]), + dtype=data.dtype, + out_buffers=[out_buf], + name="scatter_nd_generic", + tag="scatter_nd_generic", + ) diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py index 4f905500d3f1..065434499721 100644 --- a/python/tvm/topi/testing/__init__.py +++ b/python/tvm/topi/testing/__init__.py @@ -57,6 +57,7 @@ from .space_to_depth import space_to_depth_python from .crop_and_resize_python import crop_and_resize_python from .common import ( + compare_numpy_tvm, get_injective_schedule, get_reduce_schedule, get_broadcast_schedule, diff --git a/python/tvm/topi/testing/common.py b/python/tvm/topi/testing/common.py index 51ea19afe7ce..e4e5e811ab18 100644 --- a/python/tvm/topi/testing/common.py +++ b/python/tvm/topi/testing/common.py @@ -17,8 +17,10 @@ # pylint: disable=invalid-name """Common utility for topi test""" +import numpy as np import tvm from tvm import topi +from tvm.testing import assert_allclose _injective_schedule = { "generic": topi.generic.schedule_injective, @@ -77,3 +79,32 @@ def get_reduce_schedule(target): def get_conv2d_nchw_implement(target): return dispatch(target, _conv2d_nchw_implement) + + +def compare_numpy_tvm(inputs, output, target, ctx, compute, schedule): + """Compare a numpy inputs and output of a function to the results of the TVM version. + + Parameters + ---------- + inputs : Sequence[numpy.nd.array] + List of input numpy arrays to pass to the function. + output : numpy.nd.array + Verified correct function output. + target : tvm.target.Target + Target to run on. + ctx : tvm.TVMContext + Context to run on. + compute : callable + Topi compute function to test against. + schedule : callable + Topi scheduling function to test against. + """ + te_inputs = [tvm.te.placeholder(shape=i.shape, dtype=str(i.dtype)) for i in inputs] + te_out = tvm.nd.array(np.zeros(output.shape).astype(output.dtype), ctx=ctx) + with tvm.target.Target(target): + out = compute(*te_inputs) + s = schedule([out]) + func = tvm.build(s, te_inputs + [out]) + arys = [tvm.nd.array(x, ctx=ctx) for x in inputs] + func(*(arys + [te_out])) + assert_allclose(te_out.asnumpy(), output, atol=1e-4, rtol=1e-4) diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py index 659668cbbe4c..154511010a1c 100644 --- a/python/tvm/topi/x86/__init__.py +++ b/python/tvm/topi/x86/__init__.py @@ -39,3 +39,4 @@ from .conv3d_transpose import * from .sparse import * from .conv2d_alter_op import * +from .scatter import * diff --git a/python/tvm/topi/x86/scatter.py b/python/tvm/topi/x86/scatter.py new file mode 100644 index 000000000000..8147d3a00135 --- /dev/null +++ b/python/tvm/topi/x86/scatter.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Scatter operators for x86""" +import tvm +from tvm import te +from ..scatter import _verify_scatter_nd_inputs + + +def scatter_nd(data, indices, shape): + """Scatter elements from a n-dimension array. + + Given data with shape (Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}), indices with shape + (M, Y_0, ..., Y_{K-1}), and output with shape (X_0, X_1, ..., X_{N-1}), scatter_nd computes + + .. code-block:: + + output[indices[0, y_0, ..., y_{K-1}], + ..., + indices[M-1, y_0, ..., y_{K-1}], + x_M, + ..., + x_{N-1} + ] = data[y_0, ..., y_{K-1}, x_M, ..., x_{N-1}] + + all other entries in the output are 0. Repeated indices are summed. + + Parameters + ---------- + data : tvm.te.Tensor + The source array. + + indices : tvm.te.Tensor + The indices of the values to extract. + + shape : Sequence[int] + The output shape. This must be specified because it cannot be inferred. + + Returns + ------- + ret : tvm.te.Tensor + """ + _verify_scatter_nd_inputs(data, indices, shape) + + def gen_ir(data_ptr, indices_ptr, out_ptr): + # pylint: disable=invalid-name + ib = tvm.tir.ir_builder.create() + + data = ib.buffer_ptr(data_ptr) + indices = ib.buffer_ptr(indices_ptr) + out = ib.buffer_ptr(out_ptr) + + # We combine all the indices dimensions but the first one into a single + # dimension so we can iterate it in single loop instead of an arbitrary + # number of loops. We do the same thing for all the data dimensions. + fused_indices_dimension = 1 + for i in indices_ptr.shape[1:]: + fused_indices_dimension *= i + + fused_data_dimension = 1 + for i in data_ptr.shape[len(indices_ptr.shape) - 1 :]: + fused_data_dimension *= i + + fused_shape = 1 + for i in shape: + fused_shape *= i + + # zero data + # TODO(tkonolige): could we use topi.full to zero it instead? + with ib.for_range(0, fused_shape) as i: + out[i] = tvm.tir.Cast(data_ptr.dtype, 0) + + with ib.for_range(0, fused_indices_dimension) as i: + with ib.for_range(0, fused_data_dimension, for_type="parallel") as j: + offset = fused_data_dimension + index = j # This is x_M, .. x_{N-1} part of the index into out. + # Build up the indices[0, y_0, .. y_{K-1}], .. indices[M-1, y_0, .. y_{K-1}] part + # of the index into out. + for l in reversed(range(indices_ptr.shape[0].value)): + # indices[i * l * fused_indices_dimension] = indices[l, y_0, ... 
y_{k-1}]
+                    index += offset * indices[i + l * fused_indices_dimension]
+                    offset *= shape[l]
+                out[index] += data[i * fused_data_dimension + j]
+
+        return ib.get()
+
+    out_buf = tvm.tir.decl_buffer(shape, data.dtype, "out_buf")
+    return te.extern(
+        [shape],
+        [data, indices],
+        lambda ins, outs: gen_ir(ins[0], ins[1], outs[0]),
+        dtype=data.dtype,
+        out_buffers=[out_buf],
+        name="scatter_nd_x86",
+        tag="scatter_nd_x86",
+    )
diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc
index 8f14b557dc54..64db13acbac0 100644
--- a/src/relay/analysis/type_solver.cc
+++ b/src/relay/analysis/type_solver.cc
@@ -246,7 +246,7 @@ class TypeSolver::Unifier : public TypeFunctor<Type(const Type&, const Type&)> {
     for (size_t i = 0; i < tt1->shape.size(); i++) {
       auto dim = UnifyDim(tt1->shape[i], tt2->shape[i]);
       if (!dim.defined()) {
-        // NB: We push an arbitrary dimension here so we can continue error propogation.
+        // NB: We push an arbitrary dimension here so we can continue error propagation.
         shape.push_back(tt1->shape[i]);
         tvm::PrimExpr shape1 = tt1->shape[i];
         tvm::PrimExpr shape2 = tt2->shape[i];
@@ -259,10 +259,11 @@ class TypeSolver::Unifier : public TypeFunctor<Type(const Type&, const Type&)> {
 
     if (mismatches.size() != 0) {
       auto err = Diagnostic::Error(this->span);
-      err << "in particular ";
+      err << "The Relay type checker is unable to show the following types match.\n";
+      err << "In particular ";
       for (auto mismatch : mismatches) {
-        err << "dimension " << std::get<0>(mismatch) << " conflicts " << std::get<1>(mismatch)
-            << " does not match " << std::get<2>(mismatch);
+        err << "dimension " << std::get<0>(mismatch) << " conflicts: " << std::get<1>(mismatch)
+            << " does not match " << std::get<2>(mismatch) << ".";
       }
       this->solver_->diag_ctx_.Emit(err);
       return Type(nullptr);
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index c6b3260886c1..57cd9bd4118a 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -977,6 +977,74 @@ RELAY_REGISTER_OP("scatter_add")
     .set_attr<TOpPattern>("TOpPattern", kOpaque)
     .set_support_level(10);
 
+// scatter_nd operator
+TVM_REGISTER_NODE_TYPE(ScatterNDAttrs);
+
+bool ScatterNDRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  // `types` contains: [data, indices, result]
+  ICHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* indices = types[1].as<TensorTypeNode>();
+  if (data == nullptr) {
+    ICHECK(types[0].as<IncompleteTypeNode>())
+        << "ScatterND: expect input data type to be TensorType but got " << types[0];
+    return false;
+  }
+  if (indices == nullptr) {
+    ICHECK(types[1].as<IncompleteTypeNode>())
+        << "ScatterND: expect indices type to be TensorType but got " << types[1];
+    return false;
+  }
+  ICHECK(indices->dtype.is_int()) << "ScatterND: indices must be a tensor of integers.";
+  const auto out_shape = attrs.as<ScatterNDAttrs>()->out_shape;
+  const IntImmNode* mdim = indices->shape[0].as<IntImmNode>();
+  const size_t kdim = indices->shape.size() - 1;
+  const size_t ndim = out_shape.size();
+  ICHECK_LE(size_t(mdim->value), ndim)
+      << "ScatterND: Given data with shape (Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}), and indices "
+         "with shape (M, Y_0, ..., Y_{K-1}), M must be less than or equal to N.";
+  // Indices: (M, Y_0, .. Y_{K-1}) data: (Y_0, .. Y_{K-1}, ...), verify Y's.
+  for (size_t i = 0; i < kdim; i++) {
+    reporter->AssertEQ(indices->shape[i + 1], data->shape[i]);
+  }
+
+  std::vector<IndexExpr> oshape;
+  for (auto& x : out_shape) {
+    oshape.push_back(x);
+  }
+
+  // data: (Y_0, .. Y_{K-1}, X_M, .. X_{N-1}) out: (X_0, .. X_{N-1}), verify X_M to X_{N-1}
+  for (size_t i = mdim->value; i < ndim; i++) {
+    reporter->AssertEQ(data->shape[i - mdim->value + kdim], oshape[i]);
+  }
+
+  reporter->Assign(types[2], TensorType(oshape, data->dtype));
+  return true;
+}
+
+Expr MakeScatterND(Expr data, Expr indices, const Array<Integer> out_shape) {
+  auto attrs = make_object<ScatterNDAttrs>();
+  attrs->out_shape = out_shape;
+  static const Op& op = Op::Get("scatter_nd");
+  return Call(op, {data, indices}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.scatter_nd").set_body_typed(MakeScatterND);
+
+RELAY_REGISTER_OP("scatter_nd")
+    .describe(R"code(Scatter elements or slices from data and store to a tensor
+whose shape is defined by indices.

+Given data with shape (Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}) and indices with shape
+(M, Y_0, ..., Y_{K-1}), the output will have shape (X_0, X_1, ..., X_{N-1}).
+)code" TVM_ADD_FILELINE)
+    .set_num_inputs(2)
+    .add_argument("data", "Tensor", "The input tensor.")
+    .set_support_level(3)
+    .add_type_rel("ScatterND", ScatterNDRel)
+    .set_attr<TOpPattern>("TOpPattern", kInjective);
+
 // Take
 TVM_REGISTER_NODE_TYPE(TakeAttrs);
 
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index 546973704fea..eec6aa21c69b 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -989,7 +989,10 @@ def _body(i, st):
     body = loop(start, relay.op.reshape(relay.const(0), newshape=(1, 1)))
     func = relay.Function([start], relay.TupleGetItem(body, 1))
     with DiagnosticTesting() as diagnostics:
-        diagnostics.assert_message("in particular dimension 0 conflicts 2 does not match 1")
+        diagnostics.assert_message(
+            "The Relay type checker is unable to show the following types "
+            "match.\nIn particular dimension 0 conflicts: 2 does not match 1."
+        )
     func = infer_type(func)
 
diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py
index 9c27afd87205..98ff62ed75d4 100644
--- a/tests/python/relay/test_op_grad_level3.py
+++ b/tests/python/relay/test_op_grad_level3.py
@@ -117,5 +117,14 @@ def test_arange_grad():
     check_grad(fwd_func, inputs=values)
 
 
+def test_gather_nd_grad():
+    data = relay.var("data", relay.TensorType((2, 3), "float64"))
+    indices = relay.var("indices", relay.TensorType((2, 4), "int64"))
+    fwd = relay.Function([data, indices], relay.gather_nd(data, indices))
+    data_np = np.random.rand(2, 3).astype("float64")
+    indices_np = np.array([[0, 1, 1, 0], [0, 1, 0, 0]], dtype="int64")
+    check_grad(fwd, inputs=[data_np, indices_np], test_inputs=[data_np])
+
+
 if __name__ == "__main__":
     pytest.main()
diff --git a/tests/python/topi/python/test_topi_scatter.py b/tests/python/topi/python/test_topi_scatter.py
new file mode 100644
index 000000000000..2e701e2903d9
--- /dev/null
+++ b/tests/python/topi/python/test_topi_scatter.py
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.
See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np +import tvm +import tvm.testing +from tvm import topi +import tvm.topi.testing + + +@tvm.testing.parametrize_targets +def test_scatter_nd(ctx, target): + def check_scatter_nd(data, indices, shape, out): + implementations = { + "generic": (lambda x, y: topi.scatter_nd(x, y, shape), topi.generic.schedule_extern), + "gpu": (lambda x, y: topi.cuda.scatter_nd(x, y, shape), topi.generic.schedule_extern), + "cpu": (lambda x, y: topi.x86.scatter_nd(x, y, shape), topi.generic.schedule_extern), + } + fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) + tvm.topi.testing.compare_numpy_tvm([data, indices], out, target, ctx, fcompute, fschedule) + + data = np.array([2, 3, 0]) + indices = np.array([[1, 1, 0], [0, 1, 0]]) + shape = (2, 2) + out = np.array([[0, 0], [2, 3]]) + check_scatter_nd(data, indices, shape, out) + + data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + indices = np.array([[0, 1], [1, 1]]) + shape = (2, 2, 2, 2) + out = np.array([[[[0, 0], [0, 0]], [[1, 2], [3, 4]]], [[[0, 0], [0, 0]], [[5, 6], [7, 8]]]]) + check_scatter_nd(data, indices, shape, out) + + data = np.reshape(np.arange(1560 * 3), (3, 1560)).astype("float32") + indices = np.array([[1, 0, 0]]) + shape = (2, 1560) + out = np.zeros(shape).astype("float32") + out[1, :] += data[0, :] + out[0, :] += data[1, :] + out[0, :] += data[2, :] + check_scatter_nd(data, indices, shape, out) + + data = np.ones((5, 3)).astype("float64") + indices = np.stack((np.random.randint(2, size=5), np.random.randint(7, size=5))).astype("int64") + shape = (2, 7, 3) + out = np.zeros(shape).astype("float64") + for i in range(indices.shape[1]): + for j in range(data.shape[1]): + out[indices[0, i], indices[1, i], j] += data[i, j] + check_scatter_nd(data, indices, shape, out) + + +if __name__ == "__main__": + test_scatter_nd(tvm.context("cpu"), tvm.target.Target("llvm")) From f3b79089c7e24085a2a58101561964b7a2b3a6c1 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 1 Dec 2020 14:59:09 -0500 Subject: [PATCH 233/258] [TOPI] deformable_conv2d in NHWC (#6999) * [TOPI] deformable_conv2d in NHWC * Update python/tvm/topi/generic/nn.py Co-authored-by: Cody Yu * Update python/tvm/topi/testing/deformable_conv2d_python.py Co-authored-by: Cody Yu * style * fix * style Co-authored-by: Cody Yu --- include/tvm/topi/detail/tensor_utils.h | 37 ++++++ python/tvm/topi/generic/nn.py | 18 +++ python/tvm/topi/nn/deformable_conv2d.py | 110 +++++++++++++++++- python/tvm/topi/testing/__init__.py | 2 +- ..._python.py => deformable_conv2d_python.py} | 49 ++++++++ src/topi/schedule.cc | 4 + .../python/test_topi_deformable_conv2d.py | 95 ++++++++++++++- 7 files changed, 311 insertions(+), 4 deletions(-) rename python/tvm/topi/testing/{deformable_conv2d_nchw_python.py => deformable_conv2d_python.py} (74%) diff --git a/include/tvm/topi/detail/tensor_utils.h b/include/tvm/topi/detail/tensor_utils.h index 7004c358ad4e..65a760b1397c 100644 --- a/include/tvm/topi/detail/tensor_utils.h +++ b/include/tvm/topi/detail/tensor_utils.h @@ -89,6 +89,43 @@ inline PrimExpr bilinear_sample_nchw(const Tensor& input, const Array& D * x_lerp * y_lerp; } +/*! + * \brief Sample a point in a tensor using bilinear interpolation. + * + * \param input The input tensor. 
+ * \param indices The index of the target point, which can be fractional + * \param max_y The maximum of y dimension + * \param max_x The maximum of x dimension + * + * \return The interpolated value in the given index. + */ +inline PrimExpr bilinear_sample_nhwc(const Tensor& input, const Array& indices, + const PrimExpr max_y, const PrimExpr max_x) { + auto in_y = indices[1]; + auto yf = tvm::floor(in_y); + auto yc = tvm::cast(DataType::Int(32), tvm::ceil(in_y)); + + auto y0 = tvm::cast(DataType::Int(32), tvm::floor(in_y)); + auto y1 = tvm::if_then_else((yc > max_y), max_y, yc); + auto y_lerp = in_y - yf; + + auto in_x = indices[2]; + auto xf = tvm::floor(in_x); + auto xc = tvm::cast(DataType::Int(32), tvm::ceil(in_x)); + + auto x0 = tvm::cast(DataType::Int(32), tvm::floor(in_x)); + auto x1 = tvm::if_then_else((xc > max_x), max_x, xc); + auto x_lerp = in_x - xf; + + auto A = input(indices[0], y0, x0, indices[3]); + auto B = input(indices[0], y0, x1, indices[3]); + auto C = input(indices[0], y1, x0, indices[3]); + auto D = input(indices[0], y1, x1, indices[3]); + + return A * (1 - x_lerp) * (1 - y_lerp) + B * x_lerp * (1 - y_lerp) + C * (1 - x_lerp) * y_lerp + + D * x_lerp * y_lerp; +} + } // namespace detail } // namespace topi } // namespace tvm diff --git a/python/tvm/topi/generic/nn.py b/python/tvm/topi/generic/nn.py index 4bc3f97d850b..60ccd0d36abf 100644 --- a/python/tvm/topi/generic/nn.py +++ b/python/tvm/topi/generic/nn.py @@ -462,6 +462,24 @@ def schedule_deformable_conv2d_nchw(outs): return _default_schedule(outs, False) +def schedule_deformable_conv2d_nhwc(outs): + """Schedule for deformable_conv2d_nhwc. + We only use the default schedule here and rely on auto_scheduler. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of deformable_conv2d_nhwc + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + def schedule_bitserial_conv2d_nchw(outs): """Schedule for bitserial_conv2d_nchw diff --git a/python/tvm/topi/nn/deformable_conv2d.py b/python/tvm/topi/nn/deformable_conv2d.py index a8c2745b1c77..780530cbad79 100644 --- a/python/tvm/topi/nn/deformable_conv2d.py +++ b/python/tvm/topi/nn/deformable_conv2d.py @@ -21,7 +21,7 @@ from .utils import get_pad_tuple from ..utils import get_const_tuple -from ..cpp.utils import bilinear_sample_nchw +from ..cpp.utils import bilinear_sample_nchw, bilinear_sample_nhwc def deformable_conv2d_nchw( @@ -130,3 +130,111 @@ def _bilinear(n, c, h, w): ), tag="deformable_conv2d_nchw", ) + + +def deformable_conv2d_nhwc( + data, offset, kernel, strides, padding, dilation, deformable_groups, groups, out_dtype +): + """Deformable conv2D operator in NHWC layout. + + The deformable convolution operation is described in https://arxiv.org/abs/1703.06211 + + Parameters + ---------- + data : tvm.te.Tensor + 4-D with shape [batch, in_height, in_width, in_channel] + + offset : tvm.te.Tensor + 4-D with shape [batch, out_height, out_width, + deformable_groups * filter_height * filter_width * 2]. 
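+        Along the last axis the offsets are laid out as one (y, x) pair per
+        kernel position, grouped by deformable group; this matches the
+        indexing used in the compute body below.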
+
+    kernel : tvm.te.Tensor
+        4-D with shape [filter_height, filter_width, in_channel, num_filter]
+
+    strides : int or a list/tuple of two ints
+        stride size, or [stride_height, stride_width]
+
+    padding : int or a list/tuple of two ints
+        padding size, or [pad_height, pad_width]
+
+    dilation : int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
+    deformable_groups : int
+        number of deformable groups
+
+    groups : int
+        number of groups
+
+    out_dtype : str, optional
+        output data type; if None, defaults to the data type of ``data``
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        4-D with shape [batch, out_height, out_width, out_channel]
+    """
+    if out_dtype is None:
+        out_dtype = data.dtype
+
+    if isinstance(strides, int):
+        stride_h = stride_w = strides
+    else:
+        stride_h, stride_w = strides
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    batch, in_height, in_width, in_channel = get_const_tuple(data.shape)
+    kernel_h, kernel_w, channel, out_channel = get_const_tuple(kernel.shape)
+    _, out_height, out_width, _ = get_const_tuple(offset.shape)
+    assert in_channel % deformable_groups == 0, "Input channels must be divisible by deformable_groups"
+    assert groups == 1, "deformable_conv2d_nhwc does not support groups > 1"
+
+    ic_per_dgroup = channel // deformable_groups
+
+    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+    pad_top, pad_left, _, _ = get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w))
+    rc = te.reduce_axis((0, in_channel), name="rc")
+    ry = te.reduce_axis((0, kernel_h), name="ry")
+    rx = te.reduce_axis((0, kernel_w), name="rx")
+
+    zero = tvm.tir.const(0.0, data.dtype)
+
+    def _bilinear(n, h, w, c):
+        outside = tvm.tir.any(h < 0, w < 0, h >= in_height, w >= in_width)
+        val = bilinear_sample_nhwc(data, (n, h, w, c), in_height - 1, in_width - 1)
+        return tvm.tir.if_then_else(outside, zero, val)
+
+    data_deform = te.compute(
+        (batch, kernel_h, kernel_w, in_channel, out_height, out_width),
+        lambda n, kh, kw, c, y, x: _bilinear(
+            n,
+            y * stride_h
+            - pad_top
+            + kh * dilation_h
+            + offset[
+                n, y, x, c // ic_per_dgroup * (kernel_w * kernel_h * 2) + (kh * kernel_w + kw) * 2
+            ],
+            x * stride_w
+            - pad_left
+            + kw * dilation_w
+            + offset[
+                n,
+                y,
+                x,
+                c // ic_per_dgroup * (kernel_w * kernel_h * 2) + (kh * kernel_w + kw) * 2 + 1,
+            ],
+            c,
+        ),
+        tag="data_deform",
+    )
+    return te.compute(
+        (batch, out_height, out_width, out_channel),
+        lambda n, y, x, f: te.sum(
+            data_deform[n, ry, rx, rc, y, x].astype(out_dtype)
+            * kernel[ry, rx, rc, f].astype(out_dtype),
+            axis=[ry, rx, rc],
+        ),
+        tag="deformable_conv2d_nhwc",
+    )
diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py
index 065434499721..85f13a763c40 100644
--- a/python/tvm/topi/testing/__init__.py
+++ b/python/tvm/topi/testing/__init__.py
@@ -31,7 +31,7 @@
 from .conv2d_transpose_python import conv2d_transpose_nchw_python, conv2d_transpose_nhwc_python
 from .conv1d_transpose_ncw_python import conv1d_transpose_ncw_python
 from .correlation_nchw_python import correlation_nchw_python
-from .deformable_conv2d_nchw_python import deformable_conv2d_nchw_python
+from .deformable_conv2d_python import deformable_conv2d_nchw_python, deformable_conv2d_nhwc_python
 from .depthwise_conv2d_python import depthwise_conv2d_python_nchw, depthwise_conv2d_python_nhwc
 from .dilate_python import dilate_python
 from .softmax_python import softmax_python, log_softmax_python
diff --git a/python/tvm/topi/testing/deformable_conv2d_nchw_python.py
b/python/tvm/topi/testing/deformable_conv2d_python.py similarity index 74% rename from python/tvm/topi/testing/deformable_conv2d_nchw_python.py rename to python/tvm/topi/testing/deformable_conv2d_python.py index 6a7afb4b96f3..093084397ff1 100644 --- a/python/tvm/topi/testing/deformable_conv2d_nchw_python.py +++ b/python/tvm/topi/testing/deformable_conv2d_python.py @@ -119,3 +119,52 @@ def _bilinear(n, c, h, w): b_np[n, f, h, w] += np.tensordot(a_deform[n, c, h, w], w_np[f, c]) return b_np + + +def deformable_conv2d_nhwc_python( + a_np, offset_np, w_np, stride, padding, dilation, deformable_groups, groups +): + """Deformable convolution operator in NHWC layout. + + Parameters + ---------- + a_np : numpy.ndarray + 4-D with shape [batch, in_height, in_width, in_channel] + + offset_np : numpy.ndarray + 4-D with shape [batch, out_height, out_width, + deformable_groups * filter_height * filter_width * 2] + + w_np : numpy.ndarray + 4-D with shape [filter_height, filter_width, in_channel, num_filter] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or str or a list/tuple of 2 or 4 ints + Padding size, or ['VALID', 'SAME'], or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 2 ints + + dilation : int or a list/tuple of two ints + Dilation size, or [dilate_height, dilate_width] + + deformable_groups : int + Number of deformable groups + + groups : int + Number of groups + + Returns + ------- + b_np : np.ndarray + 4-D with shape [batch, out_channel, out_height, out_width] + """ + a_np = np.transpose(a_np, [0, 3, 1, 2]) # NHWC -> NCHW + offset_np = np.transpose(offset_np, [0, 3, 1, 2]) # NHWC -> NCHW + w_np = np.transpose(w_np, [3, 2, 0, 1]) # HWIO -> OIHW + b_np = deformable_conv2d_nchw_python( + a_np, offset_np, w_np, stride, padding, dilation, deformable_groups, groups + ) + b_np = np.transpose(b_np, [0, 2, 3, 1]) # NCHW -> NHWC + return b_np diff --git a/src/topi/schedule.cc b/src/topi/schedule.cc index c315d40be277..f9400bf59df6 100644 --- a/src/topi/schedule.cc +++ b/src/topi/schedule.cc @@ -190,6 +190,10 @@ TVM_REGISTER_GLOBAL("topi.utils.bilinear_sample_nchw").set_body([](TVMArgs args, *rv = detail::bilinear_sample_nchw(args[0], args[1], args[2], args[3]); }); +TVM_REGISTER_GLOBAL("topi.utils.bilinear_sample_nhwc").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = detail::bilinear_sample_nhwc(args[0], args[1], args[2], args[3]); +}); + /*! \brief Builder function for instantiating schedules. 
*/ using FTVMScheduleBuilder = std::function& outs)>; diff --git a/tests/python/topi/python/test_topi_deformable_conv2d.py b/tests/python/topi/python/test_topi_deformable_conv2d.py index 34bfae7bad68..cd6f33f14fd7 100644 --- a/tests/python/topi/python/test_topi_deformable_conv2d.py +++ b/tests/python/topi/python/test_topi_deformable_conv2d.py @@ -26,11 +26,15 @@ import tvm.testing -_deformable_conv2d_implement = { +_deformable_conv2d_nchw_implement = { "generic": (topi.nn.deformable_conv2d_nchw, topi.generic.schedule_deformable_conv2d_nchw), "cuda": (topi.cuda.deformable_conv2d_nchw, topi.cuda.schedule_deformable_conv2d_nchw), } +_deformable_conv2d_nhwc_implement = { + "generic": (topi.nn.deformable_conv2d_nhwc, topi.generic.schedule_deformable_conv2d_nhwc), +} + def verify_deformable_conv2d_nchw( batch, @@ -94,7 +98,7 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _deformable_conv2d_implement) + fcompute, fschedule = tvm.topi.testing.dispatch(device, _deformable_conv2d_nchw_implement) with tvm.target.Target(device): C = fcompute(A, Offset, W, stride, padding, dilation, deformable_groups, groups, dtype) s = fschedule([C]) @@ -112,6 +116,86 @@ def check_device(device): check_device(device) +def verify_deformable_conv2d_nhwc( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation=1, + deformable_groups=1, + groups=1, +): + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + deformable_groups, + groups, + ) + ) + + A = te.placeholder((batch, in_size, in_size, in_channel), name="A") + out_size = (in_size - (kernel - 1) * dilation - 1 + 2 * padding) // stride + 1 + Offset = te.placeholder( + (batch, out_size, out_size, deformable_groups * kernel * kernel * 2), name="offset" + ) + W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W") + bias = te.placeholder((num_filter,), name="bias") + + a_shape = get_const_tuple(A.shape) + offset_shape = get_const_tuple(Offset.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_deformable_conv2d_nchw.verify_deformable_conv2d_nhwc") + def get_ref_data(): + a_np = np.random.uniform(size=a_shape).astype(dtype) + offset_np = np.random.randn(*offset_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np = tvm.topi.testing.deformable_conv2d_nhwc_python( + a_np, offset_np, w_np, stride, padding, dilation, deformable_groups, groups + ) + + return a_np, offset_np, w_np, c_np + + a_np, offset_np, w_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + fcompute, fschedule = tvm.topi.testing.dispatch(device, _deformable_conv2d_nhwc_implement) + with tvm.target.Target(device): + C = fcompute(A, Offset, W, stride, padding, dilation, deformable_groups, groups, dtype) + s = fschedule([C]) + + a = tvm.nd.array(a_np, ctx) + offset = tvm.nd.array(offset_np, ctx) + w = tvm.nd.array(w_np, ctx) + c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, ctx=ctx) + + func = tvm.build(s, [A, Offset, W, C], device) + func(a, offset, w, c) + 
tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in ["llvm"]:
+        check_device(device)
+
+
 @tvm.testing.uses_gpu
 def test_deformable_conv2d_nchw():
     verify_deformable_conv2d_nchw(1, 16, 7, 16, 1, 1, 0, deformable_groups=4)
@@ -119,5 +203,12 @@ def test_deformable_conv2d_nchw():
     verify_deformable_conv2d_nchw(1, 16, 7, 16, 3, 1, 2, dilation=2)
 
 
+def test_deformable_conv2d_nhwc():
+    verify_deformable_conv2d_nhwc(1, 16, 7, 16, 1, 1, 0, deformable_groups=4)
+    verify_deformable_conv2d_nhwc(1, 16, 7, 16, 3, 1, 1, dilation=2, deformable_groups=4)
+    verify_deformable_conv2d_nhwc(1, 16, 7, 16, 3, 1, 2, dilation=2)
+
+
 if __name__ == "__main__":
     test_deformable_conv2d_nchw()
+    test_deformable_conv2d_nhwc()

From bbf2c7eaa72499ff9694773e0af5d8eba4b2002e Mon Sep 17 00:00:00 2001
From: CaramelFc <64790599+CaramelFc@users.noreply.github.com>
Date: Wed, 2 Dec 2020 04:02:24 +0800
Subject: [PATCH 234/258] Fix call mkl gemm in mkldnn.py (#7007)

Co-authored-by: zhangfucheng
---
 python/tvm/contrib/mkldnn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/contrib/mkldnn.py b/python/tvm/contrib/mkldnn.py
index 04af30070293..8d5f4da0345b 100644
--- a/python/tvm/contrib/mkldnn.py
+++ b/python/tvm/contrib/mkldnn.py
@@ -45,7 +45,7 @@ def matmul(lhs, rhs, transa=False, transb=False, **kwargs):
         (n, m),
         [lhs, rhs],
         lambda ins, outs: tvm.tir.call_packed(
-            "tvm.contrib.mkl.matmul", ins[0], ins[1], outs[0], transa, transb
+            "tvm.contrib.mkldnn.matmul", ins[0], ins[1], outs[0], transa, transb
         ),
         name="C",
         **kwargs,

From dcbccdea4fe91820621cd6c6db4574032d171716 Mon Sep 17 00:00:00 2001
From: Trevor Morris
Date: Tue, 1 Dec 2020 22:04:43 -0800
Subject: [PATCH 235/258] Use channels from attrs if possible (#7011)

---
 src/runtime/contrib/tensorrt/tensorrt_ops.cc | 4 ++++
 tests/python/contrib/test_tensorrt.py        | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
index 057743c3b588..c3ff1c45f50e 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
@@ -243,6 +243,10 @@ class Conv2DOpConverter : public TensorRTOpConverter {
     auto str_padding = params->node.GetAttr<std::vector<std::string>>("padding");
     int groups = std::stoi(params->node.GetAttr<std::vector<std::string>>("groups")[0]);
    int channels = weight_shape[0];
+    if (params->node.HasAttr("channels") &&
+        !params->node.GetAttr<std::vector<std::string>>("channels")[0].empty()) {
+      channels = std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
+    }
     // TRT conv2d op doesn't support asymmetric padding before 5.1, so we
     // work around it by adding a padding layer before the convolution op.
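+    // NOTE: the begin (top/left) and end (bottom/right) padding amounts are
+    // tracked separately below so that the asymmetric case can be expressed.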
nvinfer1::DimsHW prepadding, postpadding;
diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py
index 10c311a6d363..de9822289528 100644
--- a/tests/python/contrib/test_tensorrt.py
+++ b/tests/python/contrib/test_tensorrt.py
@@ -352,6 +352,7 @@ def get_graph(
         padding=(0, 0),
         strides=(1, 1),
         dilation=(1, 1),
+        channels=None,
     ):
         x = relay.var("x", shape=(x_shape), dtype="float32")
         kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
@@ -363,6 +364,7 @@ def get_graph(
             padding=padding,
             strides=strides,
             dilation=dilation,
+            channels=channels,
         )
         f = relay.Function([x, kernel], out)
         return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
@@ -380,6 +382,9 @@ def get_graph(
                     dilation=dilation,
                 )
             )
+    run_and_verify_func(
+        get_graph((1, 3, 16, 16), (3, 8, 7, 7), 3, [2, 2, 3, 3], [2, 2], [1, 1], 24)
+    )
 
 
 def test_conv2d_nhwc():

From dd893005bb07ce596c35186170461fa860fbf41f Mon Sep 17 00:00:00 2001
From: Andrew Reusch
Date: Wed, 2 Dec 2020 05:55:45 -0800
Subject: [PATCH 236/258] =?UTF-8?q?[=C2=B5TVM]=20Minor=20fixes=20to=20the?=
 =?UTF-8?q?=20Reference=20VM=20tutorial=20(#7012)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add recommendation to install vbguest plugin.

* Update directories to match checked-in.
---
 tutorials/micro/micro_reference_vm.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tutorials/micro/micro_reference_vm.py b/tutorials/micro/micro_reference_vm.py
index dec1be9779fd..4b449a0e7e14 100644
--- a/tutorials/micro/micro_reference_vm.py
+++ b/tutorials/micro/micro_reference_vm.py
@@ -61,7 +61,8 @@
 1. `Vagrant <https://vagrantup.com>`__
 2. A supported Virtual Machine hypervisor.
    `VirtualBox <https://www.virtualbox.org>`__ is one suggested free hypervisor, but please note
-   that the `VirtualBox Extension Pack`_ is required for proper USB forwarding.
+   that the `VirtualBox Extension Pack`_ is required for proper USB forwarding. If using VirtualBox,
+   also consider installing the `vbguest <https://github.com/dotless-de/vagrant-vbguest>`_ plugin.
 
 .. _VirtualBox Extension Pack: https://www.virtualbox.org/wiki/Downloads#VirtualBox6.1.16OracleVMVirtualBoxExtensionPack
 
@@ -72,9 +73,10 @@
 
 .. code-block:: bash
 
-    ~/.../tvm $ cd apps/microtvm-vm
+    # Replace zephyr with the name of a different platform, if you are not using Zephyr.
+    ~/.../tvm $ cd apps/microtvm/reference-vm/zephyr
     # Replace <provider> with the name of the hypervisor you wish to use (i.e. virtualbox).
- ~/.../tvm/apps/microtvm/vm $ vagrant up --provider= + ~/.../tvm/apps/microtvm/reference-vm/zephyr $ vagrant up --provider= This command will take a couple of minutes to run and will require 4 to 5GB of storage on your From 0f24128ceb7f125b9d33231557848a6f02d54a69 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Wed, 2 Dec 2020 18:14:12 +0000 Subject: [PATCH 237/258] Update dmlc_tvm_commit_id.txt --- dmlc_tvm_commit_id.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc_tvm_commit_id.txt b/dmlc_tvm_commit_id.txt index 3476b5d864e2..bce5cf01ffce 100644 --- a/dmlc_tvm_commit_id.txt +++ b/dmlc_tvm_commit_id.txt @@ -1 +1 @@ -7d805b54d6adda82636d13bf7c46a2e9a933da5f \ No newline at end of file +636739af8d14c864d263a55323acc6c530497588 \ No newline at end of file From d5076579d253ea9db4925a590fd3de1d48dfdd04 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Wed, 2 Dec 2020 18:27:36 +0000 Subject: [PATCH 238/258] Fix python formatting --- python/tvm/topi/intel_graphics/conv2d.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/topi/intel_graphics/conv2d.py b/python/tvm/topi/intel_graphics/conv2d.py index fa1fd776b79c..bdbde91918dd 100644 --- a/python/tvm/topi/intel_graphics/conv2d.py +++ b/python/tvm/topi/intel_graphics/conv2d.py @@ -144,6 +144,7 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None s[tensor].bind(xi, thread_x) return xi, thread_z, thread_y, thread_x + def _pack_data(data, kernel, ic_bn, oc_bn): n, _, ih, iw = get_const_tuple(data.shape) oc, ic, kh, kw = get_const_tuple(kernel.shape) From c43b2c53c1e652206dfcb93343e365eec4ede2aa Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Wed, 2 Dec 2020 18:20:26 +0000 Subject: [PATCH 239/258] Remove duplicate shape func for 'where' Remove duplicate shape function for 'copy' --- python/tvm/relay/op/_tensor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 5675b28c713a..b7ae715ff597 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -278,6 +278,4 @@ def elemwise_shape_func(attrs, inputs, _): register_shape_func("sigmoid", False, elemwise_shape_func) register_shape_func("isnan", False, elemwise_shape_func) register_shape_func("isinf", False, elemwise_shape_func) -register_shape_func("where", False, elemwise_shape_func) -register_shape_func("copy", False, elemwise_shape_func) register_shape_func("logical_not", False, elemwise_shape_func) From 5eabee805a8b9a48483e954d75b5c7dc5c0e7f1c Mon Sep 17 00:00:00 2001 From: Luis Vega Date: Wed, 2 Dec 2020 12:26:23 -0800 Subject: [PATCH 240/258] [Backend][Verilator] regression tests (#7000) * add files * update tests * test this * test this case * update jenkins file * fix offload * update * update variables * rollback ci files --- .../python/contrib/test_verilator/__init__.py | 18 ++++ .../contrib/test_verilator/infrastructure.py | 83 +++++++++++++++++++ .../test_verilator/test_verilator_codegen.py | 67 +++++++++++++++ 3 files changed, 168 insertions(+) create mode 100644 tests/python/contrib/test_verilator/__init__.py create mode 100644 tests/python/contrib/test_verilator/infrastructure.py create mode 100644 tests/python/contrib/test_verilator/test_verilator_codegen.py diff --git a/tests/python/contrib/test_verilator/__init__.py b/tests/python/contrib/test_verilator/__init__.py new file mode 100644 index 000000000000..4838dc3f4371 --- /dev/null +++ b/tests/python/contrib/test_verilator/__init__.py @@ -0,0 +1,18 @@ +# Licensed to 
the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Infrastructure and tests for Verilator codegen """
diff --git a/tests/python/contrib/test_verilator/infrastructure.py b/tests/python/contrib/test_verilator/infrastructure.py
new file mode 100644
index 000000000000..1333f484aec9
--- /dev/null
+++ b/tests/python/contrib/test_verilator/infrastructure.py
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Verilator utility functions"""
+
+import sys
+
+import tvm
+from tvm import relay
+import tvm.relay.testing
+from tvm import runtime
+from tvm.relay import transform
+
+
+def _register_verilator_op(op_name, supported=True):
+    """The helper function to indicate that a given operator can be supported by Verilator.
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the operator that will be registered.
+
+    supported : bool
+        Whether the operator is supported; defaults to True.
+
+    Returns
+    -------
+    f : callable
+        A function that returns whether the operator is supported by Verilator.
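+
+    Example
+    -------
+    Marking an operator as supported (this mirrors the registration done in
+    the codegen test added by this patch):
+
+    .. code-block:: python
+
+        _register_verilator_op("add")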
+ """ + + @tvm.ir.register_op_attr(op_name, "target.verilator") + def _func_wrapper(expr): + return supported + + return _func_wrapper + + +def skip_test(): + """Skip test if it requires the Verilator codegen and it's not present.""" + if not tvm.get_global_func("relay.ext.verilator", True): + print("Skip test because Verilator codegen is not available.") + return True + if sys.platform == "win32": + print("Skip test on Windows for now") + return True + return False + + +def offload(mod): + """Offload ops based on the registered ops""" + + backend = "verilator" + mod = transform.AnnotateTarget([backend])(mod) + mod = transform.PartitionGraph()(mod) + return mod + + +def compile_module(mod): + """Compile Relay module""" + + with relay.build_config(opt_level=3): + exe = relay.vm.compile(mod, target="llvm", params=None) + code, lib = exe.save() + return runtime.vm.Executable.load_exec(code, lib) + + +def run_module(exe, inputs): + """Run Relay module""" + + ctx = tvm.cpu() + vm = runtime.vm.VirtualMachine(exe, ctx) + return vm.run(**inputs) diff --git a/tests/python/contrib/test_verilator/test_verilator_codegen.py b/tests/python/contrib/test_verilator/test_verilator_codegen.py new file mode 100644 index 000000000000..664e254041b2 --- /dev/null +++ b/tests/python/contrib/test_verilator/test_verilator_codegen.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Verilator codegen tests""" + +import numpy as np + +import tvm +from tvm import relay + +from test_verilator.infrastructure import ( + _register_verilator_op, + skip_test, + compile_module, + run_module, + offload, +) + + +_register_verilator_op("add") + + +def create_module_add(shape, dtype): + x = relay.var("x", shape=shape, dtype=dtype) + y = relay.var("y", shape=shape, dtype=dtype) + z = relay.add(x, y) + f = relay.Function([x, y], z) + mod = tvm.IRModule() + mod["main"] = f + return mod + + +def run_check_add(exe, shape, dtype): + x_data = np.random.randint(5, size=shape, dtype=dtype) + y_data = np.random.randint(5, size=shape, dtype=dtype) + ref = x_data + y_data + inputs = {"x": x_data, "y": y_data} + out = run_module(exe, inputs) + tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5, atol=1e-5) + + +def test_add(): + if skip_test(): + return + dtype = "int32" + shape = (8, 4) + mod = create_module_add(shape, dtype) + mod = offload(mod) + exe = compile_module(mod) + run_check_add(exe, shape, dtype) + + +if __name__ == "__main__": + test_add() From 699fcb7f915b26bde0aec1586cf5f77dc080ddf0 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Wed, 2 Dec 2020 17:07:41 -0800 Subject: [PATCH 241/258] =?UTF-8?q?[=C2=B5TVM]=20Modify=20reference=20VMs?= =?UTF-8?q?=20to=20support=20new=20=C2=B5TVM=20demo=20(#7001)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/microtvm/reference-vm/base-box-tool.py | 33 +++++++++++++++---- apps/microtvm/reference-vm/zephyr/Vagrantfile | 4 +++ .../reference-vm/zephyr/base-box/setup.sh | 11 ++++--- .../zephyr/base-box/test-config.json | 4 +++ .../reference-vm/zephyr/pyproject.toml | 5 +-- .../reference-vm/zephyr/rebuild-tvm.sh | 1 + apps/microtvm/reference-vm/zephyr/setup.sh | 9 +++-- 7 files changed, 51 insertions(+), 16 deletions(-) create mode 100644 apps/microtvm/reference-vm/zephyr/base-box/test-config.json diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py index 12aded6e63c6..c317a373bd8b 100755 --- a/apps/microtvm/reference-vm/base-box-tool.py +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -232,7 +232,8 @@ def do_build_release_test_vm(release_test_dir, user_box_dir, base_box_dir, provi box_package = os.path.join( base_box_dir, f"output-packer-{provider_name}", "package.box" ) - f.write(f'{m.group(1)} = "{os.path.relpath(box_package, release_test_dir)}"\n') + box_relpath = os.path.relpath(box_package, release_test_dir) + f.write(f'{m.group(1)} = "{box_relpath}"\n') found_box_line = True if not found_box_line: @@ -242,6 +243,10 @@ def do_build_release_test_vm(release_test_dir, user_box_dir, base_box_dir, provi ) return False + # Delete the old box registered with Vagrant, which may lead to a falsely-passing release test. 
+    remove_args = ["vagrant", "box", "remove", box_relpath]
+    return_code = subprocess.call(remove_args, cwd=release_test_dir)
+    assert return_code in (0, 1), f'{" ".join(remove_args)} returned exit code {return_code}'
     subprocess.check_call(["vagrant", "up", f"--provider={provider_name}"], cwd=release_test_dir)
 
     return True
@@ -281,7 +286,7 @@ def test_command(args):
     test_config["vid_hex"] = test_config["vid_hex"].lower()
     test_config["pid_hex"] = test_config["pid_hex"].lower()
 
-    providers = args.provider.split(",")
+    providers = args.provider
     provider_passed = {p: False for p in providers}
 
     release_test_dir = os.path.join(THIS_DIR, "release-test")
@@ -313,11 +318,20 @@ def test_command(args):
 
 
 def release_command(args):
-    # subprocess.check_call(["vagrant", "cloud", "version", "create", f"tlcpack/microtvm-{args.platform}", args.version])
-    if not args.version:
-        sys.exit(f"--version must be specified")
+    subprocess.check_call(
+        [
+            "vagrant",
+            "cloud",
+            "version",
+            "create",
+            f"tlcpack/microtvm-{args.platform}",
+            args.release_version,
+        ]
+    )
+    if not args.release_version:
+        sys.exit(f"--release-version must be specified")
 
-    for provider_name in args.provider.split(","):
+    for provider_name in args.provider:
         subprocess.check_call(
             [
                 "vagrant",
                 "cloud",
                 "publish",
                 "-f",
                 f"tlcpack/microtvm-{args.platform}",
-                args.version,
+                args.release_version,
                 provider_name,
                 os.path.join(
                     THIS_DIR,
@@ -361,6 +375,8 @@ def parse_args():
     parser.add_argument(
         "--provider",
         choices=ALL_PROVIDERS,
+        action="append",
+        default=[],
         help="Name of the provider or providers to act on; if not specified, act on all",
     )
     parser.add_argument(
@@ -392,6 +408,9 @@ def main():
     if os.path.sep in args.platform or not os.path.isdir(os.path.join(THIS_DIR, args.platform)):
         sys.exit(f"<platform> must be a sub-directory of {THIS_DIR}; got {args.platform}")
 
+    if not args.provider:
+        args.provider = list(ALL_PROVIDERS)
+
     todo = []
     for phase in args.command.split(","):
         if phase not in ALL_COMMANDS:
diff --git a/apps/microtvm/reference-vm/zephyr/Vagrantfile b/apps/microtvm/reference-vm/zephyr/Vagrantfile
index bb0c4eaac2c8..5a73d1f5e79b 100644
--- a/apps/microtvm/reference-vm/zephyr/Vagrantfile
+++ b/apps/microtvm/reference-vm/zephyr/Vagrantfile
@@ -20,6 +20,10 @@ Vagrant.configure("2") do |config|
   tvm_home = "../../../.."
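+  # Besides the TVM tree itself, optionally mount an extra project directory
+  # supplied via the TVM_PROJECT_DIR environment variable (handled below).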
dirs_to_mount = [Pathname.new(Pathname.new(tvm_home).expand_path())] + if ENV.has_key?("TVM_PROJECT_DIR") then + dirs_to_mount.append(ENV["TVM_PROJECT_DIR"]) + puts "NOTE: also configuring project dir: %s" % [dirs_to_mount[-1]] + end git_file = Pathname.new(tvm_home + "/.git") if git_file.ftype() == "file" then diff --git a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh index a89d650995bc..fd758064f4ca 100644 --- a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh +++ b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh @@ -27,10 +27,13 @@ wget --no-verbose https://apt.kitware.com/keys/kitware-archive-latest.asc sudo apt-key add kitware-archive-latest.asc sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' sudo apt update -sudo apt install -y --no-install-recommends git cmake ninja-build gperf \ - ccache dfu-util device-tree-compiler wget \ - python3-dev python3-pip python3-setuptools python3-tk python3-wheel xz-utils file \ - make gcc gcc-multilib g++-multilib libsdl2-dev +# NOTE: latest cmake cannot be installed due to +# https://github.com/zephyrproject-rtos/zephyr/issues/30232 +sudo apt install -y --no-install-recommends git \ + cmake=3.18.4-0kitware1 cmake-data=3.18.4-0kitware1 \ + ninja-build gperf ccache dfu-util device-tree-compiler wget \ + python3-dev python3-pip python3-setuptools python3-tk python3-wheel xz-utils file \ + make gcc gcc-multilib g++-multilib libsdl2-dev # Avahi, so that ssh microtvm works. # apt install -y avahi-daemon diff --git a/apps/microtvm/reference-vm/zephyr/base-box/test-config.json b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json new file mode 100644 index 000000000000..78a6bd216e65 --- /dev/null +++ b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json @@ -0,0 +1,4 @@ +{"vid_hex": "0483", + "pid_hex": "374b", + "test_cmd": ["pytest", "tests/micro/qemu/test_zephyr.py", "--microtvm-platforms=stm32f746xx"] +} diff --git a/apps/microtvm/reference-vm/zephyr/pyproject.toml b/apps/microtvm/reference-vm/zephyr/pyproject.toml index f1c15ee5c890..ed8182584e36 100644 --- a/apps/microtvm/reference-vm/zephyr/pyproject.toml +++ b/apps/microtvm/reference-vm/zephyr/pyproject.toml @@ -92,8 +92,9 @@ onnx = {version = "1.6.0", optional = true} onnxruntime = {version = "1.0.0", optional = true} # Pytorch (also used by ONNX) -torch = {version = "1.4.0", optional = true} -torchvision = {version = "0.5.0", optional = true} +# NOTE: cannot download this right now due to https://github.com/python-poetry/poetry/issues/2247 +# torch = {url = "https://download.pytorch.org/whl/cu101/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl", optional = true} +# torchvision = {version = "0.5.0", optional = true} # NOTE: torch depends on a number of other packages, but unhelpfully, does not expose that in the # wheel!!! future = {version = "*", optional = true} diff --git a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh index 9442947438a9..df833042c670 100755 --- a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh +++ b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh @@ -28,6 +28,7 @@ fi cp cmake/config.cmake "${BUILD_DIR}" cd "${BUILD_DIR}" sed -i 's/USE_MICRO OFF/USE_MICRO ON/' config.cmake +sed -i 's/USE_GRAPH_RUNTIME_DEBUG OFF/USE_GRAPH_RUNTIME_DEBUG ON/' config.cmake sed -i 's/USE_LLVM OFF/USE_LLVM ON/' config.cmake cmake .. 
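+# config.cmake was toggled by the sed edits above (USE_MICRO, the newly
+# added USE_GRAPH_RUNTIME_DEBUG, and USE_LLVM); cmake has regenerated the
+# build files, so the rebuild below picks those options up.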
make -j4 diff --git a/apps/microtvm/reference-vm/zephyr/setup.sh b/apps/microtvm/reference-vm/zephyr/setup.sh index 6e87c1fa4eb9..053e41e85256 100644 --- a/apps/microtvm/reference-vm/zephyr/setup.sh +++ b/apps/microtvm/reference-vm/zephyr/setup.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -set -e +set -ex # TVM # NOTE: TVM is presumed to be mounted already by Vagrantfile. @@ -26,13 +26,16 @@ apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh cd apps/microtvm/reference-vm/zephyr +poetry env use 3.6 +# NOTE: due to https://github.com/python-poetry/poetry/issues/2247, download torch here. +poetry run pip3 install torch==1.4.0 torchvision==0.5.0 + echo "------------------------------[ TVM Message ]------------------------------" echo "WARNING: running 'poetry lock', which could take several minutes (depending" echo "on your network connection and the state of PyPI) as dependencies are" echo "downloaded and cached for future use." echo "------------------------------[ TVM Message ]------------------------------" - -poetry lock +poetry lock -vvv poetry install poetry run pip3 install -r ~/zephyr/zephyr/scripts/requirements.txt From c0c4b46f0659044110c328d26db8f1cdff589a20 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 2 Dec 2020 18:05:35 -0800 Subject: [PATCH 242/258] [AutoScheduler] Support layout rewrite for whole networks (#6987) * [AutoScheduler] Add layout rewrite pass in relay * fix * fix lint * fix attrs * trigger CI * Apply suggestions from code review * trigger CI * Update python/tvm/auto_scheduler/relay_integration.py * Update python/tvm/auto_scheduler/relay_integration.py * Update python/tvm/auto_scheduler/compute_dag.py * Trigger CI * Apply suggestions from code review --- include/tvm/ir/transform.h | 7 + include/tvm/relay/attrs/nn.h | 1 + include/tvm/relay/attrs/transform.h | 14 ++ include/tvm/relay/transform.h | 14 ++ include/tvm/topi/transform.h | 68 ++++++++ python/tvm/auto_scheduler/__init__.py | 2 +- python/tvm/auto_scheduler/compute_dag.py | 17 ++ python/tvm/auto_scheduler/measure.py | 4 +- .../tvm/auto_scheduler/relay_integration.py | 103 ++++++++++- python/tvm/relay/op/_transform.py | 2 + python/tvm/relay/op/strategy/generic.py | 15 +- python/tvm/relay/op/strategy/x86.py | 3 +- python/tvm/te/tensor.py | 2 +- python/tvm/topi/nn/conv2d.py | 41 ++++- src/auto_scheduler/compute_dag.cc | 20 ++- src/ir/transform.cc | 54 +++--- src/relay/backend/build_module.cc | 17 ++ src/relay/backend/compile_engine.cc | 26 ++- src/relay/backend/compile_engine.h | 9 + src/relay/backend/utils.h | 9 + src/relay/op/make_op.h | 2 + src/relay/op/nn/convolution.h | 12 +- src/relay/op/tensor/transform.cc | 50 +++++- .../auto_scheduler_layout_rewrite.cc | 160 ++++++++++++++++++ .../auto_scheduler_layout_rewrite.h | 49 ++++++ .../test_auto_scheduler_layout_rewrite.py | 121 +++++++++++++ 26 files changed, 751 insertions(+), 71 deletions(-) create mode 100644 src/relay/transforms/auto_scheduler_layout_rewrite.cc create mode 100644 src/relay/transforms/auto_scheduler_layout_rewrite.h create mode 100644 tests/python/relay/test_auto_scheduler_layout_rewrite.py diff --git a/include/tvm/ir/transform.h b/include/tvm/ir/transform.h index d2931123073b..56905ded5201 100644 --- a/include/tvm/ir/transform.h +++ b/include/tvm/ir/transform.h @@ -197,6 +197,13 @@ class PassContext : public ObjectRef { */ TVM_DLL void Trace(const IRModule& module, const PassInfo& info, bool is_before) const; + /*! + * \brief Check whether a pass is enabled. 
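+   *
+   * A pass named in `disabled_pass` is always disabled; otherwise a pass
+   * named in `required_pass` is always enabled; any other pass is enabled
+   * only when the context's `opt_level` is at least the pass's own
+   * `opt_level`.
+   *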
+ * \param info The pass information. + * \return true if the pass is enabled. Otherwise, false. + */ + TVM_DLL bool PassEnabled(const PassInfo& info) const; + /*! * \brief Register a valid configuration option and its ValueType for validation. * diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index e697ac45bd12..f8aa1fc508b6 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -120,6 +120,7 @@ struct Conv2DAttrs : public tvm::AttrsNode { tvm::String data_layout; tvm::String kernel_layout; tvm::String out_layout; + std::string auto_scheduler_rewritten_layout; DataType out_dtype; TVM_DECLARE_ATTRS(Conv2DAttrs, "relay.attrs.Conv2DAttrs") { diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index 5c0f6f001d2b..b64070781523 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -366,6 +366,20 @@ struct LayoutTransformAttrs : public tvm::AttrsNode { } }; +/*! \brief Attributes for AutoSchedulerLayoutTransform operator */ +struct AutoSchedulerLayoutTransformAttrs + : public tvm::AttrsNode { + std::string src_layout; + std::string dst_layout; + + TVM_DECLARE_ATTRS(AutoSchedulerLayoutTransformAttrs, + "relay.attrs.AutoSchedulerLayoutTransformAttrs") { + TVM_ATTR_FIELD(src_layout).describe("The source layout of the tensor. (e.g. 1N32C112H112W)"); + TVM_ATTR_FIELD(dst_layout) + .describe("The destination layout of the tensor. (e.g. 1N2C112H112W16c)"); + } +}; + /*! \brief Attributes for ShapeOf operator */ struct ShapeOfAttrs : public tvm::AttrsNode { DataType dtype; diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index a9a45b5f101a..e4b39da85206 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -106,6 +106,14 @@ TVM_DLL Pass FoldConstant(); */ TVM_DLL Pass FuseOps(int fuse_opt_level = -1); +/*! + * \brief The inverse operation of FuseOps. It transforms a fused program returned by + * FuseOps into the program before FuseOps. (i.e. x == DefuseOps(FuseOps(x))) + * + * \return The pass. + */ +TVM_DLL Pass DefuseOps(); + /*! * \brief Rewrite the annotated program. * @@ -315,6 +323,12 @@ TVM_DLL Pass CanonicalizeOps(); */ TVM_DLL Pass AlterOpLayout(); +/*! + * \brief Do layout rewrite according to the tile structure created by auto-scheduler. + * \return The pass + */ +TVM_DLL Pass AutoSchedulerLayoutRewrite(); + /*! * \brief Given a dest layout, this pass transforms the expr such that most of the ops input data * layout is changed to the dest layout. In ideal situation, there are only 2 layout transforms, one diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index c866dfb7f86b..c2a4843dedd0 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1400,6 +1400,74 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, name, tag); } +/*! \brief Utility function for auto_scheduler_layout_transform */ +inline void parse_auto_scheduler_layout(const String& layout, Array* shape, + std::vector* axes) { + int32_t factor = 0; + std::string axis = ""; + for (char c : std::string(layout)) { + if (c >= 'A' && c <= 'z') { + axis += c; + if (factor != 0) { + shape->push_back(factor); + factor = 0; + } + } else if (c >= '0' && c <= '9') { + factor = factor * 10 + c - '0'; + if (!axis.empty()) { + axes->push_back(axis); + axis = ""; + } + } else { + LOG(FATAL) << "Invalid layout " << layout; + } + } + if (!axis.empty()) { + axes->push_back(axis); + } +} + +/*! 
+ * \brief Transform the auto-scheduler generated layout according to + * \p src_layout and \p dst_layout + * \param src the source input. + * \param src_layout the source layout. + * \param dst_layout the destination layout. + * \param name output tensor name. + * \param tag output tensor tag. + * \return A tensor with shape in \p dst_layout + */ +inline Tensor auto_scheduler_layout_transform(const Tensor& src, const String& src_layout, + const String& dst_layout, + const String name = "T_auto_scheduler_layout_trans", + const String tag = kInjective) { + Array src_shape; + std::vector src_axes; + Array dst_shape; + std::vector dst_axes; + + parse_auto_scheduler_layout(src_layout, &src_shape, &src_axes); + parse_auto_scheduler_layout(dst_layout, &dst_shape, &dst_axes); + return compute( + dst_shape, + [&](const Array& dst_indices) { + Array dst_indices_expr(dst_indices.begin(), dst_indices.end()); + Array src_indices; + for (const std::string& src_axis : src_axes) { + PrimExpr src_index = 0; + CHECK_EQ(dst_indices_expr.size(), dst_axes.size()); + for (size_t i = 0; i < dst_axes.size(); ++i) { + if (dst_axes[i] == src_axis) { + src_index = src_index * dst_shape[i] + dst_indices_expr[i]; + } + } + src_indices.push_back(src_index); + } + return src(src_indices); + }, + name, tag); +} + /*! * \brief Get the shape of input tensor. * \param src the input tensor. diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py index f0d076e75f02..5bf2335ec7cf 100644 --- a/python/tvm/auto_scheduler/__init__.py +++ b/python/tvm/auto_scheduler/__init__.py @@ -44,7 +44,7 @@ LocalRPCMeasureContext, ) from .measure_record import RecordToFile, RecordReader, load_best, load_records, save_records -from .relay_integration import extract_tasks +from .relay_integration import extract_tasks, remove_index_check, rewrite_compute_body from .search_task import SearchTask from .search_policy import EmptyPolicy, SketchPolicy, PreloadMeasuredStates from .task_scheduler import TaskScheduler diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py index 3427709d819a..cba3600ccf6e 100755 --- a/python/tvm/auto_scheduler/compute_dag.py +++ b/python/tvm/auto_scheduler/compute_dag.py @@ -162,6 +162,23 @@ def infer_bound_from_state(self, state): updated_state.stage_id_map[k] = v return updated_state + def rewrite_layout_from_state(self, state): + """ + Rewrite the layout of the DAG according to the history transform steps of a state. + + Parameters + ---------- + state : Union[State, StateObject] + The state from which we get transform steps. + + Returns + ------- + updated_dag : ComputeDAG + The compute dag with rewritten layout. + """ + state_obj = state if isinstance(state, StateObject) else state.state_object + return _ffi_api.ComputeDAGRewriteLayoutFromState(self, state_obj) + def hash_key(self): """Return the hash key of this compute DAG. 
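The packed layout strings above, e.g. "1N32C112H112W", interleave an extent with each axis name, and an axis name may span several letters. As a rough Python port of topi::parse_auto_scheduler_layout -- an illustrative sketch, not code from this patch, and parse_layout is a hypothetical name -- the decoding works as follows:

def parse_layout(layout):
    """Split a packed layout string into extents and axis names.

    Mirrors topi::parse_auto_scheduler_layout: a run of digits accumulates
    into an extent, a run of letters into an axis name, and each switch
    between the two flushes the token that just finished.
    """
    shape, axes = [], []
    factor, axis = 0, ""
    for c in layout:
        if c.isalpha():
            axis += c
            if factor != 0:
                shape.append(factor)
                factor = 0
        elif c.isdigit():
            factor = factor * 10 + int(c)
            if axis:
                axes.append(axis)
                axis = ""
        else:
            raise ValueError("Invalid layout " + layout)
    if axis:
        axes.append(axis)
    return shape, axes


assert parse_layout("1N32C112H112W") == ([1, 32, 112, 112], ["N", "C", "H", "W"])

AutoSchedulerLayoutTransformRel, added later in this patch, runs the same parse on dst_layout to infer the output tensor type, which is why the op needs no explicit output-shape attribute.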
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 117cd4f8bc71..b9d7148be784 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -544,7 +544,9 @@ def _timed_func(inp_serialized, build_func, verbose): args = [] try: - sch, args = task.compute_dag.apply_steps_from_state(inp.state, layout_rewrite=True) + sch, args = task.compute_dag.apply_steps_from_state( + inp.state, layout_rewrite=ComputeDAG.RewriteForPreTransformed + ) # pylint: disable=broad-except except Exception: error_no = MeasureErrorNo.INSTANTIATION_ERROR diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 6864bcce66e3..25b88811709e 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -23,11 +23,15 @@ """ import logging +import json import threading import tvm from tvm import autotvm, te, transform -from tvm.te.tensor import ComputeOp, PlaceholderOp +from tvm.runtime import convert_to_object +from tvm.te.tensor import ComputeOp, PlaceholderOp, Tensor +from tvm.tir import expr as _expr +from . import _ffi_api from .compute_dag import ComputeDAG from .dispatcher import DispatchContext from .search_task import SearchTask @@ -46,7 +50,11 @@ def call_all_topi_funcs(mod, params, target): old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent autotvm.GLOBAL_SCOPE.silent = True - with transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + with transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + disabled_pass={"AutoSchedulerLayoutRewrite"}, + ): opt_mod, _ = relay.optimize(mod, target, params) grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) grc.codegen(opt_mod["main"]) @@ -158,6 +166,20 @@ def add_workload_key(self, workload_key, ccache_key): self.wkl_key_to_ccache_key[workload_key] = ccache_key +@tvm._ffi.register_func("auto_scheduler.enter_layout_rewrite") +def enter_layout_rewrite(): + """Enter layout rewrite tracing environment""" + env = TracingEnvironment(TracingMode.PREPARE_LAYOUT_REWRITE) + env.__enter__() + + +@tvm._ffi.register_func("auto_scheduler.exit_layout_rewrite") +def exit_layout_rewrite(): + """Exit layout rewrite tracing environment""" + env = TracingEnvironment.current + env.__exit__(None, None, None) + + def traverse_to_get_io_tensors(outs): """Traverse from a list of output tensors to get both input and output tensors @@ -230,11 +252,13 @@ def auto_schedule_topi(outs, has_complex_op): key = register_workload_tensors(dag.hash_key(), io_tensors) # only enable layout rewrite for cpu backend - enable_layout_rewrite = "cpu" in tvm.target.Target.current().keys + target = tvm.target.Target.current() + enable_layout_rewrite = "cpu" in target.keys env = TracingEnvironment.current - if env is None: # in the final build mode - state = DispatchContext.current.query(tvm.target.Target.current(), key, has_complex_op, dag) + if env is None: + # in the final build mode + state = DispatchContext.current.query(target, key, has_complex_op, dag) if state is None: return None @@ -247,9 +271,74 @@ def auto_schedule_topi(outs, has_complex_op): env.add_workload_key(key, ccache_key) schedule = te.create_schedule([x.op for x in outs]) elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE: - # todo(merrymercy, minminsun): port layout rewrite - raise NotImplementedError + # in prepare_layout_rewrite mode + if enable_layout_rewrite and 
has_layout_free: + dispatch_ctx = DispatchContext.current + state = dispatch_ctx.query(target, key, has_complex_op, dag) + if state is None: + return None + + # rewrite the layout and update the context for the new dag + dag = ComputeDAG(outs) + new_dag = dag.rewrite_layout_from_state(state) + new_key = json.dumps((new_dag.hash_key(),)) + if new_key != key: + dispatch_ctx.update(target, new_key, state) + return te.create_schedule([x.op for x in outs]) else: raise ValueError("Invalid tracing mode: " + env.tracing_mode) return schedule + + +def tensor_no_check_call(self, *indices): + """An indexing function without any check. + This is the same as `tvm.te.Tensor::__call__` except that the safety + check is removed. + """ + indices = convert_to_object(indices) + args = [] + for x in indices: + if isinstance(x, _expr.PrimExpr): + args.append(x) + elif isinstance(x, _expr.IterVar): + args.append(x.var) + else: + raise ValueError("The indices must be expression") + + return _expr.ProducerLoad(self, args) + + +def remove_index_check(tensor): + """Remove the safety check in the indexing function for a tensor. + This is done by monkey patching its indexing function. + After removing the check, we are allowed to create a + temporary wrong IR and fix it later in other places. + + Parameters + ---------- + tensor: Tensor + The tensor to remove index check. + """ + # Monkey patch the indexing function + tensor.__call__ = tensor_no_check_call.__get__(tensor, Tensor) + + +def rewrite_compute_body(compute_tensor, new_layout): + """Rewrite the body of a ComputeOp according to a new layout of a placeholder""" + op = compute_tensor.op + + # Get layout free placeholders + layout_free_placeholders = op.attrs["layout_free_placeholders"] + assert len(layout_free_placeholders) == 1, "Only support one layout free placeholder" + placeholder_op = layout_free_placeholders[0].op + + # Rewrite the index expression in body + body = [] + for b in op.body: + body.append(_ffi_api.RewriteIndexForNewLayout(placeholder_op, new_layout, b)) + op_node = tvm.te._ffi_api.ComputeOp(op.name, op.tag, op.attrs, op.axis, body) + + num = op_node.num_outputs + outputs = tuple(op_node.output(i) for i in range(num)) + return outputs[0] if num == 1 else outputs diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 10c34ea8a72f..1092c308cf49 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -79,6 +79,8 @@ def compute_strided_set(attrs, inputs, output_type): # layout_transform _reg.register_injective_schedule("layout_transform") _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) +_reg.register_injective_schedule("auto_scheduler_layout_transform") +_reg.register_pattern("auto_scheduler_layout_transform", OpPattern.INJECTIVE) # argwhere @_reg.register_compute("argwhere") diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 756d5f0cd2ea..c289c65758d9 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -19,7 +19,7 @@ import logging import re -from tvm import topi +from tvm import topi, _ffi from tvm.topi.utils import get_const_int, get_const_float, get_const_tuple, get_float_tuple from tvm.target import generic_func, override_native_generic_func from .. 
import op as _op @@ -166,9 +166,17 @@ def schedule_bitpack(attrs, outs, target): return topi.generic.schedule_bitpack(outs) +get_auto_scheduler_rewritten_layout = _ffi.get_global_func( + "relay.attrs.get_auto_scheduler_rewritten_layout" +) + # conv2d def wrap_compute_conv2d( - topi_compute, need_data_layout=False, need_out_layout=False, has_groups=False + topi_compute, + need_data_layout=False, + need_out_layout=False, + has_groups=False, + need_auto_scheduler_layout=False, ): """Wrap conv2d topi compute""" @@ -179,6 +187,7 @@ def _compute_conv2d(attrs, inputs, out_type): data_layout = attrs.get_str("data_layout") out_layout = attrs.get_str("out_layout") out_dtype = attrs.out_dtype + auto_scheduler_rewritten_layout = get_auto_scheduler_rewritten_layout(attrs) out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype args = [inputs[0], inputs[1], strides, padding, dilation] if has_groups: @@ -188,6 +197,8 @@ def _compute_conv2d(attrs, inputs, out_type): if need_out_layout: args.append(out_layout) args.append(out_dtype) + if need_auto_scheduler_layout: + args.append(auto_scheduler_rewritten_layout) return [topi_compute(*args)] return _compute_conv2d diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 3f129c471faf..98b56ef4d1c0 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -117,9 +117,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": assert kernel_layout == "HWIO" - logger.warning("For x86 target, NCHW layout is recommended for conv2d.") strategy.add_implementation( - wrap_compute_conv2d(topi.nn.conv2d_nhwc), + wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), name="conv2d_nhwc.x86", ) diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py index 6294eab2cad9..bdf39544759b 100644 --- a/python/tvm/te/tensor.py +++ b/python/tvm/te/tensor.py @@ -40,7 +40,7 @@ def __getitem__(self, indices): def asobject(self): """Convert slice to object.""" - return self.tensor(*self.indices) + return self.tensor.__call__(*self.indices) @property def dtype(self): diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 7c9cef613439..8d591a20839a 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -20,7 +20,7 @@ from __future__ import absolute_import as _abs from collections import namedtuple import tvm -from tvm import te +from tvm import te, auto_scheduler from .pad import pad from .utils import get_pad_tuple @@ -331,7 +331,15 @@ def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None): return Output -def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype="float32"): +def conv2d_nhwc( + Input, + Filter, + stride, + padding, + dilation, + out_dtype="float32", + auto_scheduler_rewritten_layout="", +): """Convolution operator in NHWC layout. Parameters @@ -371,8 +379,30 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype="float32"): else: dilation_h, dilation_w = dilation + if auto_scheduler_rewritten_layout: + # Infer shape for the rewritten layout + # todo(merrymercy): wrap this with a more general interface. 
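+        # A filter with 10 or more dimensions means auto-scheduler split each
+        # of the four original axes into an (outer, inner) tile pair; the
+        # products below recover the original kernel_h / kernel_w / channel
+        # extents, and num_filter additionally folds in any leading axes.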
+ if len(Filter.shape) >= 10: + # For cpu tile structure SSRSRS + base = len(Filter.shape) - 10 + kernel_h = Filter.shape[2 + base] * Filter.shape[6 + base] + kernel_w = Filter.shape[3 + base] * Filter.shape[7 + base] + channel = Filter.shape[4 + base] * Filter.shape[8 + base] + num_filter = Filter.shape[5 + base] * Filter.shape[9 + base] + for i in range(base + 2): + num_filter *= Filter.shape[i] + elif len(Filter.shape) == 4: + num_filter, kernel_h, kernel_w, channel = Filter.shape + else: + raise ValueError( + "Don't know how to infer the layout for filter shape: %s. " + "Please add a new branch to handle this case." % str(Filter) + ) + auto_scheduler.remove_index_check(Filter) + else: + kernel_h, kernel_w, channel, num_filter = Filter.shape + batch, in_height, in_width, in_channel = Input.shape - kernel_h, kernel_w, channel, num_filter = Filter.shape # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 @@ -399,7 +429,12 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype="float32"): ), name="Conv2dOutput", tag="conv2d_nhwc", + attrs={"layout_free_placeholders": [Filter]}, ) + + if auto_scheduler_rewritten_layout: + Output = auto_scheduler.rewrite_compute_body(Output, auto_scheduler_rewritten_layout) + return Output diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc index caaed6f4d667..ca5997963520 100755 --- a/src/auto_scheduler/compute_dag.cc +++ b/src/auto_scheduler/compute_dag.cc @@ -42,6 +42,7 @@ #include #include "../arith/pattern_match.h" +#include "../relay/transforms/auto_scheduler_layout_rewrite.h" #include "search_policy/utils.h" #include "utils.h" @@ -813,8 +814,7 @@ std::string GetOrigLayout(std::set* placeholder_axis_names, const t ICHECK_EQ(placeholder_axis_names->size(), placeholder->shape.size()); std::string orig_layout = os.str(); os.str(""); - // TODO(minmin): uncomment this line for relay integration - // ::tvm::relay::KernelLayoutTransformer::global_orig_layouts_queue.push_back(orig_layout); + ::tvm::relay::AutoSchedulerLayoutRewriter::global_ori_layouts_queue.push_back(orig_layout); return orig_layout; } @@ -878,8 +878,7 @@ std::string GetNewLayout(const State& state, const int stage_id, const Stage& st } std::string new_layout = os.str(); os.str(""); - // TODO(minmin): uncomment this line for relay integration - // ::tvm::relay::KernelLayoutTransformer::global_new_layouts_queue.push_back(new_layout); + ::tvm::relay::AutoSchedulerLayoutRewriter::global_new_layouts_queue.push_back(new_layout); return new_layout; } @@ -1440,5 +1439,18 @@ TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGInferBoundFromState") return dag.InferBound(state); }); +TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGRewriteLayoutFromState") + .set_body_typed([](const ComputeDAG& dag, const State& state) { + Array* transform_steps = const_cast*>(&state->transform_steps); + return dag.RewriteLayout(transform_steps, LayoutRewriteOption::RewriteForPreTransformed); + }); + +TVM_REGISTER_GLOBAL("auto_scheduler.RewriteIndexForNewLayout") + .set_body_typed([](const te::Operation& placeholder_op, const std::string& new_layout, + const PrimExpr& body) { + IndexRewriter index_rewriter(placeholder_op, new_layout); + return index_rewriter.Rewrite(body); + }); + } // namespace auto_scheduler } // namespace tvm diff --git a/src/ir/transform.cc b/src/ir/transform.cc index 3b774462565e..f4516d5e57c5 100644 --- a/src/ir/transform.cc +++ b/src/ir/transform.cc @@ -74,6 +74,26 @@ PassContext 
PassContext::Current() { } } +// linearly scan the pass array to match pass_name +bool PassArrayContains(const Array& pass_array, const std::string& pass_name) { + for (auto x : pass_array) { + if (x == pass_name) return true; + } + return false; +} + +bool PassContext::PassEnabled(const PassInfo& info) const { + if (PassArrayContains(operator->()->disabled_pass, info->name)) { + return false; + } + + if (PassArrayContains(operator->()->required_pass, info->name)) { + return true; + } + + return operator->()->opt_level >= info->opt_level; +} + class PassConfigManager { public: void Register(std::string key, uint32_t value_type_index) { @@ -224,15 +244,6 @@ class SequentialNode : public PassNode { */ PassInfo Info() const override { return pass_info; } - /*! - * \brief Check if a pass is enabled. - * - * \param info The pass information. - * - * \return true if the pass is enabled. Otherwise, false. - */ - bool PassEnabled(const PassInfo& info) const; - /*! * \brief Resolve the pass dependency. It globs all required passes by * a given pass and executes them. @@ -344,29 +355,6 @@ void SequentialNode::ResolveDependency(const IRModule& mod) { << "\n"; } -// linearly scan the pass array to match pass_name -inline bool PassArrayContains(const Array& pass_array, - const std::string& pass_name) { - for (auto x : pass_array) { - if (x == pass_name) return true; - } - return false; -} - -bool SequentialNode::PassEnabled(const PassInfo& info) const { - PassContext ctx = PassContext::Current(); - - if (PassArrayContains(ctx->disabled_pass, info->name)) { - return false; - } - - if (PassArrayContains(ctx->required_pass, info->name)) { - return true; - } - - return ctx->opt_level >= info->opt_level; -} - Pass GetPass(const String& pass_name) { using tvm::runtime::Registry; const runtime::PackedFunc* f = nullptr; @@ -387,7 +375,7 @@ IRModule SequentialNode::operator()(IRModule mod, const PassContext& pass_ctx) c for (const Pass& pass : passes) { ICHECK(pass.defined()) << "Found undefined pass for optimization."; const PassInfo& pass_info = pass->Info(); - if (!PassEnabled(pass_info)) continue; + if (!pass_ctx.PassEnabled(pass_info)) continue; // resolve dependencies for (const auto& it : pass_info->required) { mod = GetPass(it)(std::move(mod), pass_ctx); diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 82ac1c57018e..a0828d1cac6c 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -338,7 +338,24 @@ class RelayBuildModule : public runtime::ModuleNode { // Fuse the operations if it is needed. relay_module = transform::FuseOps()(relay_module); + + // Do layout rewrite for auto-scheduler. + if (backend::IsAutoSchedulerEnabled() && targets.size() == 1) { + const auto& target = (*targets.begin()).second; + Pass major_pass = transform::AutoSchedulerLayoutRewrite(); + + if (target->kind->device_type == kDLCPU && pass_ctx.PassEnabled(major_pass->Info())) { + With tctx(target); + relay_module = major_pass(relay_module); + // Defuse ops to fold constants, then fuse them again + relay_module = transform::DefuseOps()(relay_module); + relay_module = transform::FoldConstant()(relay_module); + relay_module = transform::FuseOps()(relay_module); + } + } + relay_module = transform::InferType()(relay_module); + // Inline the functions that have been lifted by the module scope. 
// // TODO(@zhiics) Note that we need to be careful about the subgraphs with diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 1559d7edf35f..98d913662953 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -101,9 +101,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> explicit ScheduleGetter(Target target) : target_(target), device_copy_op_(Op::Get("device_copy")) { // Whether to use auto_scheduler schedule. - use_auto_scheduler_ = transform::PassContext::Current() - ->GetConfig("relay.backend.use_auto_scheduler", Bool(false)) - .value(); + use_auto_scheduler_ = backend::IsAutoSchedulerEnabled(); } CachedFunc Create(const Function& prim_func) { @@ -322,6 +320,17 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> const Op& device_copy_op_; }; +/*! + * \brief Create schedule for target. + * \param source_func The primitive function to be lowered. + * \param target The target we want to create schedule for. + * \return Pair of schedule and cache. + * The funcs field in cache is not yet populated. + */ +CachedFunc CreateSchedule(const Function& source_func, const Target& target) { + return ScheduleGetter(target).Create(source_func); +} + // Creates shape function from functor. class MakeShapeFunc : public backend::MemoizedExprTranslator> { public: @@ -680,17 +689,6 @@ class CompileEngineImpl : public CompileEngineNode { */ CCacheKey GetCurrentCCacheKey() { return cur_ccache_key_; } - /*! - * \brief Create schedule for target. - * \param source_func The primitive function to be lowered. - * \param target The target we want to create schedule for. - * \return Pair of schedule and cache. - * The funcs field in cache is not yet populated. - */ - CachedFunc CreateSchedule(const Function& source_func, const Target& target) { - return ScheduleGetter(target).Create(source_func); - } - private: // implement lowered func CCacheValue LowerInternal(const CCacheKey& key) { diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index 55822917b6b7..d7628e7a5bdf 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -241,6 +241,15 @@ class CompileEngine : public ObjectRef { TVM_DLL static CompileEngine& Global(); }; +/*! + * \brief Create schedule for target. + * \param source_func The primitive function to be lowered. + * \param target The target we want to create schedule for. + * \return Pair of schedule and cache. + * The funcs field in cache is not yet populated. + */ +CachedFunc CreateSchedule(const Function& source_func, const Target& target); + /*! * \brief Check if the type is dynamic. * \param ty The type to be checked. diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index ccb8611b7a3c..e1677205ffa1 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -294,6 +294,15 @@ inline std::string GetExtSymbol(const Function& func) { return std::string(name_node.value()); } +/*! + * \brief Return whether the auto scheduler is enabled in the pass context. 
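+ * This reads the `relay.backend.use_auto_scheduler` option, the same flag
+ * the auto-scheduler's relay integration sets on the PassContext during
+ * task extraction.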
+ */
+inline bool IsAutoSchedulerEnabled() {
+  return transform::PassContext::Current()
+      ->GetConfig<Bool>("relay.backend.use_auto_scheduler", Bool(false))
+      .value();
+}
+
 }  // namespace backend
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h
index 34bff0f5b858..d2fb6aa2b9c3 100644
--- a/src/relay/op/make_op.h
+++ b/src/relay/op/make_op.h
@@ -52,6 +52,8 @@ Expr MakeFull(Expr fill_value, Array<Integer> shape, DataType dtype);

 Expr MakeLayoutTransform(Expr data, String src_layout, String dst_layout);

+Expr MakeAutoSchedulerLayoutTransform(Expr data, String src_layout, String dst_layout);
+
 Expr MakeOnes(Array<Integer> shape, DataType dtype);

 Expr MakePad(Expr data, Array<Array<Integer>> pad_width, double pad_value, String pad_mode);
diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h
index f0112227153d..13e87a54b9d8 100644
--- a/src/relay/op/nn/convolution.h
+++ b/src/relay/op/nn/convolution.h
@@ -212,8 +212,16 @@ bool Conv2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     if (weight != nullptr) {
       weight_dtype = weight->dtype;
     }
-    // assign result to reporter
-    reporter->Assign(types[1], TensorType(wshape, weight_dtype));
+
+    if (param->auto_scheduler_rewritten_layout.size() == 0) {
+      // Normal case: assign result to reporter
+      reporter->Assign(types[1], TensorType(wshape, weight_dtype));
+    } else {
+      // If the layout is rewritten by auto-scheduler,
+      // we just forcibly apply the layout provided by auto-scheduler and
+      // skip the normal inference logic.
+      {}  // do nothing
+    }
   } else {
     // use weight to infer the conv shape.
     if (weight == nullptr) return false;
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 57cd9bd4118a..410738a6417d 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -2858,7 +2858,55 @@ the input array by output[n, c, h, w, C] = data[n, C*16+c, h, w]
     .set_support_level(5)
     .set_attr<FTVMCompute>("FTVMCompute", LayoutTransformCompute);

-/* relay._contrib_reverse_reshape */
+// relay.auto_scheduler_layout_transform
+TVM_REGISTER_NODE_TYPE(AutoSchedulerLayoutTransformAttrs);
+
+Array<te::Tensor> AutoSchedulerLayoutTransformCompute(const Attrs& attrs,
+                                                      const Array<te::Tensor>& inputs,
+                                                      const Type& out_type) {
+  const auto* param = attrs.as<AutoSchedulerLayoutTransformAttrs>();
+  CHECK(param != nullptr);
+  return Array<te::Tensor>{
+      topi::auto_scheduler_layout_transform(inputs[0], param->src_layout, param->dst_layout)};
+}
+
+bool AutoSchedulerLayoutTransformRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                                     const TypeReporter& reporter) {
+  const auto* data = types[0].as<TensorTypeNode>();
+  CHECK(data != nullptr);
+  const AutoSchedulerLayoutTransformAttrs* params = attrs.as<AutoSchedulerLayoutTransformAttrs>();
+
+  Array<PrimExpr> dst_shape;
+  std::vector<std::string> dst_axes;
+
+  topi::parse_auto_scheduler_layout(params->dst_layout, &dst_shape, &dst_axes);
+
+  reporter->Assign(types[1], TensorType(dst_shape, data->dtype));
+  return true;
+}
+
+Expr MakeAutoSchedulerLayoutTransform(Expr data, String src_layout, String dst_layout) {
+  auto attrs = make_object<AutoSchedulerLayoutTransformAttrs>();
+  attrs->src_layout = std::move(src_layout);
+  attrs->dst_layout = std::move(dst_layout);
+  static const Op& op = Op::Get("auto_scheduler_layout_transform");
+  return Call(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.auto_scheduler_layout_transform")
+    .set_body_typed(MakeAutoSchedulerLayoutTransform);
+
+RELAY_REGISTER_OP("auto_scheduler_layout_transform")
+    .describe(R"code(Transform the input kernel layout.
+)code" TVM_ADD_FILELINE)
+    .set_attrs_type<AutoSchedulerLayoutTransformAttrs>()
+    .set_num_inputs(1)
+    .add_argument("data", "Tensor", "The input tensor.")
+    .add_type_rel("auto_scheduler_layout_transform", AutoSchedulerLayoutTransformRel)
+    .set_support_level(5)
+    .set_attr<FTVMCompute>("FTVMCompute", AutoSchedulerLayoutTransformCompute);
+
+// relay._contrib_reverse_reshape
 Expr MakeReverseReshape(Expr data, Array<Integer> newshape) {
   auto attrs = make_object<ReshapeAttrs>();
   attrs->newshape = std::move(newshape);
diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
new file mode 100644
index 000000000000..c9875ef5d718
--- /dev/null
+++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file auto_scheduler_layout_rewrite.cc
+ * \brief Rewrite the layout of "layout free" tensors (e.g., the weight tensors in
+ * conv2d and dense layers) according to the tile structure generated by the auto-scheduler.
+ */
+
+#include "auto_scheduler_layout_rewrite.h"
+
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/transform.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/transform.h>
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "../backend/compile_engine.h"
+#include "pattern_utils.h"
+
+namespace tvm {
+namespace relay {
+
+// Two global variables for receiving layout information from python
+std::deque<std::string> AutoSchedulerLayoutRewriter::global_ori_layouts_queue;
+std::deque<std::string> AutoSchedulerLayoutRewriter::global_new_layouts_queue;
+
+// Copy an Attrs but with a new auto_scheduler_rewritten_layout field.
+template <typename T>
+Attrs CopyAttrsWithNewLayout(const T* ptr, const std::string& layout) {
+  auto n = make_object<T>(*ptr);
+  n->auto_scheduler_rewritten_layout = layout;
+  return Attrs(n);
+}
+
+// Mutate ops in a function
+class FuncMutator : public ExprMutator {
+ public:
+  FuncMutator(const std::deque<std::string>& ori_layouts_queue,
+              const std::deque<std::string>& new_layouts_queue)
+      : ExprMutator(),
+        ori_layouts_queue_(ori_layouts_queue),
+        new_layouts_queue_(new_layouts_queue) {}
+
+  Expr VisitExpr_(const CallNode* n) {
+    auto new_n = ExprMutator::VisitExpr_(n);
+
+    const auto* call = new_n.as<CallNode>();
+    if (call && call->op.as<OpNode>() &&
+        (std::find(target_ops_.begin(), target_ops_.end(), n->op.as<OpNode>()->name) !=
+         target_ops_.end()) &&
+        !ori_layouts_queue_.empty() && !new_layouts_queue_.empty()) {
+      // Pop a new layout from the queue
+      const std::string ori_layout = ori_layouts_queue_.front();
+      const std::string new_layout = new_layouts_queue_.front();
+      ori_layouts_queue_.pop_front();
+      new_layouts_queue_.pop_front();
+
+      // Insert a new op to do layout transform. (This will be simplified by FoldConstant later).
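+      // The kernel argument is normally a bound constant, so once
+      // FoldConstant runs, the rewritten weights are baked into the module
+      // and no transform op remains on the execution path.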
+      Expr updated_kernel = MakeAutoSchedulerLayoutTransform(call->args[1], ori_layout, new_layout);
+      Array<Expr> updated_args = {call->args[0], updated_kernel};
+
+      // Update the attrs
+      Attrs updated_attrs;
+      if (auto pattr = call->attrs.as<Conv2DAttrs>()) {
+        updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout);
+      }
+      new_n = Call(call->op, updated_args, updated_attrs);
+    }
+    return new_n;
+  }
+
+ private:
+  std::deque<std::string> ori_layouts_queue_;
+  std::deque<std::string> new_layouts_queue_;
+
+  std::vector<std::string> target_ops_{"nn.conv2d"};
+};
+
+Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) {
+  auto new_n = ExprMutator::VisitExpr_(n);
+
+  if (const auto* call = new_n.as<CallNode>()) {
+    if (const auto* func = call->op.as<FunctionNode>()) {
+      global_ori_layouts_queue.clear();
+      global_new_layouts_queue.clear();
+
+      // Use ScheduleGetter to call python lower functions.
+      // This is used to get the layout transform information.
+      // The layout transformation will be recorded to global_ori_layouts_queue
+      // and global_new_layouts_queue in ComputeDAG::RewriteLayout.
+      auto f = runtime::Registry::Get("auto_scheduler.enter_layout_rewrite");
+      CHECK(f) << "Could not find auto_scheduler.enter_layout_rewrite function.";
+      (*f)();
+
+      CreateSchedule(GetRef<Function>(func), Target::Current());
+
+      f = runtime::Registry::Get("auto_scheduler.exit_layout_rewrite");
+      CHECK(f) << "Could not find auto_scheduler.exit_layout_rewrite function.";
+      (*f)();
+
+      // Mutate the called function
+      if (!global_ori_layouts_queue.empty() && !global_new_layouts_queue.empty()) {
+        auto ret = FuncMutator(global_ori_layouts_queue, global_new_layouts_queue).VisitExpr(new_n);
+        return ret;
+      }
+    }
+  }
+
+  return new_n;
+}
+
+Expr AutoSchedulerLayoutRewrite(const Expr& expr) {
+  return AutoSchedulerLayoutRewriter().Mutate(expr);
+}
+
+namespace transform {
+
+Pass AutoSchedulerLayoutRewrite() {
+  runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
+      [=](Function f, IRModule m, PassContext pc) {
+        return Downcast<Function>(relay::AutoSchedulerLayoutRewrite(f));
+      };
+  return CreateFunctionPass(pass_func, 3, "AutoSchedulerLayoutRewrite", {"InferType"});
+}
+
+TVM_REGISTER_GLOBAL("relay._transform.AutoSchedulerLayoutRewrite")
+    .set_body_typed(AutoSchedulerLayoutRewrite);
+
+TVM_REGISTER_GLOBAL("relay.attrs.get_auto_scheduler_rewritten_layout")
+    .set_body_typed([](const Attrs& attrs) {
+      if (attrs->IsInstance<Conv2DAttrs>()) {
+        return attrs.as<Conv2DAttrs>()->auto_scheduler_rewritten_layout;
+      }
+      return std::string();
+    });
+
+}  // namespace transform
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.h b/src/relay/transforms/auto_scheduler_layout_rewrite.h
new file mode 100644
index 000000000000..d0d89db42e68
--- /dev/null
+++ b/src/relay/transforms/auto_scheduler_layout_rewrite.h
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file auto_scheduler_layout_rewrite.h
+ * \brief Rewrite the layout of "layout free" tensors (e.g., the weight tensors in
+ * conv2d and dense layers) according to the tile structure generated by the auto-scheduler.
+ */
+
+#ifndef TVM_RELAY_TRANSFORMS_AUTO_SCHEDULER_LAYOUT_REWRITE_H_
+#define TVM_RELAY_TRANSFORMS_AUTO_SCHEDULER_LAYOUT_REWRITE_H_
+
+#include <tvm/relay/expr_functor.h>
+
+#include <deque>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+class AutoSchedulerLayoutRewriter : public ExprMutator {
+ public:
+  Expr VisitExpr_(const CallNode* n) final;
+
+  // Two global variables for receiving layout information from python
+  static std::deque<std::string> global_ori_layouts_queue;
+  static std::deque<std::string> global_new_layouts_queue;
+};
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_TRANSFORMS_AUTO_SCHEDULER_LAYOUT_REWRITE_H_
diff --git a/tests/python/relay/test_auto_scheduler_layout_rewrite.py b/tests/python/relay/test_auto_scheduler_layout_rewrite.py
new file mode 100644
index 000000000000..299fcb8ebb2c
--- /dev/null
+++ b/tests/python/relay/test_auto_scheduler_layout_rewrite.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test layout rewrite support for whole neural networks""" +import tempfile + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +from tvm.contrib import graph_runtime +import tvm.testing + + +def get_np_array(var, dtype): + return np.random.randn(*[int(x) for x in var.type_annotation.shape]).astype(dtype) + + +def get_relay_conv2d( + outc=128, + inc=64, + height=14, + width=14, + kh=3, + kw=3, + batch=1, + pad=0, + stride=1, + dilation=1, + layout="NHWC", +): + dtype = "float32" + if layout == "NHWC": + kernel_layout = "HWIO" + d = relay.var("data", shape=(batch, height, width, inc), dtype=dtype) + w = relay.var("weight", shape=(kh, kw, inc, outc), dtype=dtype) + elif layout == "NCHW": + kernel_layout = "OIHW" + d = relay.var("data", shape=(batch, inc, height, width), dtype=dtype) + w = relay.var("weight", shape=(outc, inc, kh, kw), dtype=dtype) + + y = relay.nn.conv2d( + d, + w, + padding=pad, + kernel_size=(kh, kw), + strides=(stride, stride), + dilation=(dilation, dilation), + channels=outc, + groups=1, + data_layout=layout, + kernel_layout=kernel_layout, + ) + mod = tvm.IRModule() + mod["main"] = relay.Function([d, w], y) + data, weight = get_np_array(d, dtype), get_np_array(w, dtype) + return mod, data, weight + + +def tune_and_check(mod, data, weight): + # Extract tasks from a relay program + target = tvm.target.Target("llvm") + tasks, task_weights = auto_scheduler.extract_tasks(mod, target=target, params={}) + + with tempfile.NamedTemporaryFile() as fp: + log_file = fp.name + + # Tune tasks + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=1, + num_measures_per_round=1, + builder=auto_scheduler.LocalBuilder(timeout=60), + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + tuner.tune(tune_option, search_policy="sketch.random") + + # Compile and run + def compile_and_run(disabled_pass={}): + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + disabled_pass=disabled_pass, + ): + lib = relay.build(mod, target=target, params={"weight": weight}) + + ctx = tvm.cpu() + module = graph_runtime.GraphModule(lib["default"](ctx)) + module.set_input("data", data) + module.run() + + return module.get_output(0).asnumpy() + + # Check correctness + actual_output = compile_and_run() + expected_output = compile_and_run(disabled_pass={"AutoSchedulerLayoutRewrite"}) + + tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4) + + +def test_conv2d(): + mod, data, weight = get_relay_conv2d(kh=1, kw=1) + tune_and_check(mod, data, weight) + + +if __name__ == "__main__": + test_conv2d() From 19872062af1fad6b247636374732c885fabaed75 Mon Sep 17 00:00:00 2001 From: Ritwik Das Date: Wed, 2 Dec 2020 19:57:13 -0800 Subject: [PATCH 243/258] Fix trt Test (#7016) * Fix trt Test * Fixed stuff * Done * fix 0 * Trigger Build Co-authored-by: Ubuntu --- tests/python/contrib/test_tensorrt.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index de9822289528..aadfa1303655 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -1050,7 +1050,7 @@ def test_tensorrt_dynamic_batch(): batches_to_test = [1, 1, 0, 2, 3, 0, 1, 3, 2] x_shape = (relay.Any(), 1, 8, 8) x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32") - 
result_dict = {} + result_arr = [{} for _ in range(len(batches_to_test))] for use_trt in [True, False]: x = relay.var("x", shape=x_shape, dtype="float32") out = relay.nn.relu(x) @@ -1058,18 +1058,18 @@ def test_tensorrt_dynamic_batch(): mod = tvm.IRModule() mod["main"] = f if use_trt: - mod = relay.tensorrt.EnableTrt(mod) + mod, _ = tensorrt.partition_for_tensorrt(mod) if not skip_runtime_test(): with relay.build_config(opt_level=3): relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") for i, batch_size in enumerate(batches_to_test): - result_dict[(i, use_trt)] = relay_exec.evaluate()(x_data[:batch_size, ...]) + result_arr[i][use_trt] = relay_exec.evaluate()(x_data[:batch_size, ...]) if not skip_runtime_test(): for i in range(len(batches_to_test)): - assert_result_matches(result_dict[(i, True)], result_dict[(i, False)]) + assert_result_dict_holds(result_arr[i]) def test_tensorrt_dynamic_batch_conv(): @@ -1080,7 +1080,7 @@ def test_tensorrt_dynamic_batch_conv(): x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32") k_shape = (16, 32, 3, 3) params = {"kernel": np.random.uniform(-1, 1, k_shape).astype("float32")} - result_dict = {} + result_arr = [{} for _ in range(len(batches_to_test))] for use_trt in [True, False]: x = relay.var("x", shape=x_shape, dtype="float32") kernel = relay.var("kernel", shape=k_shape, dtype="float32") @@ -1089,20 +1089,18 @@ def test_tensorrt_dynamic_batch_conv(): mod = tvm.IRModule() mod["main"] = f if use_trt: - mod = tensorrt.partition_for_tensorrt(mod, params) + mod, _ = tensorrt.partition_for_tensorrt(mod, params) if not skip_runtime_test(): with relay.build_config(opt_level=3): relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") for i, batch_size in enumerate(batches_to_test): - result_dict[(i, use_trt)] = relay_exec.evaluate()( - x=x_data[:batch_size, ...], **params - ) + result_arr[i][use_trt] = relay_exec.evaluate()(x_data[:batch_size, ...], **params) if not skip_runtime_test(): for i in range(len(batches_to_test)): - assert_result_matches(result_dict[(i, True)], result_dict[(i, False)]) + assert_result_dict_holds(result_arr[i]) def test_maskrcnn_resnet50() -> None: From 8d703f08343006206ae348ce1c6b5e16630efd46 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 3 Dec 2020 06:17:26 -0800 Subject: [PATCH 244/258] [AutoScheduler] Add a tutorial on auto-scheduling a network for x86 CPU (#7019) * [AutoScheduler] Add tutorial on auto-scheduling a network for CPU * update * update * update * improve * improve * address comments * add help on layout conversion * add help for layout conversion * update target string * update cuda logs --- docs/dev/convert_layout.rst | 1 + python/tvm/auto_scheduler/measure.py | 4 + .../ci_logs/resnet-18-NHWC-B1-cuda.json | 26 ++ .../ci_logs/resnet-18-NHWC-B1.json | 26 -- .../ci_logs/resnet-50-NHWC-B1-llvm.json | 31 ++ tutorials/auto_scheduler/tune_network_cuda.py | 21 +- tutorials/auto_scheduler/tune_network_x86.py | 306 ++++++++++++++++++ 7 files changed, 379 insertions(+), 36 deletions(-) create mode 100644 tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json delete mode 100644 tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json create mode 100644 tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json create mode 100644 tutorials/auto_scheduler/tune_network_x86.py diff --git a/docs/dev/convert_layout.rst b/docs/dev/convert_layout.rst index 490df1372417..53038e9605e8 100644 --- a/docs/dev/convert_layout.rst +++ 
b/docs/dev/convert_layout.rst @@ -227,6 +227,7 @@ Second example is for a lightly-layout sensitive operator - batch normalization. ******** 4. Usage ******** +.. _convert-layout-usage: ConvertLayout pass is extremely easy to use. The pass is not a part of default relay.build pipeline. The intended usage is to call it between the framework-to-relay parser and relay.build module call. diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index b9d7148be784..b2826518d8c8 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -329,6 +329,10 @@ def __init__( cooldown_interval=0.0, enable_cpu_cache_flush=False, ): + if enable_cpu_cache_flush: + number = 1 + min_repeat_ms = 0 + self.__init_handle_by_constructor__( _ffi_api.LocalRunner, timeout, diff --git a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json new file mode 100644 index 000000000000..8d0a6ae980c4 --- /dev/null +++ b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json @@ -0,0 +1,26 @@ +# Provide valid schedules for resnet-18 on GPU. +# This is used to run the tutorial on the documentation web server. +{"i": [["[\"b32ed43fb351136894c322ee49097a1a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 4, 1, 1000, [40], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$512"], ["PR", 3, 0, "auto_unroll_max_step$512"]]]], "r": [[4.87396e-06], 0, 1.30575, 1606984701], "v": "v0.3"} +{"i": [["[\"d09dc1a6bb90d59c91b68989ad3492ff\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 32, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 32, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"]]]], "r": [[2.25155e-05], 0, 1.5128, 1606984719], "v": "v0.3"} +{"i": [["[\"7de313da0ca29a8c63f647791692430d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [2], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [32], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$64"]]]], "r": [[3.91068e-06], 0, 1.63708, 1606984742], "v": "v0.3"} +{"i": [["[\"8d5a93959138dc7b2ee1f1b3219dfa14\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [2], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, 
[1, 4, 1, 1], 1], ["SP", 6, 10, 16, [4, 1, 4, 1], 1], ["SP", 6, 15, 512, [2, 8, 1, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 2, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000190231], 0, 1.95863, 1606984773], "v": "v0.3"} +{"i": [["[\"ac6920940de3797cc3f9f9c260675e5d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 16, [4, 2, 2, 1], 1], ["SP", 6, 15, 512, [1, 16, 2, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000218188], 0, 2.05807, 1606984806], "v": "v0.3"} +{"i": [["[\"7e83a2ee5cd5d50282ed19310700046a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 
-thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [1], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 1, 8], 1], ["SP", 6, 15, 512, [1, 16, 1, 2], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [2], 1], ["SP", 4, 4, 512, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000165484], 0, 2.76154, 1606984831], "v": "v0.3"} +{"i": [["[\"424ba83160af31badc0b098136e1a3b0\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 49, [1, 1, 1, 7], 1], ["SP", 6, 15, 256, [1, 128, 1, 2], 1], ["SP", 6, 20, 256, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], 
["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000157488], 0, 2.05375, 1606984883], "v": "v0.3"} +{"i": [["[\"a169cd0053d3a7ca82998fcb62e42c58\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 7, 1], 1], ["SP", 6, 15, 256, [1, 32, 1, 2], 1], ["SP", 6, 20, 256, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.00011824], 0, 1.84964, 1606984912], "v": "v0.3"} +{"i": [["[\"0141ffc4fbabc10cc5a94c954419055b\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 1, 7, 1], 1], ["SP", 6, 15, 256, [4, 8, 1, 1], 1], ["SP", 6, 20, 256, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [49], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 
9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.67244e-05], 0, 1.93124, 1606984935], "v": "v0.3"} +{"i": [["[\"81aae4b8e2c076a4014d403e8a2c70a1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [2, 7, 1, 1], 1], ["SP", 3, 10, 14, [1, 7, 2, 1], 1], ["SP", 3, 15, 256, [2, 2, 1, 4], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [4, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 96, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 36, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[9.20105e-05], 0, 1.88263, 1606984952], "v": "v0.3"} +{"i": [["[\"c7a6b56bdc04b94c829fb2ef9874019e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 7], 1], ["SP", 6, 15, 128, [1, 4, 1, 16], 1], ["SP", 6, 20, 128, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 
3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000102747], 0, 2.2858, 1606984979], "v": "v0.3"} +{"i": [["[\"c035cc8b0568a8e054d06bd7f4950550\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 4], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [2, 8, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000133211], 0, 2.07337, 1606985017], "v": "v0.3"} +{"i": [["[\"c5ee3e05edd9754492d0763aa41fd025\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 2, 7, 1], 1], ["SP", 6, 15, 128, [1, 2, 2, 2], 1], ["SP", 6, 20, 128, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 
4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000150142], 0, 1.90539, 1606985042], "v": "v0.3"} +{"i": [["[\"022ebb6b7c55c5ed030421380ec83a04\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 2, 1], 1], ["SP", 3, 10, 28, [1, 7, 2, 2], 1], ["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 64, [4, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 360, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000101548], 0, 1.92449, 1606985059], "v": "v0.3"} +{"i": [["[\"de0df0893e01892cfe69f7bc2c24111f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 14, 1, 1], 1], ["SP", 6, 15, 64, [2, 2, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, 
[0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[5.64548e-05], 0, 3.15692, 1606985088], "v": "v0.3"} +{"i": [["[\"f2e3c09a00e7d0a9897f70497e089f1e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 2, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 4, 1], 1], ["SP", 6, 15, 64, [1, 8, 1, 4], 1], ["SP", 6, 20, 64, [1, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135574], 0, 2.88002, 1606985120], "v": "v0.3"} +{"i": [["[\"fa26946d7ac51126bfa859cb183f9ca1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 3], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 1, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 
3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 48, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 96, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000115802], 0, 4.06441, 1606985158], "v": "v0.3"} +{"i": [["[\"ba2026d923536b75e9b4faed89287d5f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$16"]]]], "r": [[2.00968e-05], 0, 1.53065, 1606985193], "v": "v0.3"} +{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 2, 7, 1], 1], ["SP", 3, 10, 112, [1, 7, 1, 1], 1], ["SP", 3, 15, 64, [1, 8, 4, 1], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 84, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 273, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[7.14326e-05], 0, 2.05623, 1606985220], "v": "v0.3"} +{"i": [["[\"bf78a7bf0209980f72953637dfd14a6f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [1, 2, 2, 2], 1], ["SP", 3, 10, 56, [1, 7, 1, 2], 1], ["SP", 3, 15, 64, [1, 16, 1, 4], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 256, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], 
"r": [[1.17113e-05], 0, 1.9863, 1606985239], "v": "v0.3"} +{"i": [["[\"6630936c26852f2b89dbfa2ff37fbb9c\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 1, 1], 1], ["SP", 3, 10, 28, [1, 2, 1, 7], 1], ["SP", 3, 15, 128, [8, 8, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [2], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[1.76965e-05], 0, 1.63284, 1606985253], "v": "v0.3"} +{"i": [["[\"ba5f918733ccbbd4a1d7fd3724665a2f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 1], 1], ["SP", 3, 10, 14, [2, 1, 7, 1], 1], ["SP", 3, 15, 256, [2, 64, 1, 2], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 52, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[3.05015e-05], 0, 1.59532, 1606985280], "v": "v0.3"} +{"i": [["[\"21ad409d72953de188314010134e3acd\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [4, 128, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 
4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 2704, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.18808e-05], 0, 1.88033, 1606985298], "v": "v0.3"}
+{"i": [["[\"1f6cd3637ec856bf5cf5010a623eed05\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [7, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 4, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 256, [8, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000190239], 0, 2.28266, 1606985323], "v": "v0.3"}
diff --git a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json
deleted file mode 100644
index 41b6c0e554ed..000000000000
--- a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json
+++ /dev/null
@@ -1,26 +0,0 @@
-# Provide valid schedules for resnet-18.
-# This is used to run the tutorial on the documentation web server.
-{"i": [["[\"b32ed43fb351136894c322ee49097a1a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 4, 1, 1000, [50], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$0"], ["PR", 3, 0, "auto_unroll_max_step$1024"]]]], "r": [[4.54041e-06], 0, 1.27943, 1605490839], "v": "v0.3"} -{"i": [["[\"d09dc1a6bb90d59c91b68989ad3492ff\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 4], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 4, [4], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 4, [2], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.03431e-05], 0, 2.09134, 1605490924], "v": "v0.3"} -{"i": [["[\"7de313da0ca29a8c63f647791692430d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [8], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[5.51259e-06], 0, 1.30207, 1605491060], "v": "v0.3"} -{"i": [["[\"944921d3fd999ba7aa9ffe5a592a9241\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [56], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$512"]]]], "r": [[2.24305e-05], 0, 1.60311, 1605493879], "v": "v0.3"} -{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [2, 1, 1, 8], 1], ["SP", 3, 10, 112, [1, 8, 1, 1], 1], ["SP", 3, 15, 64, [2, 16, 2, 1], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 294, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 441, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], 
"r": [[7.63468e-05], 0, 2.59544, 1605493932], "v": "v0.3"} -{"i": [["[\"bf78a7bf0209980f72953637dfd14a6f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [7, 4, 2, 1], 1], ["SP", 3, 10, 56, [1, 2, 2, 1], 1], ["SP", 3, 15, 64, [2, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [8, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 32, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 128, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.26775e-05], 0, 1.94247, 1605494103], "v": "v0.3"} -{"i": [["[\"6630936c26852f2b89dbfa2ff37fbb9c\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 1, 2], 1], ["SP", 3, 10, 28, [1, 1, 2, 1], 1], ["SP", 3, 15, 128, [1, 16, 1, 8], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 128, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.13004e-05], 0, 1.86312, 1605494224], "v": "v0.3"} -{"i": [["[\"ba5f918733ccbbd4a1d7fd3724665a2f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 2, 1], 1], ["SP", 3, 10, 14, [1, 14, 1, 1], 1], ["SP", 3, 15, 256, [1, 8, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 
6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 48, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.29425e-05], 0, 1.70493, 1605494303], "v": "v0.3"} -{"i": [["[\"21ad409d72953de188314010134e3acd\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [2, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 16, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.04683e-05], 0, 1.80217, 1605494406], "v": "v0.3"} -{"i": [["[\"022ebb6b7c55c5ed030421380ec83a04\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 1, 7], 1], ["SP", 3, 10, 28, [1, 4, 1, 1], 1], ["SP", 3, 15, 128, [1, 32, 2, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 64, [1, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 72, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 348, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[4.93528e-05], 0, 1.74125, 1605498773], "v": "v0.3"} -{"i": [["[\"ac6920940de3797cc3f9f9c260675e5d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [8], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 8, 1], 1], ["SP", 6, 15, 512, [1, 32, 2, 1], 1], ["SP", 6, 20, 512, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 
7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [49], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000129562], 0, 3.40317, 1605500470], "v": "v0.3"} -{"i": [["[\"1f6cd3637ec856bf5cf5010a623eed05\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 1, 7], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 16, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 256, [4, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 288, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 1440, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[7.57476e-05], 0, 2.59558, 1605501054], "v": "v0.3"} -{"i": [["[\"c5ee3e05edd9754492d0763aa41fd025\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 2, 1], 1], ["SP", 6, 10, 196, [4, 1, 1, 7], 1], ["SP", 6, 15, 128, [2, 32, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 128, [32], 
1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [49], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[6.77244e-05], 0, 2.67201, 1605501438], "v": "v0.3"} -{"i": [["[\"c035cc8b0568a8e054d06bd7f4950550\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 128, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 7, 1], 1], ["SP", 6, 15, 128, [8, 16, 1, 1], 1], ["SP", 6, 20, 128, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [16], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[6.23875e-05], 0, 1.93274, 1605501606], "v": "v0.3"} -{"i": [["[\"f2e3c09a00e7d0a9897f70497e089f1e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 2, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 4], 1], ["SP", 6, 15, 64, [2, 16, 1, 1], 1], ["SP", 6, 20, 64, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], 
["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[6.65448e-05], 0, 2.94376, 1605501803], "v": "v0.3"} -{"i": [["[\"81aae4b8e2c076a4014d403e8a2c70a1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 2], 1], ["SP", 3, 10, 14, [2, 7, 1, 1], 1], ["SP", 3, 15, 256, [1, 32, 2, 1], 1], ["SP", 3, 20, 3, [1, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 192, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 240, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[6.31245e-05], 0, 1.9322, 1605501903], "v": "v0.3"} -{"i": [["[\"7e83a2ee5cd5d50282ed19310700046a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [1, 2, 4, 2], 1], ["SP", 6, 15, 512, [2, 32, 1, 1], 1], ["SP", 6, 20, 512, [16, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [8], 1], ["SP", 4, 4, 512, 
[64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000143154], 0, 2.20107, 1605502293], "v": "v0.3"} -{"i": [["[\"424ba83160af31badc0b098136e1a3b0\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 49, 1, 1], 1], ["SP", 6, 15, 256, [8, 2, 2, 2], 1], ["SP", 6, 20, 256, [2, 16], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [1], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [64], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000115017], 0, 3.89122, 1605502608], "v": "v0.3"} -{"i": [["[\"c7a6b56bdc04b94c829fb2ef9874019e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [4], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [2, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 1, 2, 14], 1], ["SP", 6, 15, 128, [1, 32, 1, 2], 1], ["SP", 6, 20, 128, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 
18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [64], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[7.20936e-05], 0, 3.36582, 1605502968], "v": "v0.3"} -{"i": [["[\"0141ffc4fbabc10cc5a94c954419055b\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 49, 1, 1], 1], ["SP", 6, 15, 256, [8, 1, 2, 2], 1], ["SP", 6, 20, 256, [1, 32], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [2], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000122349], 0, 4.2774, 1605503135], "v": "v0.3"} -{"i": [["[\"a169cd0053d3a7ca82998fcb62e42c58\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], 
["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 1, 7], 1], ["SP", 6, 15, 256, [8, 4, 1, 1], 1], ["SP", 6, 20, 256, [1, 16], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 256, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[7.9277e-05], 0, 3.07064, 1605503350], "v": "v0.3"} -{"i": [["[\"fa26946d7ac51126bfa859cb183f9ca1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 1], 1], ["SP", 6, 5, 6, [1, 2, 1, 1], 1], ["SP", 6, 10, 196, [7, 7, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 64, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[7.64176e-05], 0, 
5.45091, 1605503568], "v": "v0.3"} -{"i": [["[\"de0df0893e01892cfe69f7bc2c24111f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 64, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [14, 7, 1, 2], 1], ["SP", 6, 15, 64, [1, 16, 1, 2], 1], ["SP", 6, 20, 64, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [64], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 4, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[7.60496e-05], 0, 3.00771, 1605503805], "v": "v0.3"} -{"i": [["[\"8d5a93959138dc7b2ee1f1b3219dfa14\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [1, 1, 4, 4], 1], ["SP", 6, 15, 512, [1, 64, 1, 1], 1], ["SP", 6, 20, 512, [1, 32], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [8], 1], ["SP", 4, 4, 512, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [16], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 
7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [16], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135079], 0, 2.40957, 1605504233], "v": "v0.3"} diff --git a/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json b/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json new file mode 100644 index 000000000000..611f7765f584 --- /dev/null +++ b/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json @@ -0,0 +1,31 @@ +# Provide valid schedules for resnet-50 for CPU. +# This is used to run the tutorial on the documentation web server. +{"i": [["[\"b32ed43fb351136894c322ee49097a1a\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.3"} +{"i": [["[\"6129df1a3d5f6326c8393a8d17160199\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 2, 0, 1, [1, 1, 1], 1], ["SP", 2, 4, 1000, [1, 1, 1], 1], ["SP", 2, 8, 16, [2, 2, 4], 1], ["SP", 2, 12, 128, [32], 1], ["RE", 2, [0, 4, 8, 1, 5, 9, 12, 2, 6, 10, 13, 3, 7, 11]], ["CR", 5], ["CA", 3, 5, 1], ["FU", 2, [0, 1]], ["AN", 2, 0, 3], ["FU", 5, [0, 1]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 2, 12, 2]]]], "r": [[8.7769e-05, 8.6467e-05, 8.6989e-05, 9.3901e-05, 8.6221e-05, 8.4351e-05, 8.4747e-05, 8.8687e-05, 8.8928e-05, 8.3574e-05], 0, 0.33759, 1606960890], "v": "v0.3"} +{"i": [["[\"36ee2798ed60bae3bcd1bb89a0285fe8\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.3"} +{"i": [["[\"dcf6fcf5f56fa614bf9aef0c82382caf\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 
0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.3"} +{"i": [["[\"7657f886f5e9d8b5f19a5fd2c5b90d8d\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.3"} +{"i": [["[\"7e09b626cf077cd419190fee02091dd6\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 0.384722, 1606961002], "v": "v0.3"} +{"i": [["[\"1dce2c5e4269b8a12dfc50cd4dd23ff1\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.3"} +{"i": [["[\"d3b36ce001dc24d693facfbdae1979b4\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.3"} +{"i": [["[\"a085717fb3dcb046e5c4c2c04d3dc541\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 
1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.3"} +{"i": [["[\"8dd7d81db440763f622f03fdc99e6d46\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.3"} +{"i": [["[\"ba2026d923536b75e9b4faed89287d5f\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.3"} +{"i": [["[\"0fb1dfcdb5b755e2dab290ed0129dcf2\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 2], 1], ["SP", 3, 12, 128, [2, 2, 16], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 128, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 3, 8], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.000224019, 0.000238271, 0.000237129, 0.000233981, 0.000223557, 0.000238411, 0.000238778, 0.000236382, 0.000236069, 0.000239037], 0, 0.285437, 1606961576], "v": "v0.3"} +{"i": [["[\"e043f834cc7f19597227e09dc7f59503\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 
3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.3"} +{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": [[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.3"} +{"i": [["[\"03614e726dc588d11887eb0953a77e53\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.3"} +{"i": [["[\"b51e06c1131d4cded40d1b215f722a4e\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.3"} +{"i": [["[\"a9e632e5167afb60fbe29e7aeef1d152\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], 
["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.3"} +{"i": [["[\"e0a9eb3795b531085e0ebb772e7e800c\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 0.000166094, 0.00016649, 0.000164394, 0.000169288, 0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.3"} +{"i": [["[\"8fcee68a4342c38248a827f1c6c69177\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.3"} +{"i": [["[\"4d7e646d99bfa3cea8245bd7100369cb\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.3"} +{"i": [["[\"b2010aa63c95dedf1f58f3fe8bc78634\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 
16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 1606961801], "v": "v0.3"} +{"i": [["[\"537c8642716948c33a6eaaabc86b159d\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.3"} +{"i": [["[\"7e3f0cf5a6dd80d36dab1a3dad92674a\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.3"} +{"i": [["[\"cd7c4a374fb2bbc0d075c8cae638ad14\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.3"} +{"i": [["[\"45b4de07687dee43ee1cbde9f516b2bf\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 
3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.3"} +{"i": [["[\"95bf49cc8cf7a351e974b2359702aac0\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 2, 1], 1], ["SP", 3, 8, 14, [1, 7, 1], 1], ["SP", 3, 12, 256, [2, 1, 8], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000230538, 0.000229192, 0.000235935, 0.000233141, 0.000233405, 0.000233217, 0.000225995, 0.000231786, 0.000229054, 0.00022851], 0, 0.256995, 1606961941], "v": "v0.3"} +{"i": [["[\"5e3ceb6e23ae8c351d5a1770d5fc6c7c\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.3"} +{"i": [["[\"691feef049c8693bbe91bd5e7c9cdf34\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.3"} +{"i": [["[\"45acfc473c772458684f36a34549d8aa\"]", "llvm 
-keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.3"} diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 8f8cf7f1e99a..90f531f4f52e 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -37,7 +37,7 @@ Different from the template-based :ref:`autotvm ` which relies on manual templates to define the search space, the auto-scheduler does not require any schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` while does not use existing schedule templates. +in :code:`tvm/python/topi` and does not use existing schedule templates. Note that this tutorial will not run on Windows or recent versions of macOS. To get it to run, you will need to wrap the body of this tutorial in a :code:`if @@ -59,10 +59,11 @@ # We can also load models from MXNet, ONNX, PyTorch, and TensorFlow # (see :ref:`front end tutorials`). # -# Note that although auto-scheduler can work with any layouts, -# we found that the best performance is typically archived with NHWC layout -# for convolutional neural networks, so we use NHWC layout in this tutorial. -# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. def get_network(name, batch_size, layout="NHWC", dtype="float32"): @@ -135,7 +136,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): layout = "NHWC" target = tvm.target.Target("cuda") dtype = "float32" -log_file = "%s-%s-B%d.json" % (network, layout, batch_size) +log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) ################################################################# # Extract Search Tasks @@ -170,11 +171,11 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # Typically, we recommend a value >= 300 ms. # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. # You can set it to a small number (e.g., 200) for a fast demonstrative run. -# In practice, we recommend setting it around :code:`1000 * len(tasks)`, +# In practice, we recommend setting it around :code:`900 * len(tasks)`, # which is typically enough for the search to converge. -# For example, there are 21 tasks in resnet-18, so we can set it as 20000. 
+# For example, there are 24 tasks in resnet-18, so we can set it as 20000.
 # You can adjust this parameter according to your time budget.
-# * In addition, we use :code:`RecordToFile` to dump measurement records into the log file,
+# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file.
 #   The measurement records can be used to query the history best, resume the search,
 #   and do more analyses later.
 # * see :any:`auto_scheduler.TuningOptions`,
@@ -294,7 +295,7 @@ def run_tuning():

 #################################################################
 # Other Tips
-# --------------------
+# ----------
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract features from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py
new file mode 100644
index 000000000000..8dd9230c5cce
--- /dev/null
+++ b/tutorials/auto_scheduler/tune_network_x86.py
@@ -0,0 +1,306 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-scheduling a Neural Network for x86 CPU
+============================================
+**Author**: `Lianmin Zheng `_
+
+Auto-tuning for specific devices and workloads is critical for getting the
+best performance. This is a tutorial on how to tune a whole neural
+network for x86 CPU with the auto-scheduler.
+
+To auto-tune a neural network, we partition the network into small subgraphs and
+tune them independently. Each subgraph is treated as one search task.
+A task scheduler slices the time and dynamically allocates time resources to
+these tasks. The task scheduler predicts the impact of each task on the end-to-end
+execution time and prioritizes the one that can reduce the execution time the most.
+
+For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to
+get the computational DAG in the tensor expression form.
+We then use the auto-scheduler to construct a search space of this DAG and search
+for good schedules (low-level optimizations).
+
+Different from the template-based :ref:`autotvm ` which relies on
+manual templates to define the search space, the auto-scheduler does not require any
+schedule templates. In other words, the auto-scheduler only uses the compute declarations
+in :code:`tvm/python/topi` and does not use existing schedule templates.
+
+Note that this tutorial will not run on Windows or recent versions of macOS. To
+get it to run, you will need to wrap the body of this tutorial in a :code:`if
+__name__ == "__main__":` block.
+""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_runtime + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. + + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet50_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target. +# If the target machine supports avx512 instructions, replace the +# "llvm -mcpu=core-avx2" with "llvm -mcpu=skylake-avx512" +network = "resnet-50" +batch_size = 1 +layout = "NHWC" +target = tvm.target.Target("llvm -mcpu=core-avx2") +dtype = "float32" +log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. 
+# The weight of a task is the number of appearances of the task's subgraph
+# in the whole network.
+# By using the weight, we can approximate the end-to-end latency of the network
+# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the
+# latency of a task and :code:`weight[t]` is the weight of the task.
+# The task scheduler will just optimize this objective.
+
+# Extract tasks from the network
+print("Extract tasks...")
+mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
+tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
+
+for idx, task in enumerate(tasks):
+    print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
+    print(task.compute_dag)
+
+#################################################################
+# Begin Tuning
+# ------------
+# Now, we set some options for tuning and launch the search tasks.
+#
+# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning.
+#   You can set it to a small number (e.g., 200) for a fast demonstrative run.
+#   In practice, we recommend setting it around :code:`800 * len(tasks)`,
+#   which is typically enough for the search to converge.
+#   For example, there are 29 tasks in resnet-50, so we can set it as 20000.
+#   You can adjust this parameter according to your time budget.
+# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file.
+#   The measurement records can be used to query the history best, resume the search,
+#   and do more analyses later.
+# * see :any:`auto_scheduler.TuningOptions`,
+#   :any:`auto_scheduler.LocalRunner` for more parameters.
+#
+
+
+def run_tuning():
+    print("Begin tuning...")
+    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
+    tune_option = auto_scheduler.TuningOptions(
+        num_measure_trials=200,  # change this to 20000 to achieve the best performance
+        runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
+        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
+    )
+
+    tuner.tune(tune_option)
+
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# run_tuning()
+
+
+######################################################################
+# .. note:: Explain the printed information during tuning
+#
+#   During the tuning, a lot of information will be printed on the console.
+#   They are used for debugging purposes. The most important info is the output
+#   of the task scheduler. The following table is a sample output.
+#
+# .. code-block:: c
+#
+# ----------------------------------------------------------------------
+# ------------------------------  [ Task Scheduler ]
+# ----------------------------------------------------------------------
+# |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
+# -------------------------------------------------
+# |    0 |        0.010 |           0.40 |     64 |
+# |    1 |        0.087 |          47.19 |     64 |
+# |    2 |        0.008 |          -0.00 |     64 |
+# |    3 |        0.177 |         582.07 |     64 |
+# |    4 |        0.268 |         862.37 |    256 |
+# |    5 |        0.166 |         621.13 |    128 |
+# |    6 |        0.170 |         605.10 |    128 |
+# |    7 |        0.128 |         403.20 |     64 |
+# |    8 |        0.189 |         545.71 |     64 |
+# |    9 |        0.231 |        1001.01 |    448 |
+# |   10 |        0.155 |         664.80 |    256 |
+# |   11 |        0.155 |         662.86 |    256 |
+# |   12 |        0.119 |         434.08 |     64 |
+# |   13 |        0.199 |         522.13 |     64 |
+# |   14 |        0.235 |         986.56 |    320 |
+# |   15 |        0.149 |         689.13 |    128 |
+# |   16 |        0.155 |         664.80 |    192 |
+# |   17 |        0.151 |         340.64 |     64 |
+# |   18 |        0.176 |         597.55 |    128 |
+# |   19 |        0.220 |        1054.37 |    192 |
+# |   20 |        0.150 |         686.01 |    128 |
+# |   21 |        0.159 |         650.88 |    128 |
+# |   22 |        0.073 |         358.19 |     64 |
+# |   23 |        0.031 |          70.63 |     64 |
+# |   24 |        0.251 |         947.73 |    128 |
+# |   25 |        0.157 |         652.47 |    128 |
+# |   26 |        0.215 |         954.84 |    128 |
+# |   27 |        0.237 |         868.92 |    128 |
+# |   28 |        0.266 |         774.06 |    128 |
+# -------------------------------------------------
+# Estimated total latency: 10.016 ms Trials: 3992 Used time : 1131 s Next ID: 15
+#
+# This table lists the latency and (estimated) speed of all tasks.
+# It also lists the allocation of measurement trials for all tasks.
+# The last line prints the total weighted latency of these tasks,
+# which can be a rough estimation of the end-to-end execution time
+# of the network.
+# The last line also prints the total number of measurement trials,
+# the total time spent on auto-tuning, and the id of the next task to tune.
+#
+# There will also be some "dmlc::Error" errors, because the
+# auto-scheduler will try some invalid schedules.
+# You can safely ignore them if the tuning can continue, because these
+# errors are isolated from the main process.
+#

+######################################################################
+# .. note:: Terminate the tuning earlier
+#
+#   You can terminate the tuning earlier by forcibly killing this process.
+#   As long as you get at least one valid schedule for each task in the log file,
+#   you should be able to do the compilation (the section below).
+#

+
+#################################################################
+# Compile and Evaluate
+# --------------------
+# After auto-tuning, we can compile the network with the best schedules we found.
+# All measurement records are dumped into the log file during auto-tuning,
+# so we can read the log file and load the best schedules.
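For reference, the log file can also be inspected directly before compiling. The following is a minimal sketch (not part of the tutorial file), assuming :code:`auto_scheduler.load_records` yields ``(MeasureInput, MeasureResult)`` pairs and that measured costs are reported in seconds:

    import numpy as np
    from tvm import auto_scheduler

    def summarize_log(log_file):
        # Track the best (lowest) mean measured cost per workload key;
        # failed measurements carry a nonzero error_no and are skipped.
        best = {}
        for inp, res in auto_scheduler.load_records(log_file):
            if res.error_no != 0:
                continue
            cost = np.mean([c.value for c in res.costs])
            key = inp.task.workload_key
            best[key] = min(best.get(key, float("inf")), cost)
        for key, cost in best.items():
            print("%s: %.3f ms" % (key[:40], cost * 1e3))

    summarize_log(log_file)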
+
+# Compile with the history best
+print("Compile...")
+with auto_scheduler.ApplyHistoryBest(log_file):
+    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
+        lib = relay.build(mod, target=target, params=params)
+
+# Create graph runtime
+ctx = tvm.context(str(target), 0)
+module = graph_runtime.GraphModule(lib["default"](ctx))
+data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+module.set_input("data", data_tvm)
+
+# Evaluate
+print("Evaluate inference time cost...")
+ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500)
+prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
+print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
+
+
+#################################################################
+# Other Tips
+# ----------
+# 1. During the tuning, the auto-scheduler needs to compile many programs and
+#    extract features from them. This part is CPU-intensive,
+#    so a high-performance CPU with many cores is recommended for faster search.
+# 2. If you have multiple target CPUs, you can use all of them for measurements to
+#    parallelize the measurements. Check this :ref:`section `
+#    to learn how to use the RPC Tracker and RPC Server.
+#    To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions`
+#    with :any:`auto_scheduler.RPCRunner`.
+#

From ea7c167b270b8daf8ea94d9acb12a4e90f408259 Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Thu, 3 Dec 2020 06:39:06 -0800
Subject: [PATCH 245/258] [auto_scheduler] metal default hardware params (#7022)

---
 src/auto_scheduler/search_task.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index 0b85a03f0671..bd09a70c0655 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -71,6 +71,17 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
     p_hardware_params->max_vthread_extent = p_hardware_params->warp_size / 4;
+    return hardware_params;
+  } else if (target->kind->device_type == kDLMetal) {
+    // Reference: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+    // This setting appears to work for Metal GPUs later than A10
+    auto hardware_params = HardwareParams(-1, 16, 64);
+    auto* p_hardware_params = hardware_params.CopyOnWrite();
+    p_hardware_params->max_shared_memory_per_block = 32 * 1024;
+    p_hardware_params->max_registers_per_block = 4 * 1024;
+    p_hardware_params->max_threads_per_block = 1024;
+    p_hardware_params->warp_size = 8;
+    p_hardware_params->max_vthread_extent = p_hardware_params->warp_size / 4;
     return hardware_params;
   } else {
     LOG(FATAL) << "No default hardware parameters for target: " << target;

From 578294412d4428c36d89b5f176eccd4cced7d74f Mon Sep 17 00:00:00 2001
From: Jared Roesch
Date: Thu, 3 Dec 2020 10:32:37 -0800
Subject: [PATCH 246/258] [Diagnostics] Add environment variable for
 controlling top-level printing and fix issue with pretty printing/parsing
 roundtrip.
 (#6874)

* Update Parser in order to handle the NMS code

* Add support for displaying traces optionally

* WIP

* Fix

* Fix error reporting in parser and clean up __init__.py due to CR

* Format

* Quick fix for If

* Fix format

* Fix lint
---
 python/tvm/__init__.py               | 21 +++++--
 src/parser/parser.cc                 | 91 +++++++++++++++++++---------
 tests/python/relay/test_ir_parser.py | 14 +++++
 3 files changed, 95 insertions(+), 31 deletions(-)

diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 569e8f042486..c2b4fdb2d00e 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -68,15 +68,28 @@
 from .contrib import rocm as _rocm, nvcc as _nvcc, sdaccel as _sdaccel


+def _should_print_backtrace():
+    in_pytest = "PYTEST_CURRENT_TEST" in os.environ
+    tvm_backtrace = os.environ.get("TVM_BACKTRACE", "0")
+
+    try:
+        tvm_backtrace = bool(int(tvm_backtrace))
+    except ValueError:
+        raise ValueError(
+            f"invalid value for TVM_BACKTRACE `{tvm_backtrace}`, please set to 0 or 1."
+        )
+
+    return in_pytest or tvm_backtrace
+
+
 def tvm_wrap_excepthook(exception_hook):
     """Wrap given excepthook with TVM additional work."""

     def wrapper(exctype, value, trbk):
         """Clean subprocesses when TVM is interrupted."""
-        in_pytest = "PYTEST_CURRENT_TEST" in os.environ
-
-        if exctype is error.DiagnosticError and not in_pytest:
-            pass
+        if exctype is error.DiagnosticError and not _should_print_backtrace():
+            # TODO(@jroesch): consider moving to C++?
+            print("note: run with `TVM_BACKTRACE=1` environment variable to display a backtrace.")
         else:
             exception_hook(exctype, value, trbk)

diff --git a/src/parser/parser.cc b/src/parser/parser.cc
index 987a6e20ec38..afcf70737933 100644
--- a/src/parser/parser.cc
+++ b/src/parser/parser.cc
@@ -605,30 +605,43 @@ class Parser {
     return ast;
   }

+  struct MetaRef {
+    std::string type_key;
+    uint64_t node_index;
+    Span span;
+
+    MetaRef(std::string type_key, uint64_t node_index, Span span)
+        : type_key(type_key), node_index(node_index), span(span) {}
+  };
+
+  MetaRef MetaRefFromToken(const Token& tok) {
+    Call ref = Downcast<Call>(tok->data);
+    auto attrs = ref->attrs.as<MetaRefAttrs>();
+    auto type_key = attrs->node_type_key;
+    auto index = attrs->node_index;
+    return MetaRef(type_key, index, ref->span);
+  }
+
   /*! \brief Parse a meta reference of the form `meta[type_key][node_index]`.
    * For example `meta[relay.Constant][0]` references the first constant, `meta[relay.Constant][1]`
    * the second, and so on.
    */
   ObjectRef ParseMetaRef() {
-    auto meta_ref = Match(TokenType::kMetaReference);
-    Call ref = Downcast<Call>(meta_ref->data);
-    auto attrs = ref->attrs.as<MetaRefAttrs>();
-    auto type_key = attrs->node_type_key;
-    auto index = attrs->node_index;
-    auto it = this->meta_table.find(type_key);
+    auto meta_ref_tok = Match(TokenType::kMetaReference);
+    auto meta_ref = MetaRefFromToken(meta_ref_tok);
+    auto it = this->meta_table.find(meta_ref.type_key);
     if (it != this->meta_table.end()) {
       auto nodes = (*it).second;
-      if (index < nodes.size()) {
-        return nodes[index];
+      if (meta_ref.node_index < nodes.size()) {
+        return nodes[meta_ref.node_index];
       } else {
-        this->diag_ctx.Emit(Diagnostic::Error(meta_ref->span)
-                            << "the node index `" << index << "` is out of bounds for `" << type_key
-                            << "`");
+        this->diag_ctx.Emit(Diagnostic::Error(meta_ref.span)
+                            << "the node index `" << meta_ref.node_index
+                            << "` is out of bounds for `" << meta_ref.type_key << "`");
         return ObjectRef();
       }
     } else {
-      this->diag_ctx.Emit(Diagnostic::Error(meta_ref->span)
-                          << "no entry in the meta table for `" << type_key << "`");
+      this->diag_ctx.Emit(Diagnostic::Error(meta_ref.span)
+                          << "no entry in the meta table for `" << meta_ref.type_key << "`");
       return ObjectRef();
     }
   }
@@ -922,10 +935,7 @@ class Parser {
           exprs.push_back(ParseMatch(is_total));
           break;
         }
-        case TokenType::kIf: {
-          exprs.push_back(ParseIf());
-          break;
-        }
+
        // %x ...
         case TokenType::kGraph:
           if (Lookahead(2)->token_type == TokenType::kEqual) {
@@ -1344,6 +1354,10 @@
           Match(TokenType::kIdentifier);
           return ObjectRef();
         }
+        if (id == "None") {
+          Match(TokenType::kIdentifier);
+          return Optional<ObjectRef>();
+        }
       }
     }
     default:
@@ -1372,7 +1386,7 @@
     ICHECK(op.defined()) << "the operator must be defined";

     DLOG(INFO) << "Parser::ParseCallArgs";
-    Map<String, ObjectRef> raw_attrs;
+    Attrs attrs;
     std::string op_key;
     bool is_op = false;
@@ -1388,21 +1402,40 @@
         [&] {
           auto is_ident = Lookahead(1)->token_type == TokenType::kIdentifier;
           auto next_is_equal = Lookahead(2)->token_type == TokenType::kEqual;
-
-          if (is_op && is_ident && next_is_equal) {
-            raw_attrs = ParseAttrs();
+          auto is_pretty_attrs = is_ident && next_is_equal;
+          auto is_meta_next = Lookahead(1)->token_type == TokenType::kMetaReference;
+          // TODO(@jroesch): might not handle trailing comma
+          auto last_meta = Lookahead(2)->token_type == TokenType::kCloseParen;
+          auto is_meta_attrs = is_meta_next && last_meta;
+
+          if (is_op && (is_pretty_attrs || is_meta_attrs)) {
+            if (is_meta_attrs) {
+              auto meta_ref = ParseMetaRef();
+              if (meta_ref.as<BaseAttrsNode>()) {
+                attrs = Downcast<Attrs>(meta_ref);
+              } else {
+                // Not awesome parsing code here.
+                this->pos--;
+                return false;
+              }
+            } else {
+              auto raw_attrs = ParseAttrs();
+              auto attr_obj = tvm::ReflectionVTable::Global()->CreateObject(op_key, raw_attrs);
+              ICHECK(attr_obj.defined());
+              attrs = Downcast<Attrs>(attr_obj);
+            }
             return true;
           }

           return false;
         });

-    Attrs attrs;
-
-    if (is_op && op_key.size()) {
-      auto attr_obj = tvm::ReflectionVTable::Global()->CreateObject(op_key, raw_attrs);
-      ICHECK(attr_obj.defined());
-      attrs = Downcast<Attrs>(attr_obj);
+    if (!attrs.defined()) {
+      if (is_op && op_key.size()) {
+        auto attr_obj = tvm::ReflectionVTable::Global()->CreateObject(op_key, {});
+        ICHECK(attr_obj.defined());
+        attrs = Downcast<Attrs>(attr_obj);
+      }
     }

    // TODO(@jroesch): in a secondary pass adjust spans.
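For context, this is what a meta reference looks like in the Relay text format that the code above parses, written test-style as in the parser tests (a hypothetical fragment; without a metadata section attached, parsing it would trigger the "no entry in the meta table" diagnostic shown above):

    program = """
    def @main(%x: Tensor[(1, 4), float32]) {
        add(%x, meta[relay.Constant][0])
    }
    """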
@@ -1527,6 +1560,10 @@ class Parser { ICHECK(e->span.defined()) << "function spans must be defined.\n" << e; return e; } + case TokenType::kIf: { + Expr e = ParseIf(); + return e; + } case TokenType::kRef: { Consume(TokenType::kRef); Match(TokenType::kOpenParen); diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py index c5217ba41bfd..162271756557 100644 --- a/tests/python/relay/test_ir_parser.py +++ b/tests/python/relay/test_ir_parser.py @@ -875,6 +875,20 @@ def @example() { parse_module(program) +def test_parse_if_in_binding(): + program = """ + def @example(%b: bool) { + %0 = if (%b) { + 1 + } else { + 0 + }; + %0 + } + """ + parse_module(program) + + def test_op_string_attr(): call = parse_text( """ From 7b0e7d30cc4b6b44f36f232f28c7c466f7889e9b Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Thu, 3 Dec 2020 21:03:45 +0000 Subject: [PATCH 247/258] Update dmlc_tvm_commit_id.txt --- dmlc_tvm_commit_id.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc_tvm_commit_id.txt b/dmlc_tvm_commit_id.txt index bce5cf01ffce..e46bbee4e022 100644 --- a/dmlc_tvm_commit_id.txt +++ b/dmlc_tvm_commit_id.txt @@ -1 +1 @@ -636739af8d14c864d263a55323acc6c530497588 \ No newline at end of file +8daa97ec87118ecdf38453ca878655cb08fba329 \ No newline at end of file From 5cad7d162a866013b43f346f42b53e79950d18f2 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Thu, 3 Dec 2020 21:13:49 +0000 Subject: [PATCH 248/258] Temporarily disable arm build/test in CI --- Jenkinsfile | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index b9e191d5c80e..feea8c2f9489 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -215,16 +215,16 @@ stage('Build') { } } }, - 'BUILD : arm': { - node('ARM') { - ws(per_exec_ws("tvm/build-arm")) { - init_git() - sh "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh" - make(ci_arm, 'build', '-j4') - pack_lib('arm', tvm_multilib) - } - } - }, + // 'BUILD : arm': { + // node('ARM') { + // ws(per_exec_ws("tvm/build-arm")) { + // init_git() + // sh "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh" + // make(ci_arm, 'build', '-j4') + // pack_lib('arm', tvm_multilib) + // } + // } + // }, 'BUILD: QEMU': { node('CPU') { ws(per_exec_ws("tvm/build-qemu")) { @@ -269,19 +269,19 @@ stage('Unit Test') { } } }, - 'python3: arm': { - node('ARM') { - ws(per_exec_ws("tvm/ut-python-arm")) { - init_git() - unpack_lib('arm', tvm_multilib) - timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_arm} ./tests/scripts/task_ci_python_setup.sh" - sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_unittest.sh" - // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh" - } - } - } - }, + // 'python3: arm': { + // node('ARM') { + // ws(per_exec_ws("tvm/ut-python-arm")) { + // init_git() + // unpack_lib('arm', tvm_multilib) + // timeout(time: max_time, unit: 'MINUTES') { + // sh "${docker_run} ${ci_arm} ./tests/scripts/task_ci_python_setup.sh" + // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_unittest.sh" + // // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh" + // } + // } + // } + // }, 'java: GPU': { node('GPU') { ws(per_exec_ws("tvm/ut-java")) { From d6acdd0740f3e5bc2efbbafac0b36cd93ce289f9 Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Fri, 4 Dec 2020 00:12:48 +0300 Subject: [PATCH 249/258] [RPC] Prefer IPv4 between IPv4 and IPv6 (#7013) This change fix problem with 
IP protocol version selection on macOS. Previously, `rpc_tracker` and
`query_rpc_tracker` were not able to connect to each other with the default
hostnames. The root cause was the method `socket.getaddrinfo`. In
`rpc_tracker` the default hostname is "0.0.0.0" and `getaddrinfo` returns an
IPv4 address. In `query_rpc_tracker` the default hostname is "localhost" and
`getaddrinfo` on macOS returns an IPv6 address. Note: on Linux both resolve
to IPv4. The two tools therefore spoke different protocols, which is why
`query_rpc_tracker` was not able to connect to `rpc_tracker`. Now we prefer
IPv4, so both `rpc_tracker` and `query_rpc_tracker` use the same protocol
version.
---
 python/tvm/rpc/base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/tvm/rpc/base.py b/python/tvm/rpc/base.py
index b2bfa3b53416..1be904524ef6 100644
--- a/python/tvm/rpc/base.py
+++ b/python/tvm/rpc/base.py
@@ -60,6 +60,9 @@ class TrackerCode(object):
 def get_addr_family(addr):
     res = socket.getaddrinfo(addr[0], addr[1], 0, 0, socket.IPPROTO_TCP)
+    for info in res:
+        if info[0] == socket.AF_INET:
+            return info[0]
     return res[0][0]

From 70696629d2bc5d84b62628621a97f6f8a824564c Mon Sep 17 00:00:00 2001
From: Altan Haan
Date: Thu, 3 Dec 2020 14:12:54 -0800
Subject: [PATCH 250/258] [CI] Hotfix CI (see #7010) (#7025)

---
 python/tvm/relay/testing/__init__.py      | 12 +++++-
 tests/python/relay/test_op_grad_level2.py | 51 +----------------------
 2 files changed, 11 insertions(+), 52 deletions(-)

diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 93110e313642..0b81cb9c7ec6 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -72,7 +72,15 @@ def _np_randn_from_type(t, scale=1, mean=0):


 def check_grad(
-    func, inputs=None, test_inputs=None, eps=1e-6, atol=1e-5, rtol=1e-3, scale=None, mean=0
+    func,
+    inputs=None,
+    test_inputs=None,
+    eps=1e-6,
+    atol=1e-5,
+    rtol=1e-3,
+    scale=None,
+    mean=0,
+    mode="higher_order",
 ):
     """Perform numerical gradient checking given a relay function.
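The new `mode` argument is forwarded to `gradient`, so callers can now select first-order AD where the higher-order pass is unnecessary. A hedged sketch of the two call forms (the operator and shape are illustrative, not part of the patch):

    from tvm import relay
    from tvm.relay.testing import check_grad

    x = relay.var("x", shape=(4, 4), dtype="float32")
    func = relay.Function([x], relay.exp(x))

    check_grad(func)                      # default: mode="higher_order"
    check_grad(func, mode="first_order")  # new: pick first-order AD instead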
@@ -112,7 +120,7 @@ def check_grad( """ fwd_func = run_infer_type(func) - bwd_func = run_infer_type(gradient(fwd_func)) + bwd_func = run_infer_type(gradient(fwd_func, mode=mode)) if scale is None: scale = 10 * eps diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index 80a567d9cb65..bcf75de7915b 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -168,13 +168,6 @@ def test_global_avg_pool2d_grad(): def verify_conv2d_grad(dshape, wshape, strides, padding, dilation, groups=1, mode="higher_order"): - try: - import torch - import torch.nn.functional as F - except ImportError: - print("Skip because pytorch is not installed") - return - dtype = "float32" data = relay.var("data", shape=dshape, dtype=dtype) weight = relay.var("weight", shape=wshape, dtype=dtype) @@ -182,49 +175,7 @@ def verify_conv2d_grad(dshape, wshape, strides, padding, dilation, groups=1, mod data, weight, strides=strides, padding=padding, dilation=dilation, groups=groups ) fwd_func = relay.Function([data, weight], conv) - fwd_func = run_infer_type(fwd_func) - bwd_func = run_infer_type(gradient(fwd_func, mode=mode)) - - data_pt = torch.randn(*dshape, dtype=torch.float32, requires_grad=True) - weight_pt = torch.randn(*wshape, dtype=torch.float32, requires_grad=True) - out_pt = F.conv2d( - data_pt, weight_pt, stride=strides, padding=padding, dilation=dilation, groups=groups - ) - grad_output_pt = torch.ones(out_pt.shape) - grad_input_pt = ( - F.grad.conv2d_input( - dshape, - weight_pt, - grad_output_pt, - stride=strides, - padding=padding, - dilation=dilation, - groups=groups, - ) - .detach() - .numpy() - ) - grad_weight_pt = ( - F.grad.conv2d_weight( - data_pt, - wshape, - grad_output_pt, - stride=strides, - padding=padding, - dilation=dilation, - groups=groups, - ) - .detach() - .numpy() - ) - - for target, ctx in tvm.testing.enabled_targets(): - data = tvm.nd.array(data_pt.detach().numpy(), ctx) - weight = tvm.nd.array(weight_pt.detach().numpy(), ctx) - intrp = relay.create_executor(ctx=ctx, target=target) - op_res, (grad_input, grad_weight) = intrp.evaluate(bwd_func)(data, weight) - np.testing.assert_allclose(grad_input.asnumpy(), grad_input_pt, rtol=1e-4, atol=1e-4) - np.testing.assert_allclose(grad_weight.asnumpy(), grad_weight_pt, rtol=1e-4, atol=1e-4) + check_grad(fwd_func, mode=mode) @tvm.testing.uses_gpu From 50ed7d830dcb39e041be70f10faa52d9816134c7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 3 Dec 2020 15:04:13 -0800 Subject: [PATCH 251/258] [AutoScheduler] Misc update to hardware parameter and task scheduler (#7020) * [AutoScheduler] Mics update to hardware parameter and task scheduler * update * update * update * update * fix * fix * update * improve warning message * update * lint * update * update * fix * Apply suggestions from code review * trigger CI --- docs/conf.py | 1 + include/tvm/auto_scheduler/search_task.h | 26 ++++++---- python/tvm/auto_scheduler/__init__.py | 7 ++- python/tvm/auto_scheduler/auto_schedule.py | 32 +++++++++++- .../tvm/auto_scheduler/relay_integration.py | 12 +++++ python/tvm/auto_scheduler/search_policy.py | 2 +- python/tvm/auto_scheduler/task_scheduler.py | 8 ++- python/tvm/relay/op/strategy/cuda.py | 9 ++-- python/tvm/relay/op/strategy/x86.py | 17 +++++-- src/auto_scheduler/search_task.cc | 51 +++++++++++-------- .../test_auto_scheduler_compute_dag.py | 2 +- .../unittest/test_auto_scheduler_feature.py | 4 +- 12 files changed, 121 insertions(+), 50 deletions(-) diff --git 
a/docs/conf.py b/docs/conf.py index 32bc095272aa..a7198bf22355 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -251,6 +251,7 @@ "tune_relay_mobile_gpu.py", ], "auto_scheduler": ["tune_matmul_x86.py", "tune_conv2d_layer_cuda.py"], + "dev": ["low_level_custom_pass.py", "use_pass_infra.py", "bring_your_own_datatypes.py"], } diff --git a/include/tvm/auto_scheduler/search_task.h b/include/tvm/auto_scheduler/search_task.h index 85154b5e406b..6d85835d2e4b 100755 --- a/include/tvm/auto_scheduler/search_task.h +++ b/include/tvm/auto_scheduler/search_task.h @@ -44,17 +44,16 @@ class HardwareParamsNode : public Object { int cache_line_bytes; // GPU related parameters got from device query API - - /*! \brief The max shared memory per block. */ - int max_shared_memory_per_block{INT32_MAX}; - /*! \brief The max register memory per block. */ - int max_registers_per_block{INT32_MAX}; - /*! \brief The max threads per block. */ - int max_threads_per_block{INT32_MAX}; + /*! \brief The max shared memory per block in bytes. */ + int max_shared_memory_per_block; + /*! \brief The max number of register per block. */ + int max_registers_per_block; + /*! \brief The max number of threads per block. */ + int max_threads_per_block; /*! \brief The max vthread extent. */ - int max_vthread_extent{INT32_MAX}; + int max_vthread_extent; /*! \brief The thread numbers of a warp. */ - int warp_size{INT32_MAX}; + int warp_size; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("num_cores", &num_cores); @@ -90,8 +89,15 @@ class HardwareParams : public ObjectRef { * \param num_cores The number of cores. * \param vector_unit_bytes The width of vector units in bytes. * \param cache_line_bytes The size of cache line in bytes. + * \param max_shared_memory_per_block The max amount of shared memory per block for GPU. + * \param max_registers_per_block The max number of registers per block for GPU. + * \param max_threads_per_block The max number of threads per block for GPU. + * \param max_vthread_extent The max extent of vthread for GPU. + * \param warp_size The warp size for GPU */ - HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes); + HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes, + int max_shared_memory_per_block, int max_registers_per_block, + int max_threads_per_block, int max_vthread_extent, int warp_size); TVM_DEFINE_OBJECT_REF_METHODS(HardwareParams, ObjectRef, HardwareParamsNode); TVM_DEFINE_OBJECT_REF_COW_METHOD(HardwareParamsNode); diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py index 5bf2335ec7cf..bee2e7f423b6 100644 --- a/python/tvm/auto_scheduler/__init__.py +++ b/python/tvm/auto_scheduler/__init__.py @@ -44,7 +44,12 @@ LocalRPCMeasureContext, ) from .measure_record import RecordToFile, RecordReader, load_best, load_records, save_records -from .relay_integration import extract_tasks, remove_index_check, rewrite_compute_body +from .relay_integration import ( + extract_tasks, + remove_index_check, + rewrite_compute_body, + is_auto_scheduler_enabled, +) from .search_task import SearchTask from .search_policy import EmptyPolicy, SketchPolicy, PreloadMeasuredStates from .task_scheduler import TaskScheduler diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py index 5bc13fec62a9..57dc9588df51 100644 --- a/python/tvm/auto_scheduler/auto_schedule.py +++ b/python/tvm/auto_scheduler/auto_schedule.py @@ -46,11 +46,39 @@ class HardwareParams(Object): The width of vector units in bytes. 
cache_line_bytes : int The size of cache line in bytes. + max_shared_memory_per_block : int + The max shared memory per block in bytes. + max_registers_per_block : int + The max number of register per block. + max_threads_per_block : int + The max number of threads per block. + max_vthread_extent : int + The max vthread extent. + warp_size : int + The thread numbers of a warp. """ - def __init__(self, num_cores, vector_unit_bytes, cache_line_bytes): + def __init__( + self, + num_cores, + vector_unit_bytes, + cache_line_bytes, + max_shared_memory_per_block, + max_registers_per_block, + max_threads_per_block, + max_vthread_extent, + warp_size, + ): self.__init_handle_by_constructor__( - _ffi_api.HardwareParams, num_cores, vector_unit_bytes, cache_line_bytes + _ffi_api.HardwareParams, + num_cores, + vector_unit_bytes, + cache_line_bytes, + max_shared_memory_per_block, + max_registers_per_block, + max_threads_per_block, + max_vthread_extent, + warp_size, ) diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 25b88811709e..5a197910e334 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -28,6 +28,7 @@ import tvm from tvm import autotvm, te, transform +from tvm.ir.transform import PassContext from tvm.runtime import convert_to_object from tvm.te.tensor import ComputeOp, PlaceholderOp, Tensor from tvm.tir import expr as _expr @@ -342,3 +343,14 @@ def rewrite_compute_body(compute_tensor, new_layout): num = op_node.num_outputs outputs = tuple(op_node.output(i) for i in range(num)) return outputs[0] if num == 1 else outputs + + +def is_auto_scheduler_enabled(): + """Return whether the auto-scheduler is enabled. + + Parameters + ---------- + enabled: bool + Whether the auto-scheduler is enabled + """ + return PassContext.current().config.get("relay.backend.use_auto_scheduler", False) diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py index 35429552dc74..6f565edbd378 100644 --- a/python/tvm/auto_scheduler/search_policy.py +++ b/python/tvm/auto_scheduler/search_policy.py @@ -151,7 +151,7 @@ class SketchPolicy(SearchPolicy): "sample_init_min_population": 50, "sample_init_use_measured_ratio": 0.2, "evolutionary_search_population": 2048, - "evolutionary_search_num_iters": 3, + "evolutionary_search_num_iters": 4, "evolutionary_search_mutation_prob": 0.85, "cpu_multi_level_tiling_structure": "SSRSRS", "gpu_multi_level_tiling_structure": "SSSRRSRS", diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index 26bfa2e376b4..a3dbcae64b60 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -181,7 +181,7 @@ class TaskScheduler: The parameter used for 'gradient' strategy callbacks: Optional[List[TaskSchedulerCallback]] The task scheduler callbacks that will be called before and after tuning a task. - If None, then PrintTableInfo callback will be used. + If None, PrintTableInfo and LogEstimatedLatency callback will be used. 
""" def __init__( @@ -214,7 +214,11 @@ def __init__( self.beta = beta self.gamma = gamma self.backward_window_size = backward_window_size - self.callbacks = callbacks if callbacks is not None else [PrintTableInfo()] + self.callbacks = ( + callbacks + if callbacks is not None + else [PrintTableInfo(), LogEstimatedLatency("total_latency.tsv")] + ) assert len(self.tasks) != 0, "No tasks" assert self.strategy in ["round-robin", "gradient"] diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index bd96cad8ed02..029690680e7d 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -18,7 +18,7 @@ # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import from tvm import topi import tvm -from tvm.ir.transform import PassContext +from tvm.auto_scheduler import is_auto_scheduler_enabled from tvm.te import SpecializedCondition from tvm.contrib import nvcc from tvm._ffi import get_global_func @@ -230,10 +230,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): ) # register auto-scheduler implementations - use_auto_scheduler = PassContext.current().config.get( - "relay.backend.use_auto_scheduler", False - ) - if use_auto_scheduler and judge_winograd_auto_scheduler: + if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler: strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), naive_schedule, # this implementation should never be picked by autotvm @@ -460,7 +457,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda", ) - if PassContext.current().config.get("relay.backend.use_auto_scheduler", False): + if is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), naive_schedule, # this implementation should never be picked by autotvm diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 98b56ef4d1c0..5dfeca65e5c3 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -20,6 +20,7 @@ import re from tvm import topi +from tvm.auto_scheduler import is_auto_scheduler_enabled from tvm.te import SpecializedCondition from tvm.relay.ty import is_dynamic from .generic import * @@ -117,6 +118,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": assert kernel_layout == "HWIO" + if not is_auto_scheduler_enabled(): + logger.warning("conv2d NHWC layout is not optimized for x86 with autotvm.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), @@ -124,7 +127,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): ) elif layout == "HWCN": assert kernel_layout == "HWIO" - logger.warning("conv2d HWCN layout is not optimized for x86.") + if not is_auto_scheduler_enabled(): + logger.warning("conv2d HWCN layout is not optimized for x86 with autotvm.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_hwcn), wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), @@ -157,7 +161,10 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): return depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": assert kernel_layout == "HWOI" - logger.warning("depthwise_conv2d NHWC layout is 
not optimized for x86.") + if not is_auto_scheduler_enabled(): + logger.warning( + "depthwise_conv2d NHWC layout is not optimized for x86 with autotvm." + ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), @@ -168,7 +175,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): else: # group_conv2d if layout == "NCHW": assert kernel_layout == "OIHW" - logger.warning("group_conv2d is not optimized for x86.") + if not is_auto_scheduler_enabled(): + logger.warning("group_conv2d is not optimized for x86 with autotvm.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), @@ -176,7 +184,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): ) elif layout == "NHWC": assert kernel_layout == "HWIO" - logger.warning("group_conv2d is not optimized for x86.") + if not is_auto_scheduler_enabled(): + logger.warning("group_conv2d is not optimized for x86 with autotvm.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index bd09a70c0655..48b3fc5eb38f 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -35,22 +35,26 @@ namespace auto_scheduler { TVM_REGISTER_NODE_TYPE(HardwareParamsNode); TVM_REGISTER_NODE_TYPE(SearchTaskNode); -HardwareParams::HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes) { +HardwareParams::HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes, + int max_shared_memory_per_block, int max_registers_per_block, + int max_threads_per_block, int max_vthread_extent, int warp_size) { auto node = make_object(); node->num_cores = num_cores; node->vector_unit_bytes = vector_unit_bytes; node->cache_line_bytes = cache_line_bytes; + node->max_shared_memory_per_block = max_shared_memory_per_block; + node->max_registers_per_block = max_registers_per_block; + node->max_threads_per_block = max_threads_per_block; + node->max_vthread_extent = max_vthread_extent; + node->warp_size = warp_size; data_ = std::move(node); } HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target, const Target& target_host) { if (target->kind->device_type == kDLCPU) { - return HardwareParams(tvm::runtime::threading::MaxConcurrency(), 64, 64); + return HardwareParams(tvm::runtime::threading::MaxConcurrency(), 64, 64, 0, 0, 0, 0, 0); } else if (target->kind->device_type == kDLGPU) { - auto hardware_params = HardwareParams(-1, 16, 64); - auto* p_hardware_params = hardware_params.CopyOnWrite(); - auto ctx = TVMContext{kDLGPU, 0}; auto func = tvm::runtime::Registry::Get("device_api.gpu"); ICHECK(func != nullptr) << "Cannot find GPU device_api in registry"; @@ -58,31 +62,30 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target tvm::runtime::TVMRetValue ret; device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret); - p_hardware_params->max_shared_memory_per_block = ret; + int max_shared_memory_per_block = ret; device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxRegistersPerBlock, &ret); - p_hardware_params->max_registers_per_block = ret; + int max_registers_per_block = ret; device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); - 
p_hardware_params->max_threads_per_block = ret; + int max_threads_per_block = ret; device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); - p_hardware_params->warp_size = ret; - - p_hardware_params->max_vthread_extent = p_hardware_params->warp_size / 4; + int warp_size = ret; - return hardware_params; + int max_vthread_extent = warp_size / 4; + return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_registers_per_block, + max_threads_per_block, max_vthread_extent, warp_size); } else if (target->kind->device_type == kDLMetal) { // Reference: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf // This setting looks working for Metal GPUs later than A10 - auto hardware_params = HardwareParams(-1, 16, 64); - auto* p_hardware_params = hardware_params.CopyOnWrite(); - p_hardware_params->max_shared_memory_per_block = 32 * 1024; - p_hardware_params->max_registers_per_block = 4 * 1024; - p_hardware_params->max_threads_per_block = 1024; - p_hardware_params->warp_size = 8; - p_hardware_params->max_vthread_extent = p_hardware_params->warp_size / 4; - return hardware_params; + int max_shared_memory_per_block = 32 * 1024; + int max_registers_per_block = 4 * 1024; + int max_threads_per_block = 1024; + int warp_size = 8; + int max_vthread_extent = warp_size / 4; + return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_registers_per_block, + max_threads_per_block, max_vthread_extent, warp_size); } else { LOG(FATAL) << "No default hardware parameters for target: " << target; } @@ -106,8 +109,12 @@ SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target targe } TVM_REGISTER_GLOBAL("auto_scheduler.HardwareParams") - .set_body_typed([](int num_cores, int vector_unit_bytes, int cache_line_bytes) { - return HardwareParams(num_cores, vector_unit_bytes, cache_line_bytes); + .set_body_typed([](int num_cores, int vector_unit_bytes, int cache_line_bytes, + int max_shared_memory_per_block, int max_registers_per_block, + int max_threads_per_block, int max_vthread_extent, int warp_size) { + return HardwareParams(num_cores, vector_unit_bytes, cache_line_bytes, + max_shared_memory_per_block, max_registers_per_block, + max_threads_per_block, max_vthread_extent, warp_size); }); TVM_REGISTER_GLOBAL("auto_scheduler.SearchTask") diff --git a/tests/python/unittest/test_auto_scheduler_compute_dag.py b/tests/python/unittest/test_auto_scheduler_compute_dag.py index bde3b786d370..1356154cacd6 100644 --- a/tests/python/unittest/test_auto_scheduler_compute_dag.py +++ b/tests/python/unittest/test_auto_scheduler_compute_dag.py @@ -124,7 +124,7 @@ def test_stage_order(): dag, json.dumps(("test-key",)), tvm.target.Target("llvm"), - hardware_params=auto_scheduler.HardwareParams(100000, 16, 64), + hardware_params=auto_scheduler.HardwareParams(100000, 16, 64, 0, 0, 0, 0, 0), ) task2 = pickle.loads(pickle.dumps(task)) diff --git a/tests/python/unittest/test_auto_scheduler_feature.py b/tests/python/unittest/test_auto_scheduler_feature.py index 8cbe201859cc..7412dbc1f8a4 100644 --- a/tests/python/unittest/test_auto_scheduler_feature.py +++ b/tests/python/unittest/test_auto_scheduler_feature.py @@ -153,7 +153,9 @@ def test_gpu_feature(): inp.task.workload_key, inp.task.target, None, - auto_scheduler.HardwareParams(100000, 16, 64), + auto_scheduler.HardwareParams( + 100000, 16, 64, 1 << 30, 1 << 30, 1 << 30, 1 << 30, 1 << 30 + ), ) state = dag.infer_bound_from_state(inputs[0].state) From 41e68c1928bd2572ea03708e96898174a221054f Mon Sep 17 00:00:00 2001 From: Yao Wang Date: 
Thu, 3 Dec 2020 20:50:59 -0800
Subject: [PATCH 252/258] [Topi] Fix GPU Dynamic Topk by Improving Dynamic
 Strided Slice in Topi (#7018)

* Fix GPU dynamic Topk

* Fix style

* Minor fix

* Simplify dynamic checking

* Fix lint

* More improvements

* Disable test any topk

---
 include/tvm/topi/detail/constant_utils.h | 15 +++++++
 include/tvm/topi/nn.h                    |  2 +-
 include/tvm/topi/transform.h             | 43 +++++++++++++++----
 python/tvm/topi/cuda/sort.py             | 20 +++++----
 src/relay/op/tensor/transform.cc         | 19 +++++---
 .../relay/dyn/test_dynamic_op_level6.py  |  4 +-
 tests/python/relay/test_any.py           | 10 ++---
 7 files changed, 80 insertions(+), 33 deletions(-)

diff --git a/include/tvm/topi/detail/constant_utils.h b/include/tvm/topi/detail/constant_utils.h
index 412c79330ca9..49ce21b5732e 100644
--- a/include/tvm/topi/detail/constant_utils.h
+++ b/include/tvm/topi/detail/constant_utils.h
@@ -47,6 +47,21 @@ using namespace tvm::te;
  */
 inline bool IsConstInt(PrimExpr expr) { return expr->IsInstance<tvm::tir::IntImmNode>(); }

+/*!
+ * \brief Test whether the given Array has every element as constant integer
+ *
+ * \param array the array to query
+ *
+ * \return true if every element in array is constant int or uint, false otherwise.
+ */
+inline bool IsConstIntArray(Array<PrimExpr> array) {
+  bool is_const_int = true;
+  for (auto const& elem : array) {
+    is_const_int &= elem->IsInstance<tvm::tir::IntImmNode>();
+  }
+  return is_const_int;
+}
+
 /*!
  * \brief Get the value of the given constant integer expression. An error
  * is logged if the given expression is not a constant integer.
diff --git a/include/tvm/topi/nn.h b/include/tvm/topi/nn.h
index f958048f13c3..71944071a7ce 100644
--- a/include/tvm/topi/nn.h
+++ b/include/tvm/topi/nn.h
@@ -614,7 +614,7 @@ inline tvm::te::Tensor batch_to_space_nd(const tvm::te::Tensor& data,
   out = reshape(out, r_p_shape);

   // Crop the start and end of dimensions of out
-  Array<Integer> begin_idx, end_idx, strides;
+  Array<PrimExpr> begin_idx, end_idx, strides;
   for (size_t i = 0; i < r_p_shape.size(); ++i) {
     strides.push_back(Integer(1));
     if (i > 0 && i <= num_block_dims) {
diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index c2a4843dedd0..a04762f28feb 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -598,17 +598,42 @@ inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& b
 *
 * \return A Tensor whose op member is the split operation
 */
-inline Tensor strided_slice(const Tensor& x, const Array<Integer>& begin, const Array<Integer>& end,
-                            const Array<Integer>& strides, std::string slice_mode = "end",
-                            std::string name = "T_strided_slice", std::string tag = kInjective) {
+inline Tensor strided_slice(const Tensor& x, const Array<PrimExpr>& begin,
+                            const Array<PrimExpr>& end, const Array<PrimExpr>& strides,
+                            std::string slice_mode = "end", std::string name = "T_strided_slice",
+                            std::string tag = kInjective) {
   size_t src_tensor_dim = static_cast<size_t>(x->shape.size());
+  // Quick path for dynamic shape strided slice.
+  // This is for ease of use of dynamic strided slice in topi.
+  bool is_static = IsConstIntArray(x->shape);
+  is_static &= IsConstIntArray(begin);
+  is_static &= IsConstIntArray(end);
+  is_static &= IsConstIntArray(strides);
+
+  Array<PrimExpr> out_shape;
+  if (!is_static) {
+    for (size_t i = 0; i < src_tensor_dim; ++i) {
+      out_shape.push_back(indexdiv(end[i] - begin[i], strides[i]));
+    }
+    return te::compute(
+        out_shape,
+        [&](const Array<tvm::tir::Var>& indices) {
+          Array<PrimExpr> real_indices;
+          for (size_t i = 0; i < src_tensor_dim; ++i) {
+            real_indices.push_back(indices[i] * strides[i] + begin[i]);
+          }
+          return x(real_indices);
+        },
+        name, tag);
+  }
+
   // Setup the ranges.
   // NOTE: this code duplicates the shape inference logic relay.op
   // Consider to refactor in the future.
   std::vector<int64_t> stride_vec(src_tensor_dim, 1);
   for (size_t i = 0; i < strides.size(); ++i) {
     ICHECK(strides[i].defined());
-    stride_vec[i] = strides[i]->value;
+    stride_vec[i] = GetConstInt(strides[i]);
   }

   const int64_t max_range = std::numeric_limits<int64_t>::max();

@@ -619,7 +644,7 @@ inline Tensor strided_slice(const Tensor& x, const Array<Integer>& begin, const
       // value=None
       begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range);
     } else {
-      begin_vec.push_back(begin[i]->value);
+      begin_vec.push_back(GetConstInt(begin[i]));
     }
   }
   for (size_t i = begin_vec.size(); i < src_tensor_dim; ++i) {
@@ -633,20 +658,20 @@ inline Tensor strided_slice(const Tensor& x, const Array<Integer>& begin, const
     if (!end[i].defined()) {
       end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
     } else if (slice_mode == "size") {
-      if (end[i]->value < 0) {
+      int64_t end_val = GetConstInt(end[i]);
+      if (end_val < 0) {
         end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
       } else {
-        end_vec.push_back(begin_vec[i] + end[i]->value);
+        end_vec.push_back(begin_vec[i] + end_val);
       }
     } else {
-      end_vec.push_back(end[i]->value);
+      end_vec.push_back(GetConstInt(end[i]));
     }
   }
   for (size_t i = end_vec.size(); i < src_tensor_dim; ++i) {
     end_vec.push_back(stride_vec[i] < 0 ?
0 : max_range); } // Compute - Array out_shape; Array begin_expr; Array strides_expr; diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py index 465299a5bc8f..ac14f5aae779 100644 --- a/python/tvm/topi/cuda/sort.py +++ b/python/tvm/topi/cuda/sort.py @@ -479,27 +479,28 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): name="topk_gpu", tag="topk_gpu", ) - if k < 1: + if isinstance(k, int) and k < 1: if ret_type == "indices": return output[1] return output beg = [0] * ndim end = [] + strides = [1] * ndim for i in range(ndim): if i == axis: - end.append(k) + end.append(k if isinstance(k, int) else tvm.te.size_var("dim")) else: end.append(data.shape[i]) if ret_type == "both": values_out, indices_out = output - values_out = strided_slice(values_out, beg, end) - indices_out = strided_slice(indices_out, beg, end) + values_out = strided_slice(values_out, beg, end, strides) + indices_out = strided_slice(indices_out, beg, end, strides) output = [values_out, indices_out] elif ret_type == "values": - output = [strided_slice(output, beg, end)] + output = [strided_slice(output, beg, end, strides)] else: # ret_type == "indices" indices_out = output[1] - output = [strided_slice(indices_out, beg, end)] + output = [strided_slice(indices_out, beg, end, strides)] return output @@ -561,10 +562,11 @@ def topk_thrust(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int tag="topk_gpu", ) - if k > 0: + if not isinstance(k, int) or k > 0: beg = [0] * ndim - end = data.shape[:-1] + [k] - out = [strided_slice(o, beg, end) for o in out] + end = data.shape[:-1] + [k if isinstance(k, int) else tvm.te.size_var("dim")] + strides = [1] * ndim + out = [strided_slice(o, beg, end, strides) for o in out] if axis != ndim - 1: axes = swap(list(range(ndim)), axis) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 410738a6417d..a2b4f67a21d0 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -2432,6 +2432,7 @@ Array StridedSliceCompute(const Attrs& attrs, const Array(); ICHECK(param != nullptr); Array begin, end, strides; + Array begin_expr, end_expr, strides_expr; begin = param->begin.value(); end = param->end.value(); strides = param->strides.value(); @@ -2444,8 +2445,6 @@ Array StridedSliceCompute(const Attrs& attrs, const Array begin_expr; - Array strides_expr; for (size_t i = 0; i < src_tensor_dim; ++i) { int64_t begin_i = begin[i]->value; if (begin_i < 0) { @@ -2466,8 +2465,19 @@ Array StridedSliceCompute(const Attrs& attrs, const Array{topi::strided_slice(inputs[0], begin, end, strides, param->slice_mode)}; + return Array{ + topi::strided_slice(inputs[0], begin_expr, end_expr, strides_expr, param->slice_mode)}; } // Positional relay function to create StridedSlice operator used by frontend FFI. 
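The practical effect of generalizing `topi.strided_slice` shows up one level higher: a `topk` whose `k` is only known at runtime can now be compiled for CUDA, which is what the re-enabled `test_dynamic_topk` below exercises. A sketch under that assumption (shapes, executor kind, and input values are illustrative):

    import numpy as np
    import tvm
    from tvm import relay

    x = relay.var("x", shape=(20, 100), dtype="float32")
    k = relay.var("k", shape=(), dtype="int64")  # k is a runtime value
    values = relay.topk(x, k, axis=-1, ret_type="values")
    mod = tvm.IRModule.from_expr(relay.Function([x, k], values))

    # The VM handles the dynamic output shape produced by the slice.
    ex = relay.create_executor("vm", mod=mod, ctx=tvm.gpu(0), target="cuda")
    out = ex.evaluate()(np.random.rand(20, 100).astype("float32"), np.int64(3))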
@@ -2783,8 +2793,7 @@ Array SliceLikeCompute(const Attrs& attrs, const Array& << topi::GetConstInt(src_shape[axis]); } } - return Array{topi::strided_slice(inputs[0], GetIntArray(begin_idx), - GetIntArray(end_idx), GetIntArray(strides), "end")}; + return Array{topi::strided_slice(inputs[0], begin_idx, end_idx, strides, "end")}; } TVM_REGISTER_GLOBAL("relay.op._make.slice_like").set_body_typed(MakeSliceLike); diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py index aeed8db7c1b6..52abbe2a15b6 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level6.py +++ b/tests/python/relay/dyn/test_dynamic_op_level6.py @@ -22,8 +22,8 @@ from tvm import relay import tvm.testing -# TODO(mbrookhart): Enable when we can get it working -# @tvm.testing.uses_gpu + +@tvm.testing.uses_gpu def test_dynamic_topk(): def verify_topk(k, axis, ret_type, is_ascend, dtype): shape = (20, 100) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index eec6aa21c69b..ee67e67b282f 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -815,15 +815,11 @@ def verify_any_topk(data_shape, kval, np_dshape, dtype, const_k=False): else: ref_out = sorted[0:kval] - for kind in ["debug", "vm"]: - ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") - result = ex.evaluate()(*in_vals) - tvm.testing.assert_allclose(result.asnumpy(), ref_out) - - # TODO(@zhiics) Fix topk cuda schedule for dynamic inputs - # check_result(in_vals, mod, ref_out) + check_result(in_vals, mod, ref_out) +# TODO(kevinthesun): enable this test when Thrust is available in ci. +# @tvm.testing.uses_gpu def test_any_topk(): verify_any_topk(any_dims(1), 5, (10,), "float32") verify_any_topk(any_dims(2), 2, (6, 3), "int32") From 846614f04b8275e2a1a2258b7c857566f0289098 Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Thu, 3 Dec 2020 21:06:12 -0800 Subject: [PATCH 253/258] [Relay][Pass] Clean up DCE tests in preparation for refactoring. 
(#7029) * Clean up DCE tests * Format * Fix * Fix --- .../relay/test_pass_dead_code_elimination.py | 267 +++++++++++------- 1 file changed, 159 insertions(+), 108 deletions(-) diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py index 6da6c3efe5c5..127035c5d540 100644 --- a/tests/python/relay/test_pass_dead_code_elimination.py +++ b/tests/python/relay/test_pass_dead_code_elimination.py @@ -25,128 +25,179 @@ import pytest -class env: - def __init__(self): - self.shape = tvm.runtime.convert([1, 2, 3]) - self.tt = relay.TensorType(self.shape, "float32") - self.int32 = relay.TensorType([], "int32") - self.float32 = relay.TensorType([], "float32") - self.one = relay.const(1.0) - self.two = relay.const(2.0) - self.three = relay.const(3.0) - self.a = relay.Var("a", self.float32) - self.b = relay.Var("b", self.float32) - self.c = relay.Var("c", self.float32) - self.d = relay.Var("d", self.float32) - self.e = relay.Var("e", self.float32) - self.x = relay.Var("x", self.int32) - self.y = relay.Var("y", self.int32) - self.z = relay.Var("z", self.int32) - - -e = env() - - -def run_opt_pass(expr, opt_pass): - assert isinstance(opt_pass, tvm.transform.Pass) - mod = tvm.IRModule.from_expr(expr) - mod = opt_pass(mod) - entry = mod["main"] - return entry if isinstance(expr, relay.Function) else entry.body - - -def test_let(): - orig = relay.Let(e.x, e.y, e.z) - orig = run_opt_pass(orig, transform.DeadCodeElimination()) - assert tvm.ir.structural_equal(Function(free_vars(orig), orig), Function([e.z], e.z)) - - -def test_used_let(): - orig = relay.Let(e.c, e.one, e.c + e.c) - orig = run_opt_pass(orig, transform.DeadCodeElimination()) - expected = relay.Let(e.c, e.one, e.c + e.c) - assert tvm.ir.structural_equal(Function([], orig), Function([], expected)) - - -def test_inline(): - orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.c)) - orig = run_opt_pass(orig, transform.DeadCodeElimination(True)) - tvm.ir.assert_structural_equal(Function(free_vars(orig), orig), Function([e.d], e.d)) - - -def test_chain_unused_let(): - orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.e)) - orig = run_opt_pass(orig, transform.DeadCodeElimination()) - assert tvm.ir.structural_equal(Function(free_vars(orig), orig), Function([e.e], e.e)) - - -def use_f(func): - f = relay.Var("f") - n = relay.Var("n", e.int32) - data = relay.Var("data", e.float32) - funcbody = relay.If( - equal(n, relay.const(0)), data, relay.Call(f, [subtract(n, relay.const(1)), log(data)]) +def optimize_source(source, passes): + if not isinstance(passes, list): + passes = [passes] + + optimize = tvm.transform.Sequential(passes) + module = tvm.parser.parse(source) + return optimize(module) + + +def optimize_and_check(before_source, after_source, passes): + optimize_module = optimize_source(before_source, passes) + after_module = tvm.parser.parse(after_source) + print(optimize_module) + print(after_module) + assert tvm.ir.structural_equal(after_module, optimize_module) + + +def test_dead_let(): + before_program = """ + #[version = "0.0.5"] + def @main(%z: int) { + let %x = 1; + %z + } + """ + after_program = """ + #[version = "0.0.5"] + def @main(%z: int) { + %z + } + """ + optimize_and_check(before_program, after_program, transform.DeadCodeElimination()) + + +def test_one_live_let(): + before_program = """ + #[version = "0.0.5"] + def @main(%z: int) { + let %x = 1; + let %y = 2; + %x + %x + } + """ + after_program = """ + #[version = "0.0.5"] + def @main(%z: int) { + let %x = 1; + %x + %x + } + 
""" + optimize_and_check(before_program, after_program, transform.DeadCodeElimination()) + + +def test_nested_let(): + before_program = """ + #[version = "0.0.5"] + def @main(%d: int, %b: int) { + let %a = %b; + let %c = %d; + %c + } + """ + after_program = """ + #[version = "0.0.5"] + def @main(%d: int, %b: int) { + let %c = %d; + %c + } + """ + optimize_and_check(before_program, after_program, transform.DeadCodeElimination()) + + +def test_live_recursion(): + before_program = """ + #[version = "0.0.5"] + def @main() { + let %f = fn (%n: int, %data: int) -> int { + if (%n == 0) { + %data + } else { + %f(%n - 1, log(%data)) + } + }; + %f(2, 10000) + } + """ + + after_program = """ + #[version = "0.0.5"] + def @main() { + let %f = fn (%n: int, %data: int) -> int { + if (%n == 0) { + %data + } else { + %f(%n - 1, log(%data)) + } + }; + %f(2, 10000) + } + """ + + optimize_and_check( + before_program, after_program, [transform.DeadCodeElimination(), transform.InferType()] ) - value = relay.Function([n, data], funcbody, e.float32, []) - return relay.Let(f, value, func(f)) -# make sure we dont infinite loop -def test_recursion(): +def test_dead_recursion(): + before_program = """ + #[version = "0.0.5"] + def @main() { + let %f = fn (%n: int, %data: int) -> int { + if (%n == 0) { + %data + } else { + %f(%n - 1, log(%data)) + } + }; + () + } """ - Program: - let f(n: i32, data: f32) -> f32 = { - if (n == 0) { - return data; - } else { - return f(n - 1, log(data)); - } - } - f(2, 10000); + + after_program = """ + #[version = "0.0.5"] + def @main() { + () + } """ - orig = use_f(lambda f: relay.Call(f, [relay.const(2), relay.const(10000.0)])) - dced = run_opt_pass(orig, transform.DeadCodeElimination()) - orig = run_opt_pass(orig, transform.InferType()) - tvm.ir.assert_structural_equal(dced, orig) + optimize_and_check( + before_program, after_program, [transform.DeadCodeElimination(), transform.InferType()] + ) -def test_recursion_dead(): - x = relay.Let(e.a, e.one, e.three) - dced_f = lambda f: x - dced = run_opt_pass(use_f(dced_f), transform.DeadCodeElimination()) - assert tvm.ir.structural_equal(dced, e.three) +def test_add_with_let(): + before_program = """ + #[version = "0.0.5"] + def @main() { + (let %a = 1; 3) + 2 + } + """ -def test_op_let(): - dced = run_opt_pass(add(relay.Let(e.a, e.one, e.three), e.two), transform.DeadCodeElimination()) - assert tvm.ir.structural_equal(dced, add(e.three, e.two)) + after_program = """ + #[version = "0.0.5"] + def @main() { + 3 + 2 + } + """ + + optimize_and_check( + before_program, after_program, [transform.DeadCodeElimination(), transform.InferType()] + ) def test_tuple_get_item(): - tt = relay.TupleType([e.float32, e.float32]) - t = relay.Var("t", tt) - a = relay.Var("a") - g = relay.TupleGetItem(t, 0) - dced = run_opt_pass(g, transform.DeadCodeElimination()) - assert tvm.ir.structural_equal(Function(free_vars(dced), dced), Function(free_vars(g), g)) - orig = relay.TupleGetItem(relay.Let(a, e.one, t), 0) - dced = run_opt_pass(orig, transform.DeadCodeElimination()) - assert tvm.ir.structural_equal(Function(free_vars(dced), dced), Function(free_vars(g), g)) + before_program = """ + #[version = "0.0.5"] + def @main() { + let %a = 100; + (1, 2, 3, 4).0 + } + """ + after_program = """ + #[version = "0.0.5"] + def @main() { + (1, 2, 3, 4).0 + } + """ -@pytest.mark.timeout(timeout=10, method="thread") -def test_complexity(): - g = inception_v3.get_net(1, 1000, (3, 299, 299), "float32") - run_opt_pass(g, transform.DeadCodeElimination()) + 
optimize_and_check(before_program, after_program, transform.DeadCodeElimination()) if __name__ == "__main__": - test_let() - test_used_let() - test_inline() - test_chain_unused_let() - test_recursion() - test_recursion_dead() - test_op_let() - test_tuple_get_item() - test_complexity() + import sys + + pytest.main(sys.argv) From eeb75c9dca512f7141976df15d36ee7c7b575b73 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 4 Dec 2020 05:03:34 -0800 Subject: [PATCH 254/258] [AutoScheduler] Refactor task interface for tuning single operators (#7028) * [AutoScheduler] Refactor task interface * updae tutorials and tests * update * fix lint * fix lint * update * fix test --- include/tvm/auto_scheduler/compute_dag.h | 4 +- python/tvm/auto_scheduler/__init__.py | 7 +- python/tvm/auto_scheduler/compute_dag.py | 23 +- python/tvm/auto_scheduler/measure.py | 18 +- python/tvm/auto_scheduler/measure_record.py | 2 +- .../tvm/auto_scheduler/relay_integration.py | 9 +- python/tvm/auto_scheduler/search_task.py | 304 +++++++++++++++++- src/auto_scheduler/utils.h | 2 +- src/tir/ir/expr.cc | 25 +- .../test_auto_scheduler_compute_dag.py | 6 +- .../test_auto_scheduler_cost_model.py | 2 +- ...test_auto_scheduler_evolutionary_search.py | 10 +- .../unittest/test_auto_scheduler_feature.py | 15 +- .../test_auto_scheduler_layout_rewrite.py | 22 +- .../unittest/test_auto_scheduler_measure.py | 15 +- .../test_auto_scheduler_search_policy.py | 8 +- .../test_auto_scheduler_sketch_generation.py | 2 +- .../test_auto_scheduler_task_scheduler.py | 12 +- tutorials/auto_scheduler/ci_logs/matmul.json | 2 +- .../auto_scheduler/tune_conv2d_layer_cuda.py | 36 ++- tutorials/auto_scheduler/tune_matmul_x86.py | 49 +-- tutorials/auto_scheduler/tune_network_cuda.py | 5 +- tutorials/auto_scheduler/tune_network_x86.py | 3 +- 23 files changed, 456 insertions(+), 125 deletions(-) diff --git a/include/tvm/auto_scheduler/compute_dag.h b/include/tvm/auto_scheduler/compute_dag.h index da0d196f4912..b9306c64b0b5 100755 --- a/include/tvm/auto_scheduler/compute_dag.h +++ b/include/tvm/auto_scheduler/compute_dag.h @@ -199,7 +199,7 @@ class ComputeDAGNode : public Object { * This is an optimization to rewrite the layout of input tensors according to the schedule we get. */ enum class LayoutRewriteOption : int { - /*! \brief Do not process layout rewrite. */ + /*! \brief Do not perform layout rewrite. */ NoRewrite = 0, /*! \brief Insert layout transformation stages for input placeholders in the compute DAG */ InsertTransformStage = 1, @@ -207,7 +207,7 @@ enum class LayoutRewriteOption : int { * \brief Do not insert layout transformation stages and assume the input placeholders * are pre-transformed. * \note The lowered function with this option does not accept the origial input shapes, - * so this option must be used along with a layout conversion pass in Relay. + * so this option must be used along with `AutoSchedulerLayoutRewrite` pass in Relay. */ RewriteForPreTransformed = 2, }; diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py index bee2e7f423b6..4926b88e4658 100644 --- a/python/tvm/auto_scheduler/__init__.py +++ b/python/tvm/auto_scheduler/__init__.py @@ -31,8 +31,7 @@ from . 
import workload_registry # Shortcut -from .auto_schedule import TuningOptions, HardwareParams, create_task, auto_schedule -from .compute_dag import ComputeDAG +from .compute_dag import ComputeDAG, LayoutRewriteOption from .cost_model import RandomModel, XGBModel from .dispatcher import DispatchContext, ApplyHistoryBest from .measure import ( @@ -43,14 +42,14 @@ RPCRunner, LocalRPCMeasureContext, ) -from .measure_record import RecordToFile, RecordReader, load_best, load_records, save_records +from .measure_record import RecordToFile, RecordReader, load_best_record, load_records, save_records from .relay_integration import ( extract_tasks, remove_index_check, rewrite_compute_body, is_auto_scheduler_enabled, ) -from .search_task import SearchTask +from .search_task import SearchTask, TuningOptions, HardwareParams, create_task, auto_schedule from .search_policy import EmptyPolicy, SketchPolicy, PreloadMeasuredStates from .task_scheduler import TaskScheduler from .workload_registry import register_workload, make_workload_key diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py index cba3600ccf6e..a6f99542e7d0 100755 --- a/python/tvm/auto_scheduler/compute_dag.py +++ b/python/tvm/auto_scheduler/compute_dag.py @@ -31,6 +31,20 @@ from .workload_registry import workload_key_to_tensors +class LayoutRewriteOption: + """Options for applying layout rewrite.""" + + # Do not perform layout rewrite + NO_REWRITE = 0 + # Insert layout transformation stages for input placeholders in the compute DAG + INSERT_TRANSFORM_STAGE = 1 + # Do not insert layout transformation stages and assume the input placeholders + # are pre-transformed. + # Note: The lowered function with this option does not accept the origial input shapes, + # so this option must be used along with `AutoSchedulerLayoutRewrite` pass in Relay. + REWRITE_FOR_PRE_TRANSFORMED = 2 + + @tvm._ffi.register_object("auto_scheduler.ComputeDAG") class ComputeDAG(Object): """ @@ -52,11 +66,6 @@ class ComputeDAG(Object): Input/output tensors or workload key for a compute declaration. """ - # Layout Rewrite Options - NoRewrite = 0 - InsertTransformStage = 1 - RewriteForPreTransformed = 2 - def __init__(self, compute_or_sche): if isinstance(compute_or_sche, str): compute = workload_key_to_tensors(compute_or_sche) @@ -92,7 +101,7 @@ def get_init_state(self): """ return State(self.init_state, self) - def apply_steps_from_state(self, state, layout_rewrite=NoRewrite): + def apply_steps_from_state(self, state, layout_rewrite=LayoutRewriteOption.NO_REWRITE): """ Apply the history transform steps from a State to get a TVM schedule. @@ -101,7 +110,7 @@ def apply_steps_from_state(self, state, layout_rewrite=NoRewrite): state : Union[State, StateObject] The state from which we get transform steps. - layout_rewrite: Bool + layout_rewrite: LayoutRewriteOption = NoRewrite Rewrite the layout of placeholders specified by "layout_free_placeholders" attr to make it most friendly for the generated schedule to read from. 
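With the enum now living in its own `LayoutRewriteOption` class, the old `ComputeDAG.RewriteForPreTransformed` spelling becomes `LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED`, and similarly for the other options. A small sketch of replaying a measured state with layout rewrite enabled (the log file name is an assumption for illustration):

    from tvm import auto_scheduler
    from tvm.auto_scheduler import LayoutRewriteOption

    # Load the best record for a previously tuned task and rebuild its schedule,
    # inserting layout transformation stages for the input placeholders.
    inp, _ = auto_scheduler.load_best_record("matmul.json")
    sch, args = inp.task.compute_dag.apply_steps_from_state(
        inp.state, layout_rewrite=LayoutRewriteOption.INSERT_TRANSFORM_STAGE
    )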
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index b2826518d8c8..7e4f14933819 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -53,8 +53,7 @@ make_traceback_info, request_remote, ) -from .compute_dag import ComputeDAG -from .search_task import SearchTask +from .compute_dag import LayoutRewriteOption from .workload_registry import ( serialize_workload_registry_entry, deserialize_workload_registry_entry, @@ -178,13 +177,15 @@ def recover_measure_input(inp, rebuild_state=False): new_input: MeasureInput The fully recovered MeasureInput with all fields rebuilt. """ + # pylint: disable=import-outside-toplevel + from .search_task import SearchTask # lazily import to avoid recursive dependency + task = inp.task new_task = SearchTask( - ComputeDAG(task.workload_key), - task.workload_key, - task.target, - task.target_host, - task.hardware_params, + workload_key=task.workload_key, + target=task.target, + target_host=task.target_host, + hardware_params=task.hardware_params, ) if rebuild_state: @@ -521,6 +522,7 @@ def __del__(self): # Close the tracker and server before exit self.tracker.terminate() self.server.terminate() + time.sleep(0.5) class MeasureErrorNo(object): @@ -549,7 +551,7 @@ def _timed_func(inp_serialized, build_func, verbose): try: sch, args = task.compute_dag.apply_steps_from_state( - inp.state, layout_rewrite=ComputeDAG.RewriteForPreTransformed + inp.state, layout_rewrite=LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED ) # pylint: disable=broad-except except Exception: diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py index 2569f3984f3c..d6fea5c48598 100644 --- a/python/tvm/auto_scheduler/measure_record.py +++ b/python/tvm/auto_scheduler/measure_record.py @@ -137,7 +137,7 @@ def save_records(filename, inputs, results): _ffi_api.SaveRecords(filename, inputs, results) -def load_best(filename, workload_key=None, target=None): +def load_best_record(filename, workload_key=None, target=None): """Return the best measurement pair form a log file. This may return none results if there is no legal measure pair with the specified workload_key/target found from the log file. diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 5a197910e334..4c493d1d9366 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -120,7 +120,14 @@ def extract_tasks( weights = [] for wkl_key, ccache_key in env.wkl_key_to_ccache_key.items(): dag = ComputeDAG(wkl_key) - tasks.append(SearchTask(dag, wkl_key, target, target_host, hardware_params)) + tasks.append( + SearchTask( + workload_key=wkl_key, + target=target, + target_host=target_host, + hardware_params=hardware_params, + ) + ) weights.append(use_count_dict[ccache_key] + 1) # clean the cached lowering results diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py index f2dadccbf891..31698d0356de 100644 --- a/python/tvm/auto_scheduler/search_task.py +++ b/python/tvm/auto_scheduler/search_task.py @@ -22,8 +22,139 @@ import tvm._ffi from tvm.runtime import Object -from . 
import _ffi_api +from tvm.driver.build_module import build +from tvm.target import Target +from .measure import LocalBuilder, LocalRunner +from .measure_record import load_best_record +from .workload_registry import make_workload_key +from .compute_dag import ComputeDAG, LayoutRewriteOption +from .cost_model import XGBModel +from .search_policy import SketchPolicy from .workload_registry import register_workload_tensors +from . import _ffi_api + + +@tvm._ffi.register_object("auto_scheduler.HardwareParams") +class HardwareParams(Object): + """The parameters of target hardware used to guide the search policy + TODO(jcf94): This is considered to be merged with the new Target specification: + https://discuss.tvm.apache.org/t/rfc-tvm-target-specification/6844 + Parameters + ---------- + num_cores : int + The number of device cores. + vector_unit_bytes : int + The width of vector units in bytes. + cache_line_bytes : int + The size of cache line in bytes. + max_shared_memory_per_block : int + The max shared memory per block in bytes. + max_registers_per_block : int + The max number of register per block. + max_threads_per_block : int + The max number of threads per block. + max_vthread_extent : int + The max vthread extent. + warp_size : int + The thread numbers of a warp. + """ + + def __init__( + self, + num_cores, + vector_unit_bytes, + cache_line_bytes, + max_shared_memory_per_block, + max_registers_per_block, + max_threads_per_block, + max_vthread_extent, + warp_size, + ): + self.__init_handle_by_constructor__( + _ffi_api.HardwareParams, + num_cores, + vector_unit_bytes, + cache_line_bytes, + max_shared_memory_per_block, + max_registers_per_block, + max_threads_per_block, + max_vthread_extent, + warp_size, + ) + + +@tvm._ffi.register_object("auto_scheduler.TuningOptions") +class TuningOptions(Object): + """This controls the options of performance tuning. + + Parameters + ---------- + num_measure_trials: int = 0 + The number of measurement trials. + The search policy measures `num_measure_trials` schedules in total and returns the best one + among them. + With `num_measure_trials` == 0, the policy will do the schedule search but won't involve + measurement. This can be used to get a runnable schedule quickly without auto-tuning. + early_stopping: Optional[int] + Stop the tuning early if getting no improvement after n measurements. + num_measures_per_round: int = 64 + The number of schedules to be measured at each search round. + The whole schedule search process will try a total number of `num_measure_trials` in several + rounds. + verbose: int = 1 + Verbosity level. 0 for silent, 1 to output information during schedule search. + builder: Union[ProgramBuilder, str] = 'local' + ProgramBuilder which builds the program. + runner: Union[ProgramRunner, str] = 'local' + ProgramRunner which runs the program and measures time costs. + measure_callbacks: Optional[List[MeasureCallback]] + Callback functions called after each measurement. + Candidates: + - auto_scheduler.RecordToFile + """ + + def __init__( + self, + num_measure_trials=0, + early_stopping=None, + num_measures_per_round=64, + verbose=1, + builder="local", + runner="local", + measure_callbacks=None, + ): + if isinstance(builder, str): + if builder == "local": + builder = LocalBuilder() + else: + raise ValueError("Invalid builder: " + builder) + elif not isinstance(builder, tvm.auto_scheduler.measure.ProgramBuilder): + raise ValueError( + "Invalid builder: " + + builder + + " . TuningOptions expects a ProgramBuilder or string." 
+ ) + + if isinstance(runner, str): + if runner == "local": + runner = LocalRunner() + else: + raise ValueError("Invalid runner: " + runner) + elif not isinstance(runner, tvm.auto_scheduler.measure.ProgramRunner): + raise ValueError( + "Invalid runner: " + runner + " . TuningOptions expects a ProgramRunner or string." + ) + + self.__init_handle_by_constructor__( + _ffi_api.TuningOptions, + num_measure_trials, + early_stopping or -1, + num_measures_per_round, + verbose, + builder, + runner, + measure_callbacks, + ) @tvm._ffi.register_object("auto_scheduler.SearchTask") @@ -32,7 +163,12 @@ class SearchTask(Object): Parameters ---------- - dag : ComputeDAG + func : Union[Function, str] + The function that returns the compute declaration Tensors. + Can be the a function or the function name. + args : Union[Tuple[Any, ...], List[Any]] + The args of the function. + compute_dag : ComputeDAG The ComputeDAG for the corresponding compute declaration. workload_key : str The workload key for the corresponding compute declaration. @@ -42,18 +178,123 @@ class SearchTask(Object): The target host device of this search task. hardware_params : Optional[HardwareParams] Hardware parameters used in this search task. + + Examples + -------- + .. code-block:: python + + # We support two ways to create a search task + + # Way 1: create a task by a workload generation function. + # The `workload_func` is a function decorated by @auto_scheduler.register_workload + task = SearchTask(func=workload_func, args=args, target=target) + + # Way 2: create a task by a workload_key. + # The `workload_key` is a string, which can be either a hash key or a json-serialized + # tuple(func, args). + task = SearchTask(workload_key=workload_key, target=target) """ - def __init__(self, dag, workload_key, target, target_host=None, hardware_params=None): - self.dag = dag + def __init__( + self, + func=None, + args=None, + compute_dag=None, + workload_key=None, + target=None, + target_host=None, + hardware_params=None, + ): + assert ( + func is not None or workload_key is not None + ), "Either a workload generation function or a workload key should be provided" + + if func is not None: + workload_key = make_workload_key(func, args) + if compute_dag is None: + compute_dag = ComputeDAG(workload_key) + + assert target is not None, "Must specify a target." + if isinstance(target, str): + target = Target(target) + if isinstance(target_host, str): + target_host = Target(target_host) + + self.dag = compute_dag self.workload_key = workload_key self.target = target self.target_host = target_host self.hardware_params = hardware_params self.__init_handle_by_constructor__( - _ffi_api.SearchTask, dag, workload_key, target, target_host, hardware_params + _ffi_api.SearchTask, compute_dag, workload_key, target, target_host, hardware_params ) + def tune(self, tuning_options, search_policy=None): + """Run auto scheduling search for a task + + Parameters + ---------- + tuning_options : TuningOptions + Tuning and measurement options. + search_policy : Optional[SearchPolicy] + The search policy to be used for schedule search. + """ + if search_policy is None: + cost_model = XGBModel() + search_policy = SketchPolicy(self, cost_model) + + _ffi_api.AutoSchedule(search_policy, tuning_options) + + def apply_best(self, log_file, layout_rewrite_option=None): + """Apply the history best from a log file and return the schedule. + + Parameters + ---------- + log_file : str + The name of the log file. 
+ layout_rewrite_option : Optional[LayoutRewriteOption] + The layout rewrite option. + + Returns + ------- + A `te.Schedule` and the a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`. + """ + inp, _ = load_best_record(log_file, self.workload_key) + + if layout_rewrite_option is None: + layout_rewrite_option = LayoutRewriteOption.NO_REWRITE + if self.target.kind.name == "llvm": + layout_rewrite_option = LayoutRewriteOption.INSERT_TRANSFORM_STAGE + sch, args = self.compute_dag.apply_steps_from_state(inp.state, layout_rewrite_option) + return sch, args + + def print_best(self, log_file, print_mode="schedule"): + """Print the best schedule as python schedule API code or CUDA source code. + + Parameters + ---------- + log_file : str + The name of the log file + print_mode: str + if "schedule", print the best schedule as python schedule API code. + if "cuda", print the best schedule as CUDA source code. + + Returns + ------- + code: str + The best schedule code in python API or CUDA source code + """ + inp, _ = load_best_record(log_file, self.workload_key) + + if print_mode == "schedule": + return self.compute_dag.print_python_code_from_state(inp.state) + if print_mode == "cuda": + assert self.target.kind.name == "cuda" + sch, args = self.compute_dag.apply_steps_from_state(inp.state) + func = build(sch, args, "cuda") + return func.imported_modules[0].get_source() + raise ValueError("Invalid print_mode: %s" % print_mode) + def __getstate__(self): return { "dag": self.dag, @@ -90,3 +331,56 @@ def __setstate__(self, state): self.target_host, self.hardware_params, ) + + +def create_task(func, args, target, target_host=None, hardware_params=None): + """THIS API IS DEPRECATED. + + Create a search task. + + Parameters + ---------- + func : Union[Function, str] + The function that returns the compute declaration Tensors. + Can be the a function or the function name. + args : Union[Tuple[Any, ...], List[Any]] + The args of the function. + target : Union[tvm.target.Target, str] + The target device of this search task. + target_host : Optional[Union[tvm.target.Target, str]] + The target host device of this search task. + hardware_params : Optional[HardwareParams] + Hardware parameters used in this search task. + + Returns + ------- + SearchTask: the created task + """ + raise ValueError( + 'The API "auto_scheduler.create_task" is deprecated.' + "See https://github.com/apache/tvm/pull/7028 for the upgrade guide" + ) + + +def auto_schedule(task, search_policy=None, tuning_options=TuningOptions()): + """THIS API IS DEPRECATED. + + Run auto scheduling search for a task. + + Parameters + ---------- + task : SearchTask + The SearchTask for the computation declaration. + search_policy : Optional[SearchPolicy] + The search policy to be used for schedule search. + tuning_options : Optional[TuningOptions] + Tuning and measurement options. + + Returns + ------- + A `te.Schedule` and the a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`. + """ + raise ValueError( + 'The API "auto_scheduler.create_task" is deprecated.' + "See https://github.com/apache/tvm/pull/7028 for the upgrade guide." + ) diff --git a/src/auto_scheduler/utils.h b/src/auto_scheduler/utils.h index bc29a3761129..9fc5a1dd8f22 100755 --- a/src/auto_scheduler/utils.h +++ b/src/auto_scheduler/utils.h @@ -192,7 +192,7 @@ inline bool StrEndsWith(const String& a, const String& b) { /*! 
\brief Get an int value from an Expr */ inline int64_t GetIntImm(const PrimExpr& expr) { auto pint = expr.as(); - ICHECK(pint != nullptr); + ICHECK(pint != nullptr) << "Expect an IntImm but get " << expr; return pint->value; } diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc index 2d2a29943383..aa400997e2b3 100644 --- a/src/tir/ir/expr.cc +++ b/src/tir/ir/expr.cc @@ -33,18 +33,19 @@ namespace tvm { namespace tir { -#define TVM_DEFINE_BINOP_CONSTRUCTOR(Name) \ - Name::Name(PrimExpr a, PrimExpr b, Span span) { \ - using T = Name::ContainerType; \ - ICHECK(a.defined()) << "ValueError: a is undefined\n"; \ - ICHECK(b.defined()) << "ValueError: b is undefined\n"; \ - ICHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types\n"; \ - ObjectPtr node = make_object(); \ - node->dtype = a.dtype(); \ - node->a = std::move(a); \ - node->b = std::move(b); \ - node->span = std::move(span); \ - data_ = std::move(node); \ +#define TVM_DEFINE_BINOP_CONSTRUCTOR(Name) \ + Name::Name(PrimExpr a, PrimExpr b, Span span) { \ + using T = Name::ContainerType; \ + ICHECK(a.defined()) << "ValueError: a is undefined\n"; \ + ICHECK(b.defined()) << "ValueError: b is undefined\n"; \ + ICHECK(a.dtype() == b.dtype()) \ + << "TypeError: mismatched types. " << a.dtype() << " vs. " << b.dtype() << "\n"; \ + ObjectPtr node = make_object(); \ + node->dtype = a.dtype(); \ + node->a = std::move(a); \ + node->b = std::move(b); \ + node->span = std::move(span); \ + data_ = std::move(node); \ } #define TVM_DEFINE_CMPOP_CONSTRUCTOR(Name) \ diff --git a/tests/python/unittest/test_auto_scheduler_compute_dag.py b/tests/python/unittest/test_auto_scheduler_compute_dag.py index 1356154cacd6..859964ff51ef 100644 --- a/tests/python/unittest/test_auto_scheduler_compute_dag.py +++ b/tests/python/unittest/test_auto_scheduler_compute_dag.py @@ -121,9 +121,9 @@ def test_stage_order(): # Serialize and deserialize the search task. 
task = auto_scheduler.SearchTask( - dag, - json.dumps(("test-key",)), - tvm.target.Target("llvm"), + compute_dag=dag, + workload_key=json.dumps(("test-key",)), + target=tvm.target.Target("llvm"), hardware_params=auto_scheduler.HardwareParams(100000, 16, 64, 0, 0, 0, 0, 0), ) diff --git a/tests/python/unittest/test_auto_scheduler_cost_model.py b/tests/python/unittest/test_auto_scheduler_cost_model.py index 5ed736a5b8cb..36360da45c8d 100644 --- a/tests/python/unittest/test_auto_scheduler_cost_model.py +++ b/tests/python/unittest/test_auto_scheduler_cost_model.py @@ -30,7 +30,7 @@ def get_sample_records(number): """Generate a list of random MeasureInput and MeasureResult pairs""" N = 128 - task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), "llvm") + task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(N, N, N), target="llvm") policy = auto_scheduler.SketchPolicy(task, verbose=0) states = policy.sample_initial_population()[:number] diff --git a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py index 70bea3afd849..e28219d0979f 100644 --- a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py +++ b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py @@ -48,9 +48,9 @@ def predict(self, task, states): scores.append(1 if self.is_good_state(state) else 0) return scores - workload_key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (10, 10, 4)) - dag = auto_scheduler.ComputeDAG(workload_key) - task = auto_scheduler.SearchTask(dag, workload_key, tvm.target.Target("llvm")) + task = auto_scheduler.SearchTask( + func=matmul_auto_scheduler_test, args=(10, 10, 4), target=tvm.target.Target("llvm") + ) policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0) states = policy.sample_initial_population()[:50] @@ -92,7 +92,9 @@ def predict(self, task, states): scores.append(1 if self.is_good_state(state) else 0) return scores - task = auto_scheduler.create_task(matmul_auto_scheduler_test, (1024, 1024, 1024), "llvm") + task = auto_scheduler.SearchTask( + func=matmul_auto_scheduler_test, args=(1024, 1024, 1024), target="llvm" + ) policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0) found = False diff --git a/tests/python/unittest/test_auto_scheduler_feature.py b/tests/python/unittest/test_auto_scheduler_feature.py index 7412dbc1f8a4..b52b53863ee4 100644 --- a/tests/python/unittest/test_auto_scheduler_feature.py +++ b/tests/python/unittest/test_auto_scheduler_feature.py @@ -45,7 +45,7 @@ def test_cpu_matmul(): s.unroll(C, k) target = tvm.target.Target("llvm") - task = auto_scheduler.SearchTask(dag, "test", target) + task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target) names = auto_scheduler.feature.get_per_store_feature_names() fea = auto_scheduler.feature.get_per_store_features_from_states([s], task)[0] @@ -103,7 +103,7 @@ def fusion_test(N, M): s.compute_at(1, 2, s.stages[2].iters[1]) target = tvm.target.Target("llvm") - task = auto_scheduler.SearchTask(dag, "test", target) + task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target) names = auto_scheduler.feature.get_per_store_feature_names() fea = auto_scheduler.feature.get_per_store_features_from_states([s], task)[0] @@ -147,18 +147,15 @@ def test_gpu_feature(): inputs, results = auto_scheduler.RecordReader(f.name).read_lines() inp = inputs[0] - dag = 
auto_scheduler.ComputeDAG(inp.task.workload_key) task = auto_scheduler.SearchTask( - dag, - inp.task.workload_key, - inp.task.target, - None, - auto_scheduler.HardwareParams( + workload_key=inp.task.workload_key, + target=inp.task.target, + hardware_params=auto_scheduler.HardwareParams( 100000, 16, 64, 1 << 30, 1 << 30, 1 << 30, 1 << 30, 1 << 30 ), ) - state = dag.infer_bound_from_state(inputs[0].state) + state = task.dag.infer_bound_from_state(inputs[0].state) fea = auto_scheduler.feature.get_per_store_features_from_states([state], task)[0] names = auto_scheduler.feature.get_per_store_feature_names() diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py index 9d9704df0524..6ca56bde7c60 100644 --- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py +++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py @@ -35,7 +35,7 @@ def test_apply_steps_with_layout_rewrite(): assert bufs[1].shape[0] == 512 assert bufs[1].shape[1] == 512 _, bufs = dag.apply_steps_from_state( - s, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.RewriteForPreTransformed + s, layout_rewrite=auto_scheduler.LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED ) assert bufs[1].shape[0] == 4 assert bufs[1].shape[1] == 8 @@ -43,7 +43,7 @@ def test_apply_steps_with_layout_rewrite(): assert bufs[1].shape[3] == 4 assert bufs[1].shape[4] == 512 _, bufs = dag.apply_steps_from_state( - s, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.InsertTransformStage + s, layout_rewrite=auto_scheduler.LayoutRewriteOption.INSERT_TRANSFORM_STAGE ) assert bufs[1].shape[0] == 512 assert bufs[1].shape[1] == 512 @@ -53,7 +53,7 @@ def test_apply_steps_with_layout_rewrite(): def test_correctness_layout_rewrite_rewrite_for_preTransformed(): N = 128 target = tvm.target.Target("llvm") - task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), target) + task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(N, N, N), target=target) dag = task.compute_dag with tempfile.NamedTemporaryFile() as fp: @@ -65,13 +65,13 @@ def test_correctness_layout_rewrite_rewrite_for_preTransformed(): tuning_options = auto_scheduler.TuningOptions( num_measure_trials=2, runner=measure_ctx.runner, - verbose=1, + verbose=2, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) - auto_scheduler.auto_schedule(task, search_policy, tuning_options) - inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target) + task.tune(tuning_options, search_policy=search_policy) + inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key, target) s, bufs = dag.apply_steps_from_state( - inp.state, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.RewriteForPreTransformed + inp.state, layout_rewrite=auto_scheduler.LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED ) s_ref, bufs_ref = dag.apply_steps_from_state(inp.state) np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs] @@ -123,7 +123,7 @@ def test_correctness_layout_rewrite_rewrite_for_preTransformed(): def test_correctness_layout_rewrite_insert_transform_stage(): N = 128 target = tvm.target.Target("llvm") - task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), target) + task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(N, N, N), target=target) dag = task.compute_dag with tempfile.NamedTemporaryFile() as fp: @@ -138,10 +138,10 @@ def test_correctness_layout_rewrite_insert_transform_stage(): verbose=1, 
measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) - auto_scheduler.auto_schedule(task, search_policy, tuning_options) - inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target) + task.tune(tuning_options, search_policy=search_policy) + inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key, target) s, bufs = dag.apply_steps_from_state( - inp.state, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.InsertTransformStage + inp.state, layout_rewrite=auto_scheduler.LayoutRewriteOption.INSERT_TRANSFORM_STAGE ) s_ref, bufs_ref = dag.apply_steps_from_state(inp.state) diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py index 80ce98d0b1c1..b214d9c990b9 100644 --- a/tests/python/unittest/test_auto_scheduler_measure.py +++ b/tests/python/unittest/test_auto_scheduler_measure.py @@ -29,7 +29,7 @@ def record_common(dag, s): target = tvm.target.Target("llvm") - task = auto_scheduler.SearchTask(dag, "test", target) + task = auto_scheduler.SearchTask(compute_dag=dag, workload_key="test", target=target) inp = auto_scheduler.measure.MeasureInput(task, s) res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1) @@ -169,7 +169,9 @@ def test_record_pragma_storage_align_rfactor(): def test_recover_measure_input(): - task = auto_scheduler.create_task(matmul_auto_scheduler_test, [512, 512, 512], "llvm") + task = auto_scheduler.SearchTask( + func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm" + ) inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state) res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1) @@ -194,7 +196,9 @@ def test_measure_local_builder_runner(): if not tvm.testing.device_enabled("llvm"): return - task = auto_scheduler.create_task(matmul_auto_scheduler_test, [512, 512, 512], "llvm") + task = auto_scheduler.SearchTask( + func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm" + ) for enable_cpu_cache_flush in [True, False]: minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state) @@ -213,7 +217,9 @@ def test_measure_local_builder_rpc_runner(): if not tvm.testing.device_enabled("llvm"): return - task = auto_scheduler.create_task(matmul_auto_scheduler_test, [512, 512, 512], "llvm") + task = auto_scheduler.SearchTask( + func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm" + ) for enable_cpu_cache_flush in [True, False]: minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state) @@ -251,5 +257,4 @@ def test_measure_local_builder_rpc_runner_spawn(): test_record_pragma_storage_align_rfactor() test_recover_measure_input() test_measure_local_builder_runner() - test_measure_local_builder_runner_spawn() test_measure_local_builder_rpc_runner() diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py index a4f3c4e06843..1bb74497898c 100644 --- a/tests/python/unittest/test_auto_scheduler_search_policy.py +++ b/tests/python/unittest/test_auto_scheduler_search_policy.py @@ -45,7 +45,7 @@ def search_common( random.seed(seed) N = 128 target = tvm.target.Target(target) - task = auto_scheduler.create_task(workload, (N, N, N), target) + task = auto_scheduler.SearchTask(func=workload, args=(N, N, N), target=target) with tempfile.NamedTemporaryFile() as fp: log_file = fp.name @@ -70,11 +70,11 @@ def search_common( verbose=2, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) - sch, args = 
auto_scheduler.auto_schedule(task, search_policy, tuning_options)
-        inp, res = auto_scheduler.load_best(log_file, task.workload_key, target)
+        task.tune(tuning_options=tuning_options, search_policy=search_policy)
+        sch, args = task.apply_best(log_file)

         print("==== Python Code ====")
-        print(task.compute_dag.print_python_code_from_state(inp.state))
+        print(task.print_best(log_file))

         try:
             print("==== Lowered Stmt ====")
diff --git a/tests/python/unittest/test_auto_scheduler_sketch_generation.py b/tests/python/unittest/test_auto_scheduler_sketch_generation.py
index 1fef31550e67..d060243a45d4 100644
--- a/tests/python/unittest/test_auto_scheduler_sketch_generation.py
+++ b/tests/python/unittest/test_auto_scheduler_sketch_generation.py
@@ -37,7 +37,7 @@


 def generate_sketches(workload_func, args, target, print_for_debug=False):
-    task = auto_scheduler.create_task(workload_func, args, tvm.target.Target(target))
+    task = auto_scheduler.SearchTask(func=workload_func, args=args, target=target)
     policy = auto_scheduler.SketchPolicy(task, verbose=0)
     return policy.generate_sketches(print_for_debug)

diff --git a/tests/python/unittest/test_auto_scheduler_task_scheduler.py b/tests/python/unittest/test_auto_scheduler_task_scheduler.py
index 680a783e25f4..032933f3f75f 100644
--- a/tests/python/unittest/test_auto_scheduler_task_scheduler.py
+++ b/tests/python/unittest/test_auto_scheduler_task_scheduler.py
@@ -32,7 +32,11 @@ def test_task_scheduler_round_robin():
     tasks = []
     for n in [2, 4, 8]:
-        tasks.append(auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n), "llvm"))
+        tasks.append(
+            auto_scheduler.SearchTask(
+                func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm"
+            )
+        )

     with tempfile.NamedTemporaryFile() as fp:
         log_file = fp.name
@@ -90,7 +94,11 @@ def test_task_scheduler_round_robin_spawn():
 def test_task_scheduler_gradient():
     tasks = []
     for n in [2, 4]:
-        tasks.append(auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n), "llvm"))
+        tasks.append(
+            auto_scheduler.SearchTask(
+                func=matmul_auto_scheduler_test, args=(n, n, n), target="llvm"
+            )
+        )

     def objective_func(costs):
         return costs[0]
diff --git a/tutorials/auto_scheduler/ci_logs/matmul.json b/tutorials/auto_scheduler/ci_logs/matmul.json
index 827cfc9a6dbb..bc5d6f0ba70d 100644
--- a/tutorials/auto_scheduler/ci_logs/matmul.json
+++ b/tutorials/auto_scheduler/ci_logs/matmul.json
@@ -1,2 +1,2 @@
 # Keep a valid schedule for demonstration. This is used to prevent flaky errors in CI.
-{"i": [["[\"matmul_add\", 128, 128, 128, \"float32\"]", "llvm -keys=cpu"], [[], [["SP", 2, 0, 128, [4, 2, 4], 1], ["SP", 2, 4, 128, [1, 32, 2], 1], ["SP", 2, 8, 128, [2], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FSP", 4, 0, 0, 1], ["FSP", 4, 2, 1, 1], ["RE", 4, [0, 2, 1, 3]], ["CA", 2, 4, 1], ["FU", 4, [0, 1]], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$0"], ["AN", 2, 9, 2]]]], "r": [[5.80388e-05], 0, 0.299169, 1603402396], "v": "v0.2"} +{"i": [["[\"matmul_add\", 1024, 1024, 1024, \"float32\"]", "llvm -keys=cpu -link-params=0", [24, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 2, 0, 1024, [2, 4, 16], 1], ["SP", 2, 4, 1024, [16, 4, 16], 1], ["SP", 2, 8, 1024, [8], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$0"], ["AN", 2, 6, 2]]]], "r": [[0.028777], 0, 0.613435, 1607038574], "v": "v0.3"} diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index 9aeea8487444..103ceb49dced 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -71,9 +71,12 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): # Use the last layer in ResNet-50 N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) -task = auto_scheduler.create_task(conv2d_layer, (N, H, W, CO, CI, KH, KW, strides, padding), target) +task = auto_scheduler.SearchTask( + func=conv2d_layer, args=(N, H, W, CO, CI, KH, KW, strides, padding), target=target +) # Inspect the computational graph +print("Computational DAG:") print(task.compute_dag) ###################################################################### @@ -109,11 +112,15 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): # ^^^^^^^^^^^^^^ # Now we get all inputs ready. Pretty simple, isn't it? # We can kick off the search and let the auto-scheduler do its magic. -# After some measurement trials, it will return the best schedule it found. +# After some measurement trials, we can load the best schedule from the log +# file and apply it. -sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option) +# Run auto-tuning (search) +task.tune(tune_option) +# Apply the best schedule +sch, args = task.apply_best(log_file) -# Kill the process for measurement +# Kill the measurement process del measure_ctx ###################################################################### @@ -121,6 +128,7 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): # The auto-scheduler correctly performs optimizations including multi-level tiling, # cooperative fetching, unrolling and operator fusion. +print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True)) ###################################################################### @@ -157,26 +165,20 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): ###################################################################### # Using the record file # ^^^^^^^^^^^^^^^^^^^^^ -# During the search, all measuremnt records are dumpped into the record +# During the search, all measurement records are dumped into the record # file "conv2d.json". The measurement records can be used to re-apply search results, # resume the search, and perform other analyses. 
###################################################################### # Here is an example where we load the best schedule from a file, -# print the equivalent python schedule API, and build the binary again. +# print the equivalent python schedule API and CUDA source code. +# They can be used for debugging and learning the behavior of the auto-scheduler. -# Load the measuremnt record for the best schedule -inp, res = auto_scheduler.load_best(log_file, task.workload_key) - -# Print equivalent python schedule API. This can be used for debugging and -# learning the behavior of the auto-scheduler. print("Equivalent python schedule:") -print(task.compute_dag.print_python_code_from_state(inp.state)) +print(task.print_best(log_file, print_mode="schedule")) -# Rebuild the binary. This shows how you can apply the best schedule from a -# log file without reruning the search again. -sch, args = task.compute_dag.apply_steps_from_state(inp.state) -func = tvm.build(sch, args, target) +print("CUDA source code:") +print(task.print_best(log_file, print_mode="cuda")) ###################################################################### # A more complicated example is to resume the search. @@ -195,7 +197,7 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): runner=measure_ctx.runner, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) -sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option) +task.tune(tune_option, search_policy=search_policy) # Kill the measurement process del measure_ctx diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 6d756299c5d8..bdd14bea1dfd 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -56,7 +56,12 @@ def matmul_add(N, L, M, dtype): C = te.placeholder((N, M), name="C", dtype=dtype) k = te.reduce_axis((0, L), name="k") - matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="matmul") + matmul = te.compute( + (N, M), + lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), + name="matmul", + attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B + ) out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out") return [A, B, C, out] @@ -65,16 +70,18 @@ def matmul_add(N, L, M, dtype): ###################################################################### # Create the search task # ^^^^^^^^^^^^^^^^^^^^^^ -# We then create a search task with N=L=M=128 and dtype="float32" +# We then create a search task with N=L=M=1024 and dtype="float32" # If your machine supports avx instructions, you can # # - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 # - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 target = tvm.target.Target("llvm") -task = tvm.auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target) +N = L = M = 1024 +task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target) # Inspect the computational graph +print("Computational DAG:") print(task.compute_dag) ###################################################################### @@ -100,15 +107,20 @@ def matmul_add(N, L, M, dtype): # ^^^^^^^^^^^^^^ # Now we get all inputs ready. Pretty simple, isn't it? # We can kick off the search and let the auto-scheduler do its magic. -# After some measurement trials, it will return the best schedule it found. 
+# After some measurement trials, we can load the best schedule from the log +# file and apply it. -sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option) +# Run auto-tuning (search) +task.tune(tune_option) +# Apply the best schedule +sch, args = task.apply_best(log_file) ###################################################################### # We can lower the schedule to see the IR after auto-scheduling. # The auto-scheduler correctly performs optimizations including multi-level tiling, # parallelization, vectorization, unrolling and operator fusion. +print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True)) ###################################################################### @@ -116,10 +128,10 @@ def matmul_add(N, L, M, dtype): # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # We build the binary and check its correctness and performance. -func = tvm.build(sch, args) -a_np = np.random.uniform(size=(128, 128)).astype(np.float32) -b_np = np.random.uniform(size=(128, 128)).astype(np.float32) -c_np = np.random.uniform(size=(128, 128)).astype(np.float32) +func = tvm.build(sch, args, target) +a_np = np.random.uniform(size=(N, L)).astype(np.float32) +b_np = np.random.uniform(size=(L, M)).astype(np.float32) +c_np = np.random.uniform(size=(N, M)).astype(np.float32) out_np = a_np.dot(b_np) + c_np ctx = tvm.cpu() @@ -143,26 +155,17 @@ def matmul_add(N, L, M, dtype): ###################################################################### # Using the record file # ^^^^^^^^^^^^^^^^^^^^^ -# During the search, all measuremnt records are dumpped into the record +# During the search, all measurement records are dumped into the record # file "matmul.json". The measurement records can be used to re-apply search results, # resume the search, and perform other analyses. ###################################################################### # Here is an example where we load the best schedule from a file, -# print the equivalent python schedule API, and build the binary again. - -# Load the measuremnt record for the best schedule -inp, res = auto_scheduler.load_best(log_file, task.workload_key) +# and print the equivalent python schedule API. This can be used for +# debugging and learning the behavior of the auto-scheduler. -# Print equivalent python schedule API. This can be used for debugging and -# learning the behavior of the auto-scheduler. print("Equivalent python schedule:") -print(task.compute_dag.print_python_code_from_state(inp.state)) - -# Rebuild the binary. This shows how you can apply the best schedule from a -# log file without reruning the search again. -sch, args = task.compute_dag.apply_steps_from_state(inp.state) -func = tvm.build(sch, args) +print(task.print_best(log_file)) ###################################################################### # A more complicated example is to resume the search. @@ -182,7 +185,7 @@ def resume_search(task, log_file_name): tune_option = auto_scheduler.TuningOptions( num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file_name)] ) - sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option) + task.tune(tune_option, search_policy=search_policy) # resume_search(task, log_file) diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 90f531f4f52e..03be05abd363 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -299,9 +299,10 @@ def run_tuning(): # 1. 
During the tuning, the auto-scheduler needs to compile many programs and
 #    extract features from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. If you have multiple GPUs, you can use all of them for measurements to
+# 2. If you have multiple target GPUs, you can use all of them for measurements to
 #    parallelize the measurements. Check this :ref:`section `
 #    to learn how to use the RPC Tracker and RPC Server.
 #    To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions`
 #    with :any:`auto_scheduler.RPCRunner`.
-#
+# 3. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+#    to distill the large log file and only save the best useful records.
diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py
index 8dd9230c5cce..aba75b253e0c 100644
--- a/tutorials/auto_scheduler/tune_network_x86.py
+++ b/tutorials/auto_scheduler/tune_network_x86.py
@@ -303,4 +303,5 @@ def run_tuning():
 #    to learn how to use the RPC Tracker and RPC Server.
 #    To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions`
 #    with :any:`auto_scheduler.RPCRunner`.
-#
+# 3. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+#    to distill the large log file and only save the best useful records.

From 21df600b918b4e973a8128b161c7a6dbd15c6dbf Mon Sep 17 00:00:00 2001
From: Thomas Viehmann
Date: Fri, 4 Dec 2020 15:02:25 +0100
Subject: [PATCH 255/258] Save PyTorch frontend state in object (#7023)

While the functional approach is pretty neat, we ended up having global
state (default frontend, dtype), and more of it is coming soon (caching
of inferred types, see #6900). To avoid passing this state around, the
op conversion is moved into a class whose instances hold the state.
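For illustration, a minimal self-contained sketch of the pattern (the
names below are hypothetical stand-ins, not the real frontend API):
converters that used to close over module-level globals become methods
on an object that carries the state.

.. code-block:: python

    class OpConverter:
        """Holds conversion state that previously lived at module level."""

        def __init__(self, default_dtype):
            self.default_dtype = default_dtype  # state is set once per instance
            # Converters are bound methods, so they can read self.* without
            # any extra arguments being threaded through.
            self.convert_map = {"ones": self.ones}

        def ones(self, inputs, input_types):
            # Fall back to the instance-wide default when no dtype is given.
            dtype = input_types[0] or self.default_dtype
            return ("full", 1.0, inputs[0], dtype)  # stand-in for a relay op

    converter = OpConverter(default_dtype="float32")
    print(converter.convert_map["ones"]([(2, 2)], [None]))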
--- python/tvm/relay/frontend/pytorch.py | 2013 ++++++++++---------------- 1 file changed, 774 insertions(+), 1239 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 38478e27ff92..4f75cf380cc6 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -17,6 +17,7 @@ # pylint: disable=import-self, too-many-lines, len-as-condition, no-else-return, unused-variable, too-many-nested-blocks # pylint: disable=consider-iterating-dictionary, invalid-name, unused-argument, unused-variable, broad-except # pylint: disable=import-outside-toplevel, simplifiable-if-expression, cell-var-from-loop, unnecessary-lambda +# pylint: disable=missing-function-docstring """PT: PyTorch frontend.""" import itertools import logging @@ -133,16 +134,24 @@ def _is_quantized_tensor(data, prelude): # operator implementation -def _elemwise(name): - def _impl(inputs, input_types): - data0, data1 = _pytorch_promote_types(inputs[:2], input_types[:2]) - return get_relay_op(name)(data0, data1) - return _impl +class PyTorchOpConverter: + """A helper class for holding PyTorch op converters.""" + + def __init__(self, prelude, default_dtype): + self.prelude = prelude + self.default_dtype = default_dtype + self.create_convert_map() + + def make_elemwise(self, name): + def elemwise(inputs, input_types): + data0, data1 = _pytorch_promote_types(inputs[:2], input_types[:2]) + return get_relay_op(name)(data0, data1) + + return elemwise -def _min_max_common(name_elemwise, name_reduce): - def _impl(inputs, input_types): + def min_max_common(self, name_elemwise, name_reduce, inputs, input_types): if len(inputs) == 1: data = _pytorch_promote_types(inputs[:1], input_types[:1]) return get_relay_op(name_reduce)(data[0]) @@ -156,38 +165,27 @@ def _impl(inputs, input_types): data0, data1 = _pytorch_promote_types(inputs[:2], input_types[:2]) return get_relay_op(name_elemwise)(data0, data1) - return _impl - + def max(self, inputs, input_types): + return self.min_max_common("maximum", "max", inputs, input_types) -def _max(): - return _min_max_common("maximum", "max") + def min(self, inputs, input_types): + return self.min_max_common("minimum", "min", inputs, input_types) + def make_unary(self, name): + def unary(inputs, input_types): + # this is just to ensure tensor input + (data,) = _pytorch_promote_types(inputs[:1], input_types[:1]) + return get_relay_op(name)(data) -def _min(): - return _min_max_common("minimum", "min") + return unary - -def _unary(name): - def _impl(inputs, input_types): - # this is just to ensure tensor input - (data,) = _pytorch_promote_types(inputs[:1], input_types[:1]) - return get_relay_op(name)(data) - - return _impl - - -def _log1p(): - def _impl(inputs, input_types): + def log1p(self, inputs, input_types): # 1_plus_log x = log(x + 1) (dtype,) = input_types one = _expr.const(1, dtype=dtype) return _op.log(inputs[0] + one) - return _impl - - -def _arange(): - def _impl(inputs, input_types): + def arange(self, inputs, input_types): def _get_value(val, dtype): # dtype is a tvm dtype if isinstance(val, _expr.Expr): @@ -235,11 +233,7 @@ def _get_type(val, inp_type): return _op.transform.arange(start=start, stop=stop, step=step, dtype=dtype) - return _impl - - -def _squeeze(): - def _impl(inputs, input_types): + def squeeze(self, inputs, input_types): data = inputs[0] if len(inputs) == 1: axis = None @@ -249,33 +243,27 @@ def _impl(inputs, input_types): return _op.transform.squeeze(data, axis) - return _impl - - -def _unsqueeze(): - 
def _impl(inputs, input_types): + def unsqueeze(self, inputs, input_types): data = inputs[0] axis = inputs[1] return _op.transform.expand_dims(data, int(axis), 1) - return _impl - - -def _concatenate(prelude): - def tensor_array_concat(lst, axis): - assert axis == 0, "Tensor array concat supported only for axis 0" - tensor_array, shape = _convert_to_tensor_array(lst, prelude) - concat_shape = (Any(),) + shape[1:] - concat = prelude.get_global_var_static("tensor_array_concat", "float32", shape) - concatenated = concat(tensor_array) - - static_tensor_array_ops = StaticTensorArrayOps(prelude, "float32", concat_shape) - static_tensor_array_ops.register() - get_tensor = prelude.get_global_var_static("tensor_get_data", "float32", concat_shape) - return get_tensor(concatenated) + def concatenate(self, inputs, input_types): + def tensor_array_concat(lst, axis): + assert axis == 0, "Tensor array concat supported only for axis 0" + tensor_array, shape = _convert_to_tensor_array(lst, self.prelude) + concat_shape = (Any(),) + shape[1:] + concat = self.prelude.get_global_var_static("tensor_array_concat", "float32", shape) + concatenated = concat(tensor_array) + + static_tensor_array_ops = StaticTensorArrayOps(self.prelude, "float32", concat_shape) + static_tensor_array_ops.register() + get_tensor = self.prelude.get_global_var_static( + "tensor_get_data", "float32", concat_shape + ) + return get_tensor(concatenated) - def _impl(inputs, input_types): data = inputs[0] axis = inputs[1] @@ -287,11 +275,7 @@ def _impl(inputs, input_types): return _op.tensor.concatenate(data, int(axis)) - return _impl - - -def _slice(): - def _impl(inputs, input_types): + def slice(self, inputs, input_types): axis_dtype = "int64" index_size_limit = 2 ** 63 - 1 data = inputs[0] @@ -391,11 +375,7 @@ def _impl(inputs, input_types): data, begin=begin, end=end, strides=strides, slice_mode="end" ) - return _impl - - -def _split(): - def _impl(inputs, input_types): + def split(self, inputs, input_types): data = inputs[0] split_size = int(inputs[1]) dim = int(inputs[2]) @@ -408,11 +388,7 @@ def _impl(inputs, input_types): return _op.split(data, indices, dim) - return _impl - - -def _split_with_sizes(): - def _impl(inputs, input_types): + def split_with_sizes(self, inputs, input_types): data = inputs[0] sections = inputs[1] dim = int(inputs[2]) @@ -430,31 +406,19 @@ def _impl(inputs, input_types): return _op.split(data, indices, dim) - return _impl - - -def _select(): - def _impl(inputs, input_types): + def select(self, inputs, input_types): data = inputs[0] dim = int(inputs[1]) index = _wrap_const(inputs[2]) return _op.transform.take(data, index, axis=dim) - return _impl - - -def _take(): - def _impl(inputs, input_types): + def take(self, inputs, input_types): data = inputs[0] indices = _op.cast(inputs[1], "int32") return _op.transform.take(data, indices=indices) - return _impl - - -def _topk(): - def _impl(inputs, input_types): + def topk(self, inputs, input_types): data = inputs[0] axis = int(inputs[2]) is_ascend = not bool(inputs[3]) @@ -473,28 +437,16 @@ def _impl(inputs, input_types): return outs[0], outs[1] - return _impl - - -def _reciprocal(): - def _impl(inputs, input_types): + def reciprocal(self, inputs, input_types): data = inputs[0] return _expr.const(1.0, dtype=input_types[0]) / data - return _impl - - -def _repeat(): - def _impl(inputs, input_types): + def repeat(self, inputs, input_types): data = inputs[0] reps = inputs[1] return _op.transform.tile(data, reps=reps) - return _impl - - -def _repeat_interleave(): - def 
_impl(inputs, input_types): + def repeat_interleave(self, inputs, input_types): data = inputs[0] if isinstance(inputs[1], int): repeats = inputs[1] @@ -507,77 +459,60 @@ def _impl(inputs, input_types): axis = 0 return _op.transform.repeat(data, repeats=repeats, axis=axis) - return _impl - - -def _addcdiv(): - def _impl(inputs, input_types): + def addcdiv(self, inputs, input_types): data, t1, t2, c = _pytorch_promote_types(inputs[:4], input_types[:4]) return data + (c * (t1 / t2)) - return _impl - - -def _addcmul(): - def _impl(inputs, input_types): + def addcmul(self, inputs, input_types): data, t1, t2, c = _pytorch_promote_types(inputs[:4], input_types[:4]) return data + (c * (t1 * t2)) - return _impl - - -def _where(): - def _impl(inputs, input_types): + def where(self, inputs, input_types): if len(inputs) == 1: - return _nonzero(False)([inputs[0], True], input_types) + return self.nonzero([inputs[0], True], input_types) cond = inputs[0] x, y = _pytorch_promote_types(inputs[1:3], input_types[1:3]) return _op.where(cond, x, y) - return _impl - - -def _full_impl(data, fill_value, dtype): - size = [] - need_reshape = False - new_shape = [] - for dim in data: - if isinstance(dim, _expr.Expr): - if isinstance(dim, _expr.Constant): - dim = int(dim.data.asnumpy()) - if isinstance(size, list): - size.append(dim) - new_shape.append(dim) - else: - dim, success = try_infer_value(dim, lambda ret: int(ret), lambda: 0) - new_shape.append(dim) - - if success: + def full_impl(self, data, fill_value, dtype): + size = [] + need_reshape = False + new_shape = [] + for dim in data: + if isinstance(dim, _expr.Expr): + if isinstance(dim, _expr.Constant): + dim = int(dim.data.asnumpy()) if isinstance(size, list): size.append(dim) + new_shape.append(dim) else: - size = None - need_reshape = True - else: - if isinstance(size, list): - size.append(dim) - new_shape.append(dim) + dim, success = try_infer_value(dim, lambda ret: int(ret), lambda: 0) + new_shape.append(dim) - if size is None: - tmp = [] - for dim in data: - tmp.append(_op.cast(_op.expand_dims(dim, axis=0), "int64")) - size = _op.concatenate(tmp, axis=0) + if success: + if isinstance(size, list): + size.append(dim) + else: + size = None + need_reshape = True + else: + if isinstance(size, list): + size.append(dim) + new_shape.append(dim) - out = _op.full(_expr.const(fill_value), size, dtype=dtype) - if need_reshape: - out = _op.reshape(out, new_shape) - return out + if size is None: + tmp = [] + for dim in data: + tmp.append(_op.cast(_op.expand_dims(dim, axis=0), "int64")) + size = _op.concatenate(tmp, axis=0) + out = _op.full(_expr.const(fill_value), size, dtype=dtype) + if need_reshape: + out = _op.reshape(out, new_shape) + return out -def _ones(default_dtype): - def _impl(inputs, input_types): + def ones(self, inputs, input_types): data = inputs[0] import torch @@ -589,14 +524,10 @@ def _impl(inputs, input_types): if inputs[1] is not None: dtype = _convert_dtype_value(inputs[1]) else: - dtype = default_dtype - return _full_impl(data, 1, dtype) - - return _impl + dtype = self.default_dtype + return self.full_impl(data, 1, dtype) - -def _ones_like(default_dtype): - def _impl(inputs, input_types): + def ones_like(self, inputs, input_types): data = inputs[0] out = _op.ones_like(data) @@ -604,17 +535,13 @@ def _impl(inputs, input_types): if inputs[1] is not None: dtype = _convert_dtype_value(inputs[1]) else: - dtype = default_dtype + dtype = self.default_dtype if input_types[0] != dtype: out = _op.cast(out, dtype) return out - return _impl - - -def 
_zeros(default_dtype): - def _impl(inputs, input_types): + def zeros(self, inputs, input_types): data = inputs[0] import torch @@ -626,14 +553,10 @@ def _impl(inputs, input_types): if inputs[1] is not None: dtype = _convert_dtype_value(inputs[1]) else: - dtype = default_dtype - return _full_impl(data, 0, dtype) - - return _impl - + dtype = self.default_dtype + return self.full_impl(data, 0, dtype) -def _zeros_like(default_dtype): - def _impl(inputs, input_types): + def zeros_like(self, inputs, input_types): data = inputs[0] out = _op.zeros_like(data) @@ -641,17 +564,13 @@ def _impl(inputs, input_types): if inputs[1] is not None: dtype = _convert_dtype_value(inputs[1]) else: - dtype = default_dtype + dtype = self.default_dtype if input_types[0] not in dtype: out = _op.cast(out, dtype) return out - return _impl - - -def _full(default_dtype): - def _impl(inputs, input_types): + def full(self, inputs, input_types): data = inputs[0] fill_value = inputs[1] @@ -665,15 +584,11 @@ def _impl(inputs, input_types): dtype = _convert_dtype_value(inputs[2]) else: # if dtype is None, torch uses a global default set by torch.set_default_tensor_type() - dtype = default_dtype - - return _full_impl(data, fill_value, dtype) + dtype = self.default_dtype - return _impl + return self.full_impl(data, fill_value, dtype) - -def _full_like(default_dtype): - def _impl(inputs, input_types): + def full_like(self, inputs, input_types): data = inputs[0] fill_value = inputs[1] @@ -684,17 +599,13 @@ def _impl(inputs, input_types): dtype = _convert_dtype_value(inputs[2]) else: # if dtype is None, torch uses a global default set by torch.set_default_tensor_type() - dtype = default_dtype + dtype = self.default_dtype if input_types[0] not in dtype: out = _op.cast(out, dtype) return out - return _impl - - -def _linspace(): - def _impl(inputs, input_types): + def linspace(self, inputs, input_types): start = inputs[0] stop = inputs[1] step = inputs[2] @@ -713,51 +624,31 @@ def _impl(inputs, input_types): return _op.transform.arange(start=start, stop=stop, step=step, dtype=dtype) - return _impl - - -def _relu(prelude): - def _impl(inputs, input_types): + def relu(self, inputs, input_types): data = inputs[0] - if _is_quantized_tensor(data, prelude): + if _is_quantized_tensor(data, self.prelude): assert len(inputs) == 3, "Input quant param not found in op inputs" input_zero_point = _expr.const(inputs[2], dtype="int32") return qnn_torch.quantized_relu(data, input_zero_point) return _op.nn.relu(data) - return _impl - - -def _prelu(): - def _impl(inputs, input_types): + def prelu(self, inputs, input_types): data = inputs[0] alpha = inputs[1] return _op.nn.prelu(data, alpha) - return _impl - - -def _leaky_relu(): - def _impl(inputs, input_types): + def leaky_relu(self, inputs, input_types): data = inputs[0] alpha = float(inputs[1]) return _op.nn.leaky_relu(data, alpha) - return _impl - - -def _elu(): - def _impl(inputs, input_types): + def elu(self, inputs, input_types): data = inputs[0] dtype = input_types[0] alpha = _expr.const(float(inputs[1]), dtype=dtype) return alpha * _op.nn.relu(_expr.const(1, dtype=dtype) - _op.exp(data)) + _op.nn.relu(data) - return _impl - - -def _celu(): - def _impl(inputs, input_types): + def celu(self, inputs, input_types): data = inputs[0] dtype = input_types[0] alpha = _expr.const(float(inputs[1]), dtype=dtype) @@ -765,11 +656,7 @@ def _impl(inputs, input_types): _expr.const(1, dtype=dtype) - _op.exp(data / alpha) ) + _op.nn.relu(data) - return _impl - - -def _gelu(): - def _impl(inputs, input_types): + 
def gelu(self, inputs, input_types): data = inputs[0] dtype = input_types[0] # gelu is data * normcdf(data) @@ -781,11 +668,7 @@ def _impl(inputs, input_types): + _op.erf(data * _expr.const(0.5 ** 0.5, dtype=dtype)) * _expr.const(0.5, dtype=dtype) ) - return _impl - - -def _selu(): - def _impl(inputs, input_types): + def selu(self, inputs, input_types): data = inputs[0] # https://pytorch.org/docs/stable/nn.html#selu dtype = input_types[0] @@ -795,65 +678,41 @@ def _impl(inputs, input_types): alpha * _op.nn.relu(_expr.const(1.0, dtype=dtype) - _op.exp(data)) + _op.nn.relu(data) ) - return _impl - - -def _log_sigmoid(): - def _impl(inputs, input_types): + def log_sigmoid(self, inputs, input_types): data = inputs[0] return _op.log(_op.tensor.sigmoid(data)) - return _impl - - -def _adaptive_avg_pool_2d(prelude): - def _impl(inputs, input_types): + def adaptive_avg_pool_2d(self, inputs, input_types): data = inputs[0] output_size = inputs[1] def func(x): return _op.nn.adaptive_avg_pool2d(x, output_size=output_size) - if _is_quantized_tensor(data, prelude): + if _is_quantized_tensor(data, self.prelude): return qnn_torch.apply_with_upcast(data, func) return func(data) - return _impl - - -def _adaptive_max_pool_2d(): - def _impl(inputs, input_types): + def adaptive_max_pool_2d(self, inputs, input_types): data = inputs[0] output_size = inputs[1] # returns dummy indices too return _op.nn.adaptive_max_pool2d(data, output_size=output_size), None - return _impl - - -def _adaptive_max_pool_3d(): - def _impl(inputs, input_types): + def adaptive_max_pool_3d(self, inputs, input_types): data = inputs[0] output_size = inputs[1] # returns dummy indices too return _op.nn.adaptive_max_pool3d(data, output_size=output_size), None - return _impl - - -def _adaptive_avg_pool_3d(): - def _impl(inputs, input_types): + def adaptive_avg_pool_3d(self, inputs, input_types): data = inputs[0] output_size = inputs[1] return _op.nn.adaptive_avg_pool3d(data, output_size=output_size) - return _impl - - -def _maxpool_2d(): - def _impl(inputs, input_types): + def maxpool_2d(self, inputs, input_types): data = inputs[0] pool_size = inputs[1] @@ -868,19 +727,11 @@ def _impl(inputs, input_types): return _op.nn.max_pool2d(data, pool_size, strides, padding, "NCHW", ceil_mode) - return _impl - - -def _maxpool_2d_with_indices(): - def _impl(inputs, input_types): + def maxpool_2d_with_indices(self, inputs, input_types): # returns dummy indices too - return _maxpool_2d()(inputs, input_types), None - - return _impl + return self.maxpool_2d(inputs, input_types), None - -def _maxpool_1d(): - def _impl(inputs, input_types): + def maxpool_1d(self, inputs, input_types): data = inputs[0] pool_size = inputs[1] @@ -895,11 +746,7 @@ def _impl(inputs, input_types): return _op.nn.max_pool1d(data, pool_size, strides, padding, "NCW", ceil_mode) - return _impl - - -def _maxpool_3d(): - def _impl(inputs, input_types): + def maxpool_3d(self, inputs, input_types): data = inputs[0] pool_size = inputs[1] @@ -915,21 +762,13 @@ def _impl(inputs, input_types): data, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode ) - return _impl - - -def _hardtanh(): - def _impl(inputs, input_types): + def hardtanh(self, inputs, input_types): a = inputs[0] tanh_min = float(inputs[1]) tanh_max = float(inputs[2]) return _op.tensor.clip(a, tanh_min, tanh_max) - return _impl - - -def _convolution(): - def _impl(inputs, input_types): + def convolution(self, inputs, input_types): # Use transpose or normal use_transpose = True if inputs[6] == 1 else False @@ 
-1018,11 +857,7 @@ def _impl(inputs, input_types): res = _op.squeeze(res, axis=[2]) return res - return _impl - - -def _softmax(): - def _impl(inputs, input_types): + def softmax(self, inputs, input_types): data = inputs[0] axis = inputs[1] if isinstance(axis, str): @@ -1030,27 +865,15 @@ def _impl(inputs, input_types): return _op.nn.softmax(data, axis=axis) - return _impl - - -def _threshold(): - def _impl(inputs, input_types): + def threshold(self, inputs, input_types): data = inputs[0] return _op.nn.relu(data) - return _impl - - -def _contiguous(): - def _impl(inputs, input_types): + def contiguous(self, inputs, input_types): data = inputs[0] return _op.tensor.copy(data) - return _impl - - -def _batch_norm(): - def _impl(inputs, input_types): + def batch_norm(self, inputs, input_types): data = inputs[0] data_type = input_types[0] @@ -1086,11 +909,7 @@ def _impl(inputs, input_types): scale=scale, )[0] - return _impl - - -def _instance_norm(): - def _impl(inputs, input_types): + def instance_norm(self, inputs, input_types): data = inputs[0] data_type = input_types[0] channels = _infer_shape(data) @@ -1114,28 +933,24 @@ def _impl(inputs, input_types): data, gamma, beta, axis=1, epsilon=epsilon, center=center, scale=scale ) - return _impl - - -def _get_dims(data): - import torch - - if isinstance(data, _expr.Expr): - dims = _infer_shape(data) - elif isinstance(data, list): - dims = data - elif isinstance(data, (torch.Tensor, np.ndarray)): - dims = data.shape - else: - msg = "Data type %s could not be parsed" % type(data) - raise AssertionError(msg) - return dims + @staticmethod + def get_dims(data): + import torch + if isinstance(data, _expr.Expr): + dims = _infer_shape(data) + elif isinstance(data, list): + dims = data + elif isinstance(data, (torch.Tensor, np.ndarray)): + dims = data.shape + else: + msg = "Data type %s could not be parsed" % type(data) + raise AssertionError(msg) + return dims -def _layer_norm(): - def _impl(inputs, input_types): + def layer_norm(self, inputs, input_types): data = inputs[0] - ndims = len(_get_dims(inputs[1])) + ndims = len(self.get_dims(inputs[1])) assert ndims == 1, "Support only normalization over last one dimension." 
return _op.nn.layer_norm( @@ -1148,11 +963,7 @@ def _impl(inputs, input_types): scale=True, ) - return _impl - - -def _group_norm(): - def _impl(inputs, input_types): + def group_norm(self, inputs, input_types): data = inputs[0] gamma = inputs[2] beta = inputs[3] @@ -1170,17 +981,13 @@ def _impl(inputs, input_types): scale=True, ) - return _impl - - -def _transpose(prelude): - def _impl(inputs, input_types): + def transpose(self, inputs, input_types): data = inputs[0] import torch if isinstance(data, _expr.Expr): - ndims = len(_infer_shape(data, prelude.mod)) + ndims = len(_infer_shape(data, self.prelude.mod)) elif isinstance(data, list): ndims = data elif isinstance(data, (torch.Tensor, np.ndarray)): @@ -1211,11 +1018,7 @@ def _impl(inputs, input_types): axes = inputs[1] return _op.transform.transpose(data, axes) - return _impl - - -def _flatten(): - def _impl(inputs, input_types): + def flatten(self, inputs, input_types): data = inputs[0] start = int(inputs[1]) end = int(inputs[2]) @@ -1237,11 +1040,7 @@ def _impl(inputs, input_types): out = _op.squeeze(out, axis=squeeze_axes) return out - return _impl - - -def _addmm(): - def _impl(inputs, input_types): + def addmm(self, inputs, input_types): input_mat = inputs[0] mat1 = inputs[1] data_type = input_types[1] @@ -1265,35 +1064,24 @@ def _impl(inputs, input_types): return dense_out + input_mat - return _impl - - -def _size(prelude): - def _impl_dynamic(inp, axis): - shape_dynamic = _op.shape_of(inp, dtype="int32") - if axis is not None: - return _op.take(shape_dynamic, _expr.const(axis), 0) - return shape_dynamic - - def _impl(inputs, input_types): - shape = _infer_shape(inputs[0], prelude.mod) + def size(self, inputs, input_types): + shape = _infer_shape(inputs[0], self.prelude.mod) axis = None if len(inputs) > 1: axis = int(inputs[1]) if any(map(lambda s: isinstance(s, tvm.tir.expr.Any), shape)): if axis is None or isinstance(shape[axis], tvm.tir.expr.Any): - return _impl_dynamic(inputs[0], axis) + shape_dynamic = _op.shape_of(inputs[0], dtype="int32") + if axis is not None: + return _op.take(shape_dynamic, _expr.const(axis), 0) + return shape_dynamic if axis is not None: return _expr.const(shape[axis]) return _expr.const(shape) - return _impl - - -def _numtotensor(): - def _impl(inputs, input_types): + def numtotensor(self, inputs, input_types): val = inputs[0] dtype = input_types[0] @@ -1307,18 +1095,10 @@ def _impl(inputs, input_types): arr = val * np.ones([]).astype(dtype) return arr - return _impl - - -def _tensortonum(): - def _impl(inputs, input_types): + def tensortonum(self, inputs, input_types): return inputs[0] - return _impl - - -def _view(): - def _impl(inputs, input_types): + def view(self, inputs, input_types): data = inputs[0] if len(inputs) == 3: @@ -1336,11 +1116,7 @@ def _impl(inputs, input_types): return _op.transform.reshape(data, new_shape) - return _impl - - -def _reshape(): - def _impl(inputs, input_types): + def reshape(self, inputs, input_types): data = inputs[0] new_shape = inputs[1] @@ -1371,11 +1147,7 @@ def _impl(inputs, input_types): new_shape = tmp_shape return _op.transform.reshape(data, new_shape) - return _impl - - -def _pixel_shuffle(prelude): - def _impl(inputs, input_types): + def pixel_shuffle(self, inputs, input_types): data = inputs[0] upscale_factor = inputs[1] upscale_squared = upscale_factor * upscale_factor @@ -1384,7 +1156,7 @@ def _impl(inputs, input_types): c % upscale_squared == 0 ), "input channel should be divisible by square of upscale_factor" - ndims = len(_infer_shape(data, 
prelude.mod)) + ndims = len(_infer_shape(data, self.prelude.mod)) axes = list(range(ndims)) num_inputs = len(inputs) oc = c // upscale_squared @@ -1402,46 +1174,26 @@ def _impl(inputs, input_types): data = _op.transform.transpose(data, axes) return _op.transform.reshape(data, out_shape) - return _impl - - -def _clone(): - def _impl(inputs, input_types): + def clone(self, inputs, input_types): data = inputs[0] return _op.tensor.copy(data) - return _impl - - -def _log_softmax(): - def _impl(inputs, input_types): + def log_softmax(self, inputs, input_types): data = inputs[0] axis = int(inputs[1]) return _op.nn.log_softmax(data, axis) - return _impl - - -def _sigmoid(): - def _impl(inputs, input_types): + def sigmoid(self, inputs, input_types): data = inputs[0] return _op.tensor.sigmoid(data) - return _impl - - -def _softplus(): - def _impl(inputs, input_types): + def softplus(self, inputs, input_types): data = inputs[0] dtype = input_types[0] beta = _expr.const(float(inputs[1]), dtype=dtype) return _op.log(_op.exp(inputs[0] * beta) + _expr.const(1.0, dtype=dtype)) / beta - return _impl - - -def _avg_pool2d(prelude): - def _impl(inputs, input_types): + def avg_pool2d(self, inputs, input_types): data = inputs[0] pool_size = inputs[1] @@ -1460,16 +1212,12 @@ def func(x): count_include_pad=count_include_pad, ) - if _is_quantized_tensor(data, prelude): + if _is_quantized_tensor(data, self.prelude): return qnn_torch.apply_with_upcast(data, func) return func(data) - return _impl - - -def _avg_pool3d(): - def _impl(inputs, input_types): + def avg_pool3d(self, inputs, input_types): data = inputs[0] pool_size = inputs[1] @@ -1487,41 +1235,32 @@ def _impl(inputs, input_types): count_include_pad=count_include_pad, ) - return _impl - - -def _dropout(): - def _impl(inputs, input_types): + def dropout(self, inputs, input_types): data = inputs[0] rate = float(inputs[1]) return _op.nn.dropout(data, rate) - return _impl - - -def _reduce(name): - def _impl(inputs, input_types): - data = inputs[0] - axis = None - keepdims = False - - if len(inputs) > 2: # default, torch have only data, axis=None, keepdims=False - if isinstance(inputs[1], int): - axis = int(inputs[1]) - elif _is_int_seq(inputs[1]): - axis = inputs[1] - else: - axis = list(_infer_shape(inputs[1])) - keepdims = bool(inputs[2]) + def make_reduce(self, name): + def reduce(inputs, input_types): + data = inputs[0] + axis = None + keepdims = False - return get_relay_op(name)(data, axis=axis, keepdims=keepdims) + if len(inputs) > 2: # default, torch have only data, axis=None, keepdims=False + if isinstance(inputs[1], int): + axis = int(inputs[1]) + elif _is_int_seq(inputs[1]): + axis = inputs[1] + else: + axis = list(_infer_shape(inputs[1])) + keepdims = bool(inputs[2]) - return _impl + return get_relay_op(name)(data, axis=axis, keepdims=keepdims) + return reduce -def _norm(): - def _impl(inputs, input_types): + def norm(self, inputs, input_types): data = inputs[0] dtype = input_types[0] axis = None @@ -1543,11 +1282,7 @@ def _impl(inputs, input_types): reci_order, ) - return _impl - - -def _frobenius_norm(): - def _impl(inputs, input_types): + def frobenius_norm(self, inputs, input_types): data = inputs[0] axis = None keepdims = False @@ -1557,11 +1292,7 @@ def _impl(inputs, input_types): return _op.sqrt(_op.reduce.sum((data * data), axis=axis, keepdims=keepdims)) - return _impl - - -def _std(): - def _impl(inputs, input_types): + def std(self, inputs, input_types): data = inputs[0] if len(inputs) == 2: axis = None @@ -1574,11 +1305,7 @@ def 
_impl(inputs, input_types): return _op.reduce.std(data, axis=axis, keepdims=keepdims, unbiased=unbiased) - return _impl - - -def _variance(): - def _impl(inputs, input_types): + def variance(self, inputs, input_types): data = inputs[0] if len(inputs) == 2: axis = None @@ -1591,11 +1318,7 @@ def _impl(inputs, input_types): return _op.reduce.variance(data, axis=axis, keepdims=keepdims, unbiased=unbiased) - return _impl - - -def _mean(prelude): - def _impl(inputs, input_types): + def mean(self, inputs, input_types): data = inputs[0] if inputs[1]: @@ -1615,7 +1338,7 @@ def _impl(inputs, input_types): def func(x): return _op.mean(x, axis, keepdims, exclude) - if _is_quantized_tensor(data, prelude): + if _is_quantized_tensor(data, self.prelude): assert len(inputs) == 6, "Input quant param not found in op inputs" input_scale = _expr.const(inputs[4]) input_zero_point = _expr.const(inputs[5]) @@ -1623,18 +1346,14 @@ def func(x): return func(data) - return _impl - - -def _chunk(prelude): - def _impl(inputs, input_types): + def chunk(self, inputs, input_types): data = inputs[0] num_chunks = int(inputs[1]) axis = int(inputs[2]) if isinstance(data, _expr.Expr): - inferred_shape = _infer_shape(data, prelude.mod) + inferred_shape = _infer_shape(data, self.prelude.mod) shape = [] for infer in inferred_shape: @@ -1670,18 +1389,14 @@ def _impl(inputs, input_types): return chunks - return _impl - - -def _matmul(prelude): - def _impl(inputs, input_types): + def matmul(self, inputs, input_types): inputs_0 = inputs[0] inputs_1 = inputs[1] # Need to check input shape as batch matmul must be supported. - a_shape = _infer_shape(inputs_0, prelude.mod) - b_shape = _infer_shape(inputs_1, prelude.mod) + a_shape = _infer_shape(inputs_0, self.prelude.mod) + b_shape = _infer_shape(inputs_1, self.prelude.mod) # When performing a batch matmul, we need to properly handle N-dim shapes. 
if len(a_shape) > 2 or len(b_shape) > 2: @@ -1689,8 +1404,8 @@ def _impl(inputs, input_types): a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]]) b = _op.reshape(inputs_1, [-1, b_shape[-2], b_shape[-1]]) # Broadcast b to match batch size of a - new_b_shape = list(_infer_shape(b, prelude.mod)) - new_a_shape = _infer_shape(a, prelude.mod) + new_b_shape = list(_infer_shape(b, self.prelude.mod)) + new_a_shape = _infer_shape(a, self.prelude.mod) if new_a_shape[0] > new_b_shape[0]: new_b_shape[0] = new_a_shape[0] b = _op.broadcast_to(b, new_b_shape) @@ -1714,11 +1429,7 @@ def _impl(inputs, input_types): return out - return _impl - - -def _expand(): - def _impl(inputs, input_types): + def expand(self, inputs, input_types): data_in = inputs[0] shape = list(_infer_shape(data_in)) @@ -1740,85 +1451,64 @@ def _impl(inputs, input_types): return out - return _impl - - -def _int(): - def _impl(inputs, input_types): + def int(self, inputs, input_types): if isinstance(inputs[0], _expr.Expr): return inputs[0] return int(inputs[0]) - return _impl - - -def _identity(): - def _impl(inputs, input_types): + def identity(self, inputs, input_types): return inputs[0] - return _impl - - -def _none(): - def _impl(inputs, input_types): + def none(self, inputs, input_types): return None - return _impl - - -def _pad(mode): - def _impl(inputs, input_types): - data = inputs[0] - if isinstance(inputs[1], list): - pad_list = inputs[1] - else: - pad_list = list(_infer_shape(inputs[1])) - - # initialize paddings based on input len - pad_len = len(_infer_shape(data)) * 2 - paddings = [0] * pad_len - - if len(pad_list) >= 2: - paddings[-1] = pad_list[1] - paddings[-2] = pad_list[0] - if len(pad_list) >= 4: - paddings[-3] = pad_list[3] - paddings[-4] = pad_list[2] - if len(pad_list) >= 6: - paddings[-5] = pad_list[5] - paddings[-6] = pad_list[4] - - # group into tuple of 2 ints - paddings = [paddings[i : i + 2] for i in range(0, len(paddings), 2)] - - const_paddings = [] - for pad in paddings: - const_paddings.append([]) - for p in pad: - if not isinstance(p, int): - p = int(_infer_value(p, {}).asnumpy()) - const_paddings[-1].append(p) - - if mode == "constant": - return _op.nn.pad(data, const_paddings, pad_value=inputs[2], pad_mode=mode) - else: - return _op.nn.pad(data, const_paddings, pad_mode=mode) - - return _impl + def make_pad(self, mode): + def pad(inputs, input_types): + data = inputs[0] + if isinstance(inputs[1], list): + pad_list = inputs[1] + else: + pad_list = list(_infer_shape(inputs[1])) + + # initialize paddings based on input len + pad_len = len(_infer_shape(data)) * 2 + paddings = [0] * pad_len + + if len(pad_list) >= 2: + paddings[-1] = pad_list[1] + paddings[-2] = pad_list[0] + if len(pad_list) >= 4: + paddings[-3] = pad_list[3] + paddings[-4] = pad_list[2] + if len(pad_list) >= 6: + paddings[-5] = pad_list[5] + paddings[-6] = pad_list[4] + + # group into tuple of 2 ints + paddings = [paddings[i : i + 2] for i in range(0, len(paddings), 2)] + + const_paddings = [] + for pad in paddings: + const_paddings.append([]) + for p in pad: + if not isinstance(p, int): + p = int(_infer_value(p, {}).asnumpy()) + const_paddings[-1].append(p) + + if mode == "constant": + return _op.nn.pad(data, const_paddings, pad_value=inputs[2], pad_mode=mode) + else: + return _op.nn.pad(data, const_paddings, pad_mode=mode) + return pad -def _clamp(): - def _impl(inputs, input_types): + def clamp(self, inputs, input_types): data = inputs[0] amin = inputs[1] if inputs[1] else np.finfo(np.float32).min amax = inputs[2] if inputs[2] 
else np.finfo(np.float32).max return _op.clip(data, amin, amax) - return _impl - - -def _to(): - def _impl(inputs, input_types): + def to(self, inputs, input_types): data = inputs[0] dtype = inputs[1] if inputs[1] is not None and not isinstance(inputs[1], str) else inputs[2] # special handling for aten::to(data, 6, _, _, _) case @@ -1844,87 +1534,81 @@ def _impl(inputs, input_types): return ret - return _impl - - -def _get_upsample_out_size(inputs, method): - # This assumes a static shape - out_size = [] - if inputs[1] is not None: - for size in inputs[1]: - if not isinstance(size, int): - out_size.append(int(_infer_value(size, {}).asnumpy())) - else: - out_size.append(size) - else: - scale_index = 3 if method in ["bilinear", "trilinear"] else 2 - scales = inputs[scale_index] - assert scales is not None, "neither out size nor scale provided" - assert isinstance(scales, list) - ishape = _infer_shape(inputs[0]) - for i, scale in enumerate(scales): - out_size.append(int(math.floor(float(ishape[2 + i]) * scale))) - - return out_size - - -def _upsample(method, prelude): - def _impl(inputs, input_types): - data = inputs[0] - out_size = _get_upsample_out_size(inputs, method) - - if len(inputs) > 2 and method == "bilinear": - align_corners = inputs[2] - else: - align_corners = False - - if method == "nearest_neighbor": - coord_trans = "asymmetric" - elif align_corners: - coord_trans = "align_corners" + @staticmethod + def get_upsample_out_size(inputs, method): + # This assumes a static shape + out_size = [] + if inputs[1] is not None: + for size in inputs[1]: + if not isinstance(size, int): + out_size.append(int(_infer_value(size, {}).asnumpy())) + else: + out_size.append(size) else: - coord_trans = "half_pixel" - - def func(x): - return _op.image.resize(x, out_size, "NCHW", method, coord_trans) + scale_index = 3 if method in ["bilinear", "trilinear"] else 2 + scales = inputs[scale_index] + assert scales is not None, "neither out size nor scale provided" + assert isinstance(scales, list) + ishape = _infer_shape(inputs[0]) + for i, scale in enumerate(scales): + out_size.append(int(math.floor(float(ishape[2 + i]) * scale))) + + return out_size + + def make_upsample(self, method): + def upsample(inputs, input_types): + data = inputs[0] + out_size = self.get_upsample_out_size(inputs, method) + + if len(inputs) > 2 and method == "bilinear": + align_corners = inputs[2] + else: + align_corners = False - if _is_quantized_tensor(data, prelude): - # input qparams are manually appended by us - assert isinstance(inputs[-2], float) - assert isinstance(inputs[-1], int) - input_scale = _expr.const(inputs[-2]) - input_zero_point = _expr.const(inputs[-1]) - return qnn_torch.quantized_upsample(data, input_scale, input_zero_point, func) + if method == "nearest_neighbor": + coord_trans = "asymmetric" + elif align_corners: + coord_trans = "align_corners" + else: + coord_trans = "half_pixel" - return func(data) + def func(x): + return _op.image.resize(x, out_size, "NCHW", method, coord_trans) - return _impl + if _is_quantized_tensor(data, self.prelude): + # input qparams are manually appended by us + assert isinstance(inputs[-2], float) + assert isinstance(inputs[-1], int) + input_scale = _expr.const(inputs[-2]) + input_zero_point = _expr.const(inputs[-1]) + return qnn_torch.quantized_upsample(data, input_scale, input_zero_point, func) + return func(data) -def _upsample3d(method): - def _impl(inputs, input_types): - data = inputs[0] - out_size = _get_upsample_out_size(inputs, method) + return upsample - if len(inputs) 
> 2 and method == "trilinear": - align_corners = inputs[2] - else: - align_corners = False + def make_upsample3d(self, method): + def upsample3d(inputs, input_types): + data = inputs[0] + out_size = self.get_upsample_out_size(inputs, method) - if method == "nearest_neighbor": - coord_trans = "asymmetric" - elif align_corners: - coord_trans = "align_corners" - else: - coord_trans = "half_pixel" + if len(inputs) > 2 and method == "trilinear": + align_corners = inputs[2] + else: + align_corners = False - return _op.image.resize3d(data, out_size, "NCDHW", method, coord_trans) + if method == "nearest_neighbor": + coord_trans = "asymmetric" + elif align_corners: + coord_trans = "align_corners" + else: + coord_trans = "half_pixel" - return _impl + return _op.image.resize3d(data, out_size, "NCDHW", method, coord_trans) + return upsample3d -def _expand_as(): - def _impl(inputs, input_types): + def expand_as(self, inputs, input_types): target = inputs[1] t0 = _infer_type(inputs[0]).checked_type.dtype t1 = _infer_type(inputs[1]).checked_type.dtype @@ -1932,34 +1616,18 @@ def _impl(inputs, input_types): target = _op.cast(target, t0) return _op.broadcast_to_like(inputs[0], target) - return _impl - - -def _Bool(): - def _impl(inputs, input_types): + def Bool(self, inputs, input_types): assert len(inputs) == 1 return inputs[0] - return _impl - - -def _Float(): - def _impl(inputs, input_types): + def Float(self, inputs, input_types): assert len(inputs) == 1 return _op.cast(inputs[0], "float32") - return _impl - - -def _mm(): - def _impl(inputs, input_types): + def mm(self, inputs, input_types): return _op.nn.dense(inputs[0], inputs[1]) - return _impl - - -def _bitwise_not(): - def _impl(inputs, input_types): + def bitwise_not(self, inputs, input_types): data = inputs[0] # The input tensor must be of integral or Boolean types. 
# For bool tensors, it computes the logical NOT @@ -1970,11 +1638,7 @@ def _impl(inputs, input_types): return out - return _impl - - -def _bitwise_xor(): - def _impl(inputs, input_types): + def bitwise_xor(self, inputs, input_types): lhs = inputs[0] rhs = inputs[1] lhs = _op.cast(lhs, "bool") if input_types[0] == "bool" else _op.cast(lhs, "int") @@ -1982,91 +1646,55 @@ def _impl(inputs, input_types): return _op.bitwise_xor(lhs, rhs) - return _impl - - -def _logical_not(): - def _impl(inputs, input_types): + def logical_not(self, inputs, input_types): data = _wrap_const(inputs[0]) return _op.logical_not(_op.cast(data, "bool")) - return _impl - - -def _logical_xor(): - def _impl(inputs, input_types): + def logical_xor(self, inputs, input_types): lhs = _op.cast(inputs[0], "bool") rhs = _op.cast(inputs[1], "bool") return _op.logical_xor(lhs, rhs) - return _impl - - -def _list_getitem(prelude): - def _impl(inputs, input_types): - return prelude.nth(inputs[0], _wrap_const(inputs[1])) - - return _impl - - -def _list_len(prelude): - def _impl(inputs, input_types): - return prelude.length(inputs[0]) + def list_getitem(self, inputs, input_types): + return self.prelude.nth(inputs[0], _wrap_const(inputs[1])) - return _impl + def list_len(self, inputs, input_types): + return self.prelude.length(inputs[0]) - -def _type_as(): - def _impl(inputs, input_types): + def type_as(self, inputs, input_types): assert len(inputs) == 2 assert len(input_types) == 2 return _op.cast(inputs[0], input_types[1]) - return _impl - - -def _gather(): - def _impl(inputs, input_types): + def gather(self, inputs, input_types): data = inputs[0] axis = inputs[1] indices = inputs[2] return _op.gather(data, axis, indices) - return _impl - - -def _add(prelude): - # add_ is overloaded for tensor add and list concat - def _impl(inputs, input_types): + def add(self, inputs, input_types): + # add_ is overloaded for tensor add and list concat if input_types[0] == "ListType": - return prelude.concat(inputs[0], inputs[1]) - return _elemwise("add")(inputs, input_types) - - return _impl + return self.prelude.concat(inputs[0], inputs[1]) + return self.make_elemwise("add")(inputs, input_types) - -def _tensor_array_stack(prelude): - def _impl(inputs, input_types): + def tensor_array_stack(self, inputs, input_types): dim = inputs[1] assert dim == 0, "stacking on a dynamic tensor list only supported on a first axis" - tensor_array, shape = _convert_to_tensor_array(inputs[0], prelude) + tensor_array, shape = _convert_to_tensor_array(inputs[0], self.prelude) stacked_shape = (Any(),) + shape - stack = prelude.get_global_var_static("tensor_array_stack", "float32", shape) + stack = self.prelude.get_global_var_static("tensor_array_stack", "float32", shape) stacked = stack(tensor_array) - static_tensor_array_ops = StaticTensorArrayOps(prelude, "float32", stacked_shape) + static_tensor_array_ops = StaticTensorArrayOps(self.prelude, "float32", stacked_shape) static_tensor_array_ops.register() - get_tensor = prelude.get_global_var_static("tensor_get_data", "float32", stacked_shape) + get_tensor = self.prelude.get_global_var_static("tensor_get_data", "float32", stacked_shape) return get_tensor(stacked) - return _impl - - -def _stack(prelude): - def _impl(inputs, input_types): + def stack(self, inputs, input_types): if isinstance(inputs[0], list): # a static python list of tensors dim = inputs[1] @@ -2074,17 +1702,13 @@ def _impl(inputs, input_types): else: # List ADT case assert isinstance(inputs[0], _expr.Expr) - ty = _infer_type_with_prelude(inputs[0], 
prelude) - list_ty = prelude.mod.get_global_type_var("List") + ty = _infer_type_with_prelude(inputs[0], self.prelude) + list_ty = self.prelude.mod.get_global_type_var("List") msg = "The input list is expected to be List ADT" assert isinstance(ty, tvm.ir.TypeCall) and ty.func == list_ty, msg - return _tensor_array_stack(prelude)(inputs, input_types) - - return _impl - + return self.tensor_array_stack(inputs, input_types) -def _rsub(): - def _impl(inputs, input_types): + def rsub(self, inputs, input_types): data0, data1 = _pytorch_promote_types(inputs[:2], input_types[:2]) # TODO (t-vi): should this also be part of the type promotion? @@ -2093,21 +1717,13 @@ def _impl(inputs, input_types): # note: rsub means data0 and data1 swap places return get_relay_op("subtract")(data1, alpha * data0) - return _impl - - -def _embedding(): - def _impl(inputs, input_types): + def embedding(self, inputs, input_types): weight = inputs[0] indices = inputs[1] return _op.take(weight, indices.astype("int32"), axis=0) - return _impl - - -def _one_hot(): - def _impl(inputs, input_types): + def one_hot(self, inputs, input_types): indices = inputs[0].astype("int32") num_classes = inputs[1] if num_classes == -1: @@ -2120,28 +1736,16 @@ def _impl(inputs, input_types): return _op.one_hot(indices, on_value, off_value, num_classes, -1, dtype) - return _impl - - -def _index(): - def _impl(inputs, input_types): + def index(self, inputs, input_types): data = inputs[0] indices = inputs[1] return _op.adv_index([data] + indices) - return _impl - - -def _meshgrid(): - def _impl(inputs, input_types): + def meshgrid(self, inputs, input_types): data = inputs[0] return _op.meshgrid(data, indexing="ij") - return _impl - - -def _nms(prelude): - def _impl(inputs, input_types): + def nms(self, inputs, input_types): boxes = inputs[0] scores = inputs[1] iou_threshold = inputs[2] @@ -2187,11 +1791,7 @@ def _impl(inputs, input_types): # in torchvision, indices from nms are int64 return _op.cast(ret, "int64") - return _impl - - -def _logsumexp(): - def _impl(inputs, input_types): + def logsumexp(self, inputs, input_types): data = _pytorch_promote_types(inputs[:1], input_types[:1]) dim_list = inputs[1] keepdim = inputs[2] if len(inputs) > 2 else False @@ -2199,11 +1799,7 @@ def _impl(inputs, input_types): assert isinstance(dim_list, list), "dim is expected to be a list" return _op.logsumexp(data[0], axis=dim_list, keepdims=keepdim) - return _impl - - -def _roi_align(prelude): - def _impl(inputs, input_types): + def roi_align(self, inputs, input_types): data = inputs[0] boxes = inputs[1] @@ -2217,16 +1813,12 @@ def _impl(inputs, input_types): return _op.vision.roi_align(data, boxes, output_size, spatial_scale, sample_ratio) - return _impl - - -def _unbind(): - def _impl(inputs, input_types): + def unbind(self, inputs, input_types): data = inputs[0] dim = int(inputs[1]) ishapes = _infer_shape(data) if dim >= len(ishapes): - msg = "Please check input dim, it shouldn't" "be greater than or equal to rank." + msg = "Please check input dim, it shouldn't be greater than or equal to rank." 
raise AttributeError(msg)
 
         selections = ishapes[dim]
@@ -2239,13 +1831,9 @@ def _impl(inputs, input_types):
         ret = _expr.TupleWrapper(_expr.Tuple(ret), selections)
         return ret
 
-    return _impl
-
-
-def _shape_as_tensor(prelude):
-    def _impl(inputs, input_types):
+    def shape_as_tensor(self, inputs, input_types):
         is_symbolic_shape = False
-        input_shape = _infer_shape(inputs[0], prelude.mod)
+        input_shape = _infer_shape(inputs[0], self.prelude.mod)
         for axis in input_shape:
             if not isinstance(axis, (int, tvm.tir.IntImm)):
                 is_symbolic_shape = True
@@ -2258,45 +1846,30 @@ def _impl(inputs, input_types):
 
         return ret
 
-    return _impl
-
-
-def _logical_and():
-    def _impl(inputs, input_types):
+    def logical_and(self, inputs, input_types):
         lhs = _op.cast(inputs[0], "bool")
         rhs = _op.cast(inputs[1], "bool")
 
         return _op.logical_and(lhs, rhs)
 
-    return _impl
-
-
-def _nonzero(is_numpy_style):
-    def _impl(inputs, input_types):
+    def nonzero(self, inputs, input_types, is_numpy_style=False):
         data = inputs[0]
         ret = _op.transform.argwhere(data)
-
         if is_numpy_style or (len(inputs) > 1 and inputs[1]):
-            return _unbind()([ret, 1], None)
-
+            return self.unbind([ret, 1], None)
         return ret
 
-    return _impl
-
+    def nonzero_numpy(self, inputs, input_types):
+        return self.nonzero(inputs, input_types, is_numpy_style=True)
 
-def _scatter():
-    def _impl(inputs, input_types):
+    def scatter(self, inputs, input_types):
         data = inputs[0]
         axis = int(inputs[1])
         index = inputs[2]
         src = inputs[3]
         return _op.transform.scatter(data, index, src, axis)
 
-    return _impl
-
-
-def _scalar_tensor():
-    def _impl(inputs, input_types):
+    def scalar_tensor(self, inputs, input_types):
         data = inputs[0]
         cast_map = {
             6: "float32",
@@ -2309,11 +1882,7 @@ def _impl(inputs, input_types):
             data = data.data.asnumpy().tolist()
         return _expr.const(data, cast_map[type_key])
 
-    return _impl
-
-
-def _interpolate():
-    def _impl(inputs, input_types):
+    def interpolate(self, inputs, input_types):
         if isinstance(inputs[1], _expr.Expr):
             out_size = inputs[1]
         elif isinstance(inputs[1], list):
@@ -2342,26 +1911,14 @@ def _impl(inputs, input_types):
 
         return _op.image.resize(data, out_size, "NCHW", method, coord_trans)
 
-    return _impl
-
-
-def _numel():
-    def _impl(inputs, input_types):
+    def numel(self, inputs, input_types):
         return _op.ndarray_size(inputs[0])
 
-    return _impl
-
-
-def _empty():
-    def _impl(inputs, input_types):
+    def empty(self, inputs, input_types):
         shape = inputs[0]
         return _op.zeros(shape, _convert_dtype_value(inputs[1]))
 
-    return _impl
-
-
-def _bincount():
-    def _impl(inputs, input_types):
+    def bincount(self, inputs, input_types):
         data = inputs[0]
         weights = inputs[1]
         maximum = _op.max(data)
@@ -2377,18 +1934,427 @@ def _impl(inputs, input_types):
         counts = _op.zeros(_op.reshape(dim, [1]), out_dtype)
         return _op.scatter_add(counts, data, updates, axis=0)
 
-    return _impl
-
-
-def _scatter_add():
-    def _impl(inputs, input_types):
+    def scatter_add(self, inputs, input_types):
         data = inputs[0]
         axis = inputs[1]
         index = inputs[2]
         src = inputs[3]
         return _op.scatter_add(data, index, src, axis=axis)
 
-    return _impl
+    # Operator mappings
+    def create_convert_map(self):
+        self.convert_map = {
+            "aten::pixel_shuffle": self.pixel_shuffle,
+            "aten::device": self.none,
+            "prim::device": self.none,
+            "aten::sub": self.make_elemwise("subtract"),
+            "aten::sub_": self.make_elemwise("subtract"),
+            "aten::max": self.max,
+            "aten::min": self.min,
+            "aten::mul": self.make_elemwise("multiply"),
+            "aten::mul_": self.make_elemwise("multiply"),
+            "aten::pow": self.make_elemwise("power"),
+
"aten::arange": self.arange, + "aten::meshgrid": self.meshgrid, + "aten::div": self.make_elemwise("divide"), + "aten::div_": self.make_elemwise("divide"), + "aten::floor_divide": self.make_elemwise("floor_divide"), + "aten::true_divide": self.make_elemwise("divide"), + "aten::addcdiv": self.addcdiv, + "aten::addcmul": self.addcmul, + "aten::ones": self.ones, + "aten::ones_like": self.ones_like, + "aten::zeros": self.zeros, + "aten::zeros_like": self.zeros_like, + "aten::full": self.full, + "aten::full_like": self.full_like, + "aten::linspace": self.linspace, + "aten::reciprocal": self.reciprocal, + "aten::repeat": self.repeat, + "aten::repeat_interleave": self.repeat_interleave, + "aten::to": self.to, + "aten::squeeze": self.squeeze, + "aten::unsqueeze": self.unsqueeze, + "aten::cat": self.concatenate, + "aten::slice": self.slice, + "aten::split": self.split, + "aten::split_with_sizes": self.split_with_sizes, + "aten::select": self.select, + "aten::take": self.take, + "aten::where": self.where, + "aten::topk": self.topk, + "aten::relu": self.relu, + "aten::relu_": self.relu, + "aten::prelu": self.prelu, + "aten::leaky_relu": self.leaky_relu, + "aten::leaky_relu_": self.leaky_relu, + "aten::elu": self.elu, + "aten::elu_": self.elu, + "aten::celu": self.celu, + "aten::gelu": self.gelu, + "aten::selu": self.selu, + "aten::log_sigmoid": self.log_sigmoid, + "aten::adaptive_avg_pool2d": self.adaptive_avg_pool_2d, + "aten::adaptive_max_pool2d": self.adaptive_max_pool_2d, + "aten::max_pool2d": self.maxpool_2d, + "aten::max_pool2d_with_indices": self.maxpool_2d_with_indices, + "aten::max_pool1d": self.maxpool_1d, + "aten::max_pool3d": self.maxpool_3d, + "aten::hardtanh": self.hardtanh, + "aten::hardtanh_": self.hardtanh, + "aten::_convolution": self.convolution, + "aten::softmax": self.softmax, + "aten::threshold": self.threshold, + "aten::threshold_": self.threshold, + "aten::contiguous": self.contiguous, + "aten::batch_norm": self.batch_norm, + "aten::instance_norm": self.instance_norm, + "aten::layer_norm": self.layer_norm, + "aten::group_norm": self.group_norm, + "aten::transpose": self.transpose, + "aten::transpose_": self.transpose, + "aten::t": self.transpose, + "aten::flatten": self.flatten, + "aten::addmm": self.addmm, + "aten::size": self.size, + "aten::view": self.view, + "aten::reshape": self.reshape, + "aten::clone": self.clone, + "aten::log_softmax": self.log_softmax, + "aten::sigmoid": self.sigmoid, + "aten::softplus": self.softplus, + "aten::avg_pool2d": self.avg_pool2d, + "aten::avg_pool3d": self.avg_pool3d, + "aten::dropout": self.dropout, + "aten::dropout_": self.dropout, + "aten::feature_dropout": self.dropout, + "aten::alpha_dropout": self.dropout, + "aten::mean": self.mean, + "aten::chunk": self.chunk, + "aten::matmul": self.matmul, + "aten::bmm": self.matmul, + "aten::expand": self.expand, + "aten::Int": self.int, + "prim::NumToTensor": self.numtotensor, + "prim::ImplicitTensorToNum": self.tensortonum, + "aten::ScalarImplicit": self.tensortonum, + "aten::constant_pad_nd": self.make_pad("constant"), + "aten::reflection_pad1d": self.make_pad("reflect"), + "aten::reflection_pad2d": self.make_pad("reflect"), + "aten::replication_pad1d": self.make_pad("edge"), + "aten::replication_pad2d": self.make_pad("edge"), + "aten::replication_pad3d": self.make_pad("edge"), + "aten::permute": self.transpose, + "aten::sum": self.make_reduce("sum"), + "aten::prod": self.make_reduce("prod"), + "aten::argmin": self.make_reduce("argmin"), + "aten::argmax": self.make_reduce("argmax"), + 
"aten::norm": self.norm, + "aten::frobenius_norm": self.frobenius_norm, + "aten::std": self.std, + "aten::var": self.variance, + "aten::abs": self.make_unary("abs"), + "aten::neg": self.make_unary("negative"), + "aten::cos": self.make_unary("cos"), + "aten::cosh": self.make_unary("cosh"), + "aten::sin": self.make_unary("sin"), + "aten::sinh": self.make_unary("sinh"), + "aten::tan": self.make_unary("tan"), + "aten::tanh": self.make_unary("tanh"), + "aten::acos": self.make_unary("acos"), + "aten::asin": self.make_unary("asin"), + "aten::atan": self.make_unary("atan"), + "aten::log": self.make_unary("log"), + "aten::log2": self.make_unary("log2"), + "aten::log10": self.make_unary("log10"), + "aten::log1p": self.log1p, + "aten::exp": self.make_unary("exp"), + "aten::erf": self.make_unary("erf"), + "aten::trunc": self.make_unary("trunc"), + "aten::sign": self.make_unary("sign"), + "aten::sqrt": self.make_unary("sqrt"), + "aten::rsqrt": self.make_unary("rsqrt"), + "aten::ceil": self.make_unary("ceil"), + "aten::floor": self.make_unary("floor"), + "aten::round": self.make_unary("round"), + "aten::isfinite": self.make_unary("isfinite"), + "aten::isinf": self.make_unary("isinf"), + "aten::isnan": self.make_unary("isnan"), + "aten::clamp": self.clamp, + "aten::clamp_": self.clamp, + "aten::detach": self.identity, + "aten::upsample_bilinear2d": self.make_upsample("bilinear"), + "aten::upsample_nearest2d": self.make_upsample("nearest_neighbor"), + "aten::upsample_trilinear3d": self.make_upsample3d("trilinear"), + "aten::upsample_nearest3d": self.make_upsample3d("nearest_neighbor"), + "aten::expand_as": self.expand_as, + "aten::lt": self.make_elemwise("less"), + "aten::gt": self.make_elemwise("greater"), + "aten::le": self.make_elemwise("less_equal"), + "aten::ge": self.make_elemwise("greater_equal"), + "aten::ne": self.make_elemwise("not_equal"), + "aten::eq": self.make_elemwise("equal"), + "aten::logical_not": self.logical_not, + "aten::logical_xor": self.logical_xor, + "aten::bitwise_not": self.bitwise_not, + "aten::bitwise_xor": self.bitwise_xor, + "aten::Bool": self.Bool, + "aten::Float": self.Float, + "aten::adaptive_avg_pool3d": self.adaptive_avg_pool_3d, + "aten::adaptive_max_pool3d": self.adaptive_max_pool_3d, + "aten::rsub": self.rsub, + "aten::embedding": self.embedding, + "aten::one_hot": self.one_hot, + "aten::mm": self.matmul, + "aten::add": self.add, + "aten::add_": self.add, + "aten::stack": self.stack, + "aten::__getitem__": self.list_getitem, + "aten::len": self.list_len, + "aten::type_as": self.type_as, + "aten::gather": self.gather, + "aten::index_select": self.select, + "aten::index": self.index, + "torchvision::nms": self.nms, + "aten::logsumexp": self.logsumexp, + "torchvision::roi_align": self.roi_align, + "aten::unbind": self.unbind, + "aten::__and__": self.logical_and, + "aten::_shape_as_tensor": self.shape_as_tensor, + "aten::nonzero": self.nonzero, + "aten::nonzero_numpy": self.nonzero_numpy, + "aten::scatter": self.scatter, + "aten::scalar_tensor": self.scalar_tensor, + "aten::__interpolate": self.interpolate, + "aten::IntImplicit": self.identity, + "aten::tensor": self.identity, # used for example in tensor(1.0) + "aten::numel": self.numel, + "aten::empty": self.empty, + "aten::bincount": self.bincount, + "aten::scatter_add": self.scatter_add, + "aten::__not__": self.logical_not, + } + + def update_convert_map(self, custom_map): + self.convert_map.update(custom_map) + + def report_missing_conversion(self, op_names): + """ Check if all ops in an input graph are supported by 
TVM """ + known_ops = [ + "prim::Constant", + "prim::GetAttr", + "prim::ListConstruct", + "prim::ListUnpack", + "prim::TupleConstruct", + "prim::TupleUnpack", + "prim::RaiseException", + "prim::If", + "prim::Loop", + ] + known_ops += list(self.convert_map.keys()) + known_ops += list(qnn_torch.convert_map.keys()) + + missing = [op_name for op_name in op_names if op_name not in known_ops] + + if missing: + msg = "The following operators are not implemented: {}".format(missing) + raise NotImplementedError(msg) + + def convert_block(self, block, outputs): + """ Translate Torch "Block", used for prim::If and prim::Loop """ + ops = _get_operator_nodes(block.nodes()) + ret_names = _get_input_names(block.returnNode()) + return self.convert_operators(ops, outputs, ret_names) + + def convert_if(self, if_node, outputs): + """ Translate Torch prim::If to Relay If """ + cond = outputs[if_node.inputsAt(0).debugName()] + blocks = list(if_node.blocks()) + true_branch = self.convert_block(blocks[0], outputs) + false_branch = self.convert_block(blocks[1], outputs) + assert len(true_branch) == 1 and len(false_branch) == 1 + return _expr.If(cond, true_branch[0], false_branch[0]) + + def convert_loop(self, loop_node, outputs): + """ Translate Torch prim::Loop to Relay while_loop """ + + def get_input(index): + ivalue = loop_node.inputsAt(index) + inode = ivalue.node() + if inode.kind() == "prim::Constant": + return _expr.const(_get_constant(inode)) + var_name = ivalue.debugName() + assert var_name in outputs + return _wrap_const(outputs[var_name]) + + # Refer to the spec for prim::Loop below + # https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/OVERVIEW.md#loops + # The first input: %max_trip_count + # The second input: %initial_condition + # The rest of input: loop variables + max_loop_count = get_input(0) + init_cond = get_input(1) + num_loop_var = len(list(loop_node.inputs())) - 2 + init_vals = [get_input(i + 2) for i in range(num_loop_var)] + + # while loop has always max_loop_count being int64 max + # max_loop_count.data (tvm.runtime.NDArray) is -1, so _get_constant again + is_while_loop = ( + isinstance(max_loop_count, _expr.Constant) + and _get_constant(loop_node.inputsAt(0).node()) == sys.maxsize + ) + + if is_while_loop: + loop_iter_dtype = "bool" + # while loop with non input dependent condition such as while i < 10: + # init_cond is int, need to cast to bool to type check + if isinstance(init_cond, _expr.Constant): + init_cond = _op.cast(init_cond, "bool") + init_loop_iter_val = init_cond + else: + loop_iter_dtype = "int32" + # always count from 0 + init_loop_iter_val = _expr.const(0, dtype="int32") + + body_block = list(loop_node.blocks())[0] + block_input_names = _get_input_names(body_block) + num_block_inputs = len(block_input_names) + name_val_pairs = list(zip(block_input_names, [init_loop_iter_val] + init_vals)) + outputs.update(name_val_pairs) + + def get_var(name, val): + if val: + checked_type = _infer_type_with_prelude(val, self.prelude) + if hasattr(checked_type, "shape"): + shape = get_const_tuple(checked_type.shape) + actual_shape = [] + for dim in shape: + if isinstance(dim, int) and dim == 0: + actual_shape.append(Any()) + else: + actual_shape.append(dim) + return _expr.var(name, shape=actual_shape, dtype=checked_type.dtype) + else: + return _expr.var(name, type_annotation=checked_type) + return _expr.var(name) + + loop_iter_var = _expr.var(block_input_names[0], shape=(), dtype=loop_iter_dtype) + loop_vars = [get_var(name, val) for name, val in name_val_pairs[1:]] + + # 
Add non-constant free variables to loop variables to prevent code blow up
+        # Without this, if there are two for loops in a row, which often happens
+        # if the outer loop is unrolled, the computation corresponding to the first for loop
+        # is inlined inside loop body, turning O(N) + O(N) computation into O(N^2).
+        # This issue was found when converting from Stacked LSTM test. Torch does not add the
+        # output of the earlier loop into loop variables of the next loop.
+        # So the variable corresponding to the first loop output appears free in the second
+        # loop body.
+        free_vars = [
+            var
+            for var in _get_free_vars_from_block(body_block)
+            if var in outputs
+            and not isinstance(outputs[var], (_expr.Constant, int, float, str))
+            and outputs[var]
+        ]
+
+        prev_outputs = {}
+        for name in free_vars:
+            prev_output = outputs[name]
+            new_loop_var = get_var(name, prev_output)
+            prev_outputs[name] = prev_output
+            outputs[name] = new_loop_var
+            loop_vars.append(new_loop_var)
+            init_vals.append(prev_output)
+
+        def cond(*current_vals):
+            i = current_vals[0]
+
+            if is_while_loop:
+                return _op.equal(i, _expr.const(True, "bool"))
+
+            return _op.less(i, max_loop_count)
+
+        def body(*current_vals):
+            # Update loop variables using the prev iteration outputs
+            assert len(current_vals) == num_block_inputs + len(free_vars)
+
+            for (i, val) in enumerate(current_vals):
+                if i < num_block_inputs:
+                    outputs[block_input_names[i]] = val
+                else:
+                    outputs[free_vars[i - num_block_inputs]] = val
+
+            block_outputs = self.convert_block(body_block, outputs)
+            block_outputs += [outputs[name] for name in free_vars]
+
+            if not is_while_loop:
+                # iter var increment implicit in torch, so do it manually
+                # for while loop, block_outputs[0] is already a boolean,
+                # the result of termination check
+                incr = _expr.const(1, dtype="int32")
+                block_outputs[0] = current_vals[0] + incr
+
+            return block_outputs
+
+        loop = while_loop(cond, [loop_iter_var] + loop_vars, body)
+        loop_val = loop(init_loop_iter_val, *init_vals)
+
+        # restore original output values for free vars
+        outputs.update(prev_outputs)
+
+        # The first element is a loop counter or boolean condition, ignore it
+        return [_expr.TupleGetItem(loop_val, i + 1) for i in range(num_loop_var)]
+
+    def convert_operators(self, operators, outputs, ret_names):
+        """ Convert each Torch IR operator to its Relay equivalent """
+        for node_name, op_node in operators:
+            operator = op_node.kind()
+            inputs = _get_op_inputs(op_node, outputs)
+
+            if operator == "prim::Constant":
+                outputs[node_name] = _get_constant(op_node)
+            elif operator == "prim::ListConstruct" and _should_construct_dynamic_list(op_node):
+                outputs[node_name] = _convert_to_list_adt(inputs, self.prelude)
+            elif operator == "prim::ListConstruct":
+                # This assumes that no more elements will be appended to this list
+                # In this case, we keep the Python list
+                outputs[node_name] = inputs
+            elif operator == "prim::TupleConstruct":
+                outputs[node_name] = _expr.Tuple(inputs)
+            elif operator in ["prim::ListUnpack", "prim::TupleUnpack"]:
+                assert len(inputs) == 1
+                if isinstance(inputs[0], (list, _expr.TupleWrapper)):
+                    unpacked = inputs[0]
+                else:
+                    unpacked = _unpack_tuple(inputs[0])
+                outputs.update(zip(_get_output_names(op_node), unpacked))
+            elif operator == "prim::RaiseException":
+                logging.warning("raising exceptions is ignored")
+                outputs[node_name] = None
+            elif operator == "prim::If":
+                if_out = self.convert_if(op_node, outputs)
+                outputs[node_name] = if_out
+            elif operator == "prim::Loop":
+                loop_out = 
self.convert_loop(op_node, outputs) + unpacked_names = _get_output_names(op_node) + assert len(loop_out) == len(unpacked_names) + outputs.update(zip(unpacked_names, loop_out)) + else: + relay_op = self.convert_map[operator] + relay_out = relay_op( + inputs, _get_input_types(op_node, outputs, default_dtype=self.default_dtype) + ) + + if isinstance(relay_out, tuple): + # This is for torch operators that return multiple outputs + # See _adaptive_max_2d above for example + out_names = _get_output_names(op_node) + outputs.update(zip(out_names, relay_out)) + else: + assert op_node.outputsSize() == 1 + outputs[node_name] = relay_out + + return [_wrap_const(outputs[ret_name]) for ret_name in ret_names] def _pytorch_result_type(dtypes, non_tensor_inputs): @@ -2544,202 +2510,6 @@ def _wrap_const(c): return c -# Operator mappings -def _get_convert_map(prelude, default_dtype): - convert_map = { - "aten::pixel_shuffle": _pixel_shuffle(prelude), - "aten::device": _none(), - "prim::device": _none(), - "aten::sub": _elemwise("subtract"), - "aten::sub_": _elemwise("subtract"), - "aten::max": _max(), - "aten::min": _min(), - "aten::mul": _elemwise("multiply"), - "aten::mul_": _elemwise("multiply"), - "aten::pow": _elemwise("power"), - "aten::arange": _arange(), - "aten::meshgrid": _meshgrid(), - "aten::div": _elemwise("divide"), - "aten::div_": _elemwise("divide"), - "aten::floor_divide": _elemwise("floor_divide"), - "aten::true_divide": _elemwise("divide"), - "aten::addcdiv": _addcdiv(), - "aten::addcmul": _addcmul(), - "aten::ones": _ones(default_dtype), - "aten::ones_like": _ones_like(default_dtype), - "aten::zeros": _zeros(default_dtype), - "aten::zeros_like": _zeros_like(default_dtype), - "aten::full": _full(default_dtype), - "aten::full_like": _full_like(default_dtype), - "aten::linspace": _linspace(), - "aten::reciprocal": _reciprocal(), - "aten::repeat": _repeat(), - "aten::repeat_interleave": _repeat_interleave(), - "aten::to": _to(), - "aten::squeeze": _squeeze(), - "aten::unsqueeze": _unsqueeze(), - "aten::cat": _concatenate(prelude), - "aten::slice": _slice(), - "aten::split": _split(), - "aten::split_with_sizes": _split_with_sizes(), - "aten::select": _select(), - "aten::take": _take(), - "aten::where": _where(), - "aten::topk": _topk(), - "aten::relu": _relu(prelude), - "aten::relu_": _relu(prelude), - "aten::prelu": _prelu(), - "aten::leaky_relu": _leaky_relu(), - "aten::leaky_relu_": _leaky_relu(), - "aten::elu": _elu(), - "aten::elu_": _elu(), - "aten::celu": _celu(), - "aten::gelu": _gelu(), - "aten::selu": _selu(), - "aten::log_sigmoid": _log_sigmoid(), - "aten::adaptive_avg_pool2d": _adaptive_avg_pool_2d(prelude), - "aten::adaptive_max_pool2d": _adaptive_max_pool_2d(), - "aten::max_pool2d": _maxpool_2d(), - "aten::max_pool2d_with_indices": _maxpool_2d_with_indices(), - "aten::max_pool1d": _maxpool_1d(), - "aten::max_pool3d": _maxpool_3d(), - "aten::hardtanh": _hardtanh(), - "aten::hardtanh_": _hardtanh(), - "aten::_convolution": _convolution(), - "aten::softmax": _softmax(), - "aten::threshold": _threshold(), - "aten::threshold_": _threshold(), - "aten::contiguous": _contiguous(), - "aten::batch_norm": _batch_norm(), - "aten::instance_norm": _instance_norm(), - "aten::layer_norm": _layer_norm(), - "aten::group_norm": _group_norm(), - "aten::transpose": _transpose(prelude), - "aten::transpose_": _transpose(prelude), - "aten::t": _transpose(prelude), - "aten::flatten": _flatten(), - "aten::addmm": _addmm(), - "aten::size": _size(prelude), - "aten::view": _view(), - "aten::reshape": 
_reshape(), - "aten::clone": _clone(), - "aten::log_softmax": _log_softmax(), - "aten::sigmoid": _sigmoid(), - "aten::softplus": _softplus(), - "aten::avg_pool2d": _avg_pool2d(prelude), - "aten::avg_pool3d": _avg_pool3d(), - "aten::dropout": _dropout(), - "aten::dropout_": _dropout(), - "aten::feature_dropout": _dropout(), - "aten::alpha_dropout": _dropout(), - "aten::mean": _mean(prelude), - "aten::chunk": _chunk(prelude), - "aten::matmul": _matmul(prelude), - "aten::bmm": _matmul(prelude), - "aten::expand": _expand(), - "aten::Int": _int(), - "prim::NumToTensor": _numtotensor(), - "prim::ImplicitTensorToNum": _tensortonum(), - "aten::ScalarImplicit": _tensortonum(), - "aten::constant_pad_nd": _pad("constant"), - "aten::reflection_pad1d": _pad("reflect"), - "aten::reflection_pad2d": _pad("reflect"), - "aten::replication_pad1d": _pad("edge"), - "aten::replication_pad2d": _pad("edge"), - "aten::replication_pad3d": _pad("edge"), - "aten::permute": _transpose(prelude), - "aten::sum": _reduce("sum"), - "aten::prod": _reduce("prod"), - "aten::argmin": _reduce("argmin"), - "aten::argmax": _reduce("argmax"), - "aten::norm": _norm(), - "aten::frobenius_norm": _frobenius_norm(), - "aten::std": _std(), - "aten::var": _variance(), - "aten::abs": _unary("abs"), - "aten::neg": _unary("negative"), - "aten::cos": _unary("cos"), - "aten::cosh": _unary("cosh"), - "aten::sin": _unary("sin"), - "aten::sinh": _unary("sinh"), - "aten::tan": _unary("tan"), - "aten::tanh": _unary("tanh"), - "aten::acos": _unary("acos"), - "aten::asin": _unary("asin"), - "aten::atan": _unary("atan"), - "aten::log": _unary("log"), - "aten::log2": _unary("log2"), - "aten::log10": _unary("log10"), - "aten::log1p": _log1p(), - "aten::exp": _unary("exp"), - "aten::erf": _unary("erf"), - "aten::trunc": _unary("trunc"), - "aten::sign": _unary("sign"), - "aten::sqrt": _unary("sqrt"), - "aten::rsqrt": _unary("rsqrt"), - "aten::ceil": _unary("ceil"), - "aten::floor": _unary("floor"), - "aten::round": _unary("round"), - "aten::isfinite": _unary("isfinite"), - "aten::isinf": _unary("isinf"), - "aten::isnan": _unary("isnan"), - "aten::clamp": _clamp(), - "aten::clamp_": _clamp(), - "aten::detach": _identity(), - "aten::upsample_bilinear2d": _upsample("bilinear", prelude), - "aten::upsample_nearest2d": _upsample("nearest_neighbor", prelude), - "aten::upsample_trilinear3d": _upsample3d("trilinear"), - "aten::upsample_nearest3d": _upsample3d("nearest_neighbor"), - "aten::expand_as": _expand_as(), - "aten::lt": _elemwise("less"), - "aten::gt": _elemwise("greater"), - "aten::le": _elemwise("less_equal"), - "aten::ge": _elemwise("greater_equal"), - "aten::ne": _elemwise("not_equal"), - "aten::eq": _elemwise("equal"), - "aten::logical_not": _logical_not(), - "aten::logical_xor": _logical_xor(), - "aten::bitwise_not": _bitwise_not(), - "aten::bitwise_xor": _bitwise_xor(), - "aten::Bool": _Bool(), - "aten::Float": _Float(), - "aten::adaptive_avg_pool3d": _adaptive_avg_pool_3d(), - "aten::adaptive_max_pool3d": _adaptive_max_pool_3d(), - "aten::rsub": _rsub(), - "aten::embedding": _embedding(), - "aten::one_hot": _one_hot(), - "aten::mm": _matmul(prelude), - "aten::add": _add(prelude), - "aten::add_": _add(prelude), - "aten::stack": _stack(prelude), - "aten::__getitem__": _list_getitem(prelude), - "aten::len": _list_len(prelude), - "aten::type_as": _type_as(), - "aten::gather": _gather(), - "aten::index_select": _select(), - "aten::index": _index(), - "torchvision::nms": _nms(prelude), - "aten::logsumexp": _logsumexp(), - "torchvision::roi_align": 
_roi_align(prelude), - "aten::unbind": _unbind(), - "aten::__and__": _logical_and(), - "aten::_shape_as_tensor": _shape_as_tensor(prelude), - "aten::nonzero": _nonzero(False), - "aten::nonzero_numpy": _nonzero(True), - "aten::scatter": _scatter(), - "aten::scalar_tensor": _scalar_tensor(), - "aten::__interpolate": _interpolate(), - "aten::IntImplicit": _identity(), - "aten::tensor": _identity(), # used for example in tensor(1.0) - "aten::numel": _numel(), - "aten::empty": _empty(), - "aten::bincount": _bincount(), - "aten::scatter_add": _scatter_add(), - "aten::__not__": _logical_not(), - } - return convert_map - - def _run_jit_passes(graph): """ The inline pass is necessary to unwrap prim::CallMethod """ # pylint: disable=c-extension-no-member @@ -2793,29 +2563,6 @@ def _get_users(node): return [use.user for use in _get_uses(node)] -def _report_missing_conversion(op_names, convert_map): - """ Check if all ops in an input graph are supported by TVM """ - known_ops = [ - "prim::Constant", - "prim::GetAttr", - "prim::ListConstruct", - "prim::ListUnpack", - "prim::TupleConstruct", - "prim::TupleUnpack", - "prim::RaiseException", - "prim::If", - "prim::Loop", - ] - known_ops += list(convert_map.keys()) - known_ops += list(qnn_torch.convert_map.keys()) - - missing = [op_name for op_name in op_names if op_name not in known_ops] - - if missing: - msg = "The following operators are not implemented: {}".format(missing) - raise NotImplementedError(msg) - - def _getattr_attr_name(node): attribute_names = node.attributeNames() assert len(attribute_names) == 1 @@ -3117,211 +2864,6 @@ def convert_params(graph, state_dict): return params, param_tensors, packed_param_map -def convert_block(block, outputs, convert_map, prelude, default_dtype="float32"): - """ Translate Torch "Block", used for prim::If and prim::Loop """ - ops = _get_operator_nodes(block.nodes()) - ret_names = _get_input_names(block.returnNode()) - return convert_operators( - ops, outputs, ret_names, convert_map, prelude, default_dtype=default_dtype - ) - - -def convert_if(if_node, outputs, convert_map, prelude, default_dtype="float32"): - """ Translate Torch prim::If to Relay If """ - cond = outputs[if_node.inputsAt(0).debugName()] - blocks = list(if_node.blocks()) - true_branch = convert_block( - blocks[0], outputs, convert_map, prelude, default_dtype=default_dtype - ) - false_branch = convert_block( - blocks[1], outputs, convert_map, prelude, default_dtype=default_dtype - ) - assert len(true_branch) == 1 and len(false_branch) == 1 - return _expr.If(cond, true_branch[0], false_branch[0]) - - -def convert_loop(loop_node, outputs, convert_map, prelude): - """ Translate Torch prim::Loop to Relay while_loop """ - - def get_input(index): - ivalue = loop_node.inputsAt(index) - inode = ivalue.node() - if inode.kind() == "prim::Constant": - return _expr.const(_get_constant(inode)) - var_name = ivalue.debugName() - assert var_name in outputs - return _wrap_const(outputs[var_name]) - - # Refer to the spec for prim::Loop below - # https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/OVERVIEW.md#loops - # The first input: %max_trip_count - # The second input: %initial_condition - # The rest of input: loop variables - max_loop_count = get_input(0) - init_cond = get_input(1) - num_loop_var = len(list(loop_node.inputs())) - 2 - init_vals = [get_input(i + 2) for i in range(num_loop_var)] - - # while loop has always max_loop_count being int64 max - # max_loop_count.data (tvm.runtime.NDArray) is -1, so _get_constant again - is_while_loop = ( - 
isinstance(max_loop_count, _expr.Constant) - and _get_constant(loop_node.inputsAt(0).node()) == sys.maxsize - ) - - if is_while_loop: - loop_iter_dtype = "bool" - # while loop with non input dependent condition such as while i < 10: - # init_cond is int, need to cast to bool to type check - if isinstance(init_cond, _expr.Constant): - init_cond = _op.cast(init_cond, "bool") - init_loop_iter_val = init_cond - else: - loop_iter_dtype = "int32" - # always count from 0 - init_loop_iter_val = _expr.const(0, dtype="int32") - - body_block = list(loop_node.blocks())[0] - block_input_names = _get_input_names(body_block) - num_block_inputs = len(block_input_names) - name_val_pairs = list(zip(block_input_names, [init_loop_iter_val] + init_vals)) - outputs.update(name_val_pairs) - - def get_var(name, val): - if val: - checked_type = _infer_type_with_prelude(val, prelude) - if hasattr(checked_type, "shape"): - shape = get_const_tuple(checked_type.shape) - actual_shape = [] - for dim in shape: - if isinstance(dim, int) and dim == 0: - actual_shape.append(Any()) - else: - actual_shape.append(dim) - return _expr.var(name, shape=actual_shape, dtype=checked_type.dtype) - else: - return _expr.var(name, type_annotation=checked_type) - return _expr.var(name) - - loop_iter_var = _expr.var(block_input_names[0], shape=(), dtype=loop_iter_dtype) - loop_vars = [get_var(name, val) for name, val in name_val_pairs[1:]] - - # Add non constant free variables to loop variables to prevent code blow up - # Without this, if there are two for loops in a row, which often happens - # if the outer loop is unrolled, the computation corresponding to the first for loop - # is inlined inside loop body, turning O(N) + O(N) computation into O(N^2). - # This issue was found when converting from Stacked LSTM test. Torch does not add the output - # of the eariler loop into loop variables of the next loop. - # So the variable corresponding to the first loop output appears free in the second loop body. 
- free_vars = [ - var - for var in _get_free_vars_from_block(body_block) - if var in outputs - and not isinstance(outputs[var], (_expr.Constant, int, float, str)) - and outputs[var] - ] - - prev_outputs = {} - for name in free_vars: - prev_output = outputs[name] - new_loop_var = get_var(name, prev_output) - prev_outputs[name] = prev_output - outputs[name] = new_loop_var - loop_vars.append(new_loop_var) - init_vals.append(prev_output) - - def cond(*current_vals): - i = current_vals[0] - - if is_while_loop: - return _op.equal(i, _expr.const(True, "bool")) - - return _op.less(i, max_loop_count) - - def body(*current_vals): - # Update loop variables using the prev iteration outputs - assert len(current_vals) == num_block_inputs + len(free_vars) - - for (i, val) in enumerate(current_vals): - if i < num_block_inputs: - outputs[block_input_names[i]] = val - else: - outputs[free_vars[i - num_block_inputs]] = val - - block_outputs = convert_block(body_block, outputs, convert_map, prelude) - block_outputs += [outputs[name] for name in free_vars] - - if not is_while_loop: - # iter var increment implicit in torch, so do it manually - # for while loop, block_outputs[0] is already a boolean, - # the result of termination check - incr = _expr.const(1, dtype="int32") - block_outputs[0] = current_vals[0] + incr - - return block_outputs - - loop = while_loop(cond, [loop_iter_var] + loop_vars, body) - loop_val = loop(init_loop_iter_val, *init_vals) - - # restore original output values for free vars - outputs.update(prev_outputs) - - # The first element is a loop counter or boolean condition, ignore it - return [_expr.TupleGetItem(loop_val, i + 1) for i in range(num_loop_var)] - - -def convert_operators(operators, outputs, ret_names, convert_map, prelude, default_dtype="float32"): - """ Convert each Torch IR operators to Relay equivalent """ - for node_name, op_node in operators: - operator = op_node.kind() - inputs = _get_op_inputs(op_node, outputs) - - if operator == "prim::Constant": - outputs[node_name] = _get_constant(op_node) - elif operator == "prim::ListConstruct" and _should_construct_dynamic_list(op_node): - outputs[node_name] = _convert_to_list_adt(inputs, prelude) - elif operator == "prim::ListConstruct": - # This assumes that no more elements will be appended to this list - # In this case, we keep the Python list - outputs[node_name] = inputs - elif operator == "prim::TupleConstruct": - outputs[node_name] = _expr.Tuple(inputs) - elif operator in ["prim::ListUnpack", "prim::TupleUnpack"]: - assert len(inputs) == 1 - if isinstance(inputs[0], (list, _expr.TupleWrapper)): - unpacked = inputs[0] - else: - unpacked = _unpack_tuple(inputs[0]) - outputs.update(zip(_get_output_names(op_node), unpacked)) - elif operator == "prim::prim::RaiseException": - logging.warning("raising exceptions is ignored") - outputs[node_name] = None - elif operator == "prim::If": - if_out = convert_if(op_node, outputs, convert_map, prelude, default_dtype=default_dtype) - outputs[node_name] = if_out - elif operator == "prim::Loop": - loop_out = convert_loop(op_node, outputs, convert_map, prelude) - unpacked_names = _get_output_names(op_node) - assert len(loop_out) == len(unpacked_names) - outputs.update(zip(unpacked_names, loop_out)) - else: - relay_op = convert_map[operator] - relay_out = relay_op( - inputs, _get_input_types(op_node, outputs, default_dtype=default_dtype) - ) - - if isinstance(relay_out, tuple): - # This is for torch operators that return multiple outputs - # See _adaptive_max_2d above for example - out_names 
= _get_output_names(op_node) - outputs.update(zip(out_names, relay_out)) - else: - assert op_node.outputsSize() == 1 - outputs[node_name] = relay_out - - return [_wrap_const(outputs[ret_name]) for ret_name in ret_names] - - def get_all_op_names(graph): """ Return all operator names in the input graph """ nodes = list(graph.nodes()) @@ -3370,16 +2912,16 @@ def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dt mod = tvm.IRModule() prelude = Prelude(mod) - convert_map = _get_convert_map(prelude, default_dtype) + converter = PyTorchOpConverter(prelude, default_dtype) graph = script_module.graph.copy() _run_jit_passes(graph) if custom_convert_map: - convert_map.update(custom_convert_map) + converter.update_convert_map(custom_convert_map) op_names = get_all_op_names(graph) - _report_missing_conversion(op_names, convert_map) + converter.report_missing_conversion(op_names) is_module = isinstance(script_module, torch.jit.ScriptModule) params = script_module.state_dict() if is_module else {} @@ -3399,16 +2941,9 @@ def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dt qnn_torch.add_input_quant_params_to_op_inputs(graph) qnn_torch.add_quant_params_to_outputs(outputs, packed_param_map, weight_quant_params) qnn_torch.add_quant_params(tvm_params, weight_quant_params) - convert_map.update(qnn_torch.convert_map) - - ret = convert_operators( - _get_operator_nodes(graph.nodes()), - outputs, - ret_name, - convert_map, - prelude, - default_dtype=default_dtype, - ) + converter.update_convert_map(qnn_torch.convert_map) + + ret = converter.convert_operators(_get_operator_nodes(graph.nodes()), outputs, ret_name) mod["main"] = tvm.relay.Function(_analysis.free_vars(ret[0]), ret[0]) From b589920b5c206d3adccf6c5c2702731db3f81668 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Fri, 4 Dec 2020 06:02:52 -0800 Subject: [PATCH 256/258] [Relay][Frontend][Onnx] Add support for Size op in Onnx frontend. (#7031) * Add support for Size op in Onnx frontend. * Simplify target testing. 
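
For context, the new mapping routes ONNX's Size op straight to relay's
ndarray_size. A minimal sketch of the resulting relay expression (the
variable name and shape below are illustrative only, not part of this patch):

    import tvm
    from tvm import relay

    # ONNX Size yields the total element count of a tensor as an int64 scalar.
    x = relay.var("x", shape=(2, 2), dtype="int64")
    size = relay.ndarray_size(x, dtype="int64")
    func = relay.Function([x], size)
    # Evaluating func on any (2, 2) input returns 4.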
--- python/tvm/relay/frontend/onnx.py | 1 + tests/python/frontend/onnx/test_forward.py | 28 ++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 7ebad7297471..d65f5676fb33 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -2374,6 +2374,7 @@ def _get_convert_map(opset): "Gather": Gather.get_converter(opset), "GatherElements": GatherElements.get_converter(opset), "GatherND": GatherND.get_converter(opset), + "Size": AttrCvt("ndarray_size", extras={"dtype": "int64"}), "Scatter": Scatter.get_converter(opset), "ScatterElements": Scatter.get_converter(opset), "Squeeze": AttrCvt("squeeze", {"axes": "axis"}), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index b84e55ac800c..3ddc80af3a32 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -3888,6 +3888,33 @@ def test_if(): tvm.testing.assert_allclose(correct_out[i], tvm_out[i], rtol=1e-05, atol=1e-05) +@tvm.testing.uses_gpu +def test_size(): + def verify_size(indata): + node = helper.make_node( + "Size", + inputs=["X"], + outputs=["Y"], + ) + + graph = helper.make_graph( + [node], + "size_test", + inputs=[helper.make_tensor_value_info("X", TensorProto.INT64, list(indata.shape))], + outputs=[helper.make_tensor_value_info("Y", TensorProto.INT64, [])], + ) + + model = helper.make_model(graph, producer_name="size_test") + + verify_with_ort_with_inputs(model, [indata], dtype="int64", use_vm=True, opset=11) + + input_data = np.array([[1, 0], [1, 1]], dtype=np.int64) + verify_size(input_data) + + input_data = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]], dtype=np.int64) + verify_size(input_data) + + if __name__ == "__main__": test_flatten() test_reshape() @@ -3964,3 +3991,4 @@ def test_if(): test_roi_align() test_range() test_loop() + test_size() From 7e503cba1c3acd081777522f1c77cf0697e489ad Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 4 Dec 2020 07:56:15 -0800 Subject: [PATCH 257/258] [Frontend] Prevent tflite frontend from producing int64 shape/parameters (#7030) --- python/tvm/relay/frontend/common.py | 11 ++++++++++ python/tvm/relay/frontend/tflite.py | 33 +++++++++++++++++------------ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index ae51f2155402..8c74f3a54138 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -601,3 +601,14 @@ def __call__(self, inputs, attrs, *args): if "tvm_custom" in attrs: attrs.pop("tvm_custom") return get_relay_op(self._new_name)(*inputs, **attrs) + + +def to_int_list(np_array): + """Convert a np array to a python int list. + + Note: This function converts np.int32 to python's int. + If we don't do this conversion, numpy's automatic upcast will make + the shape / parameters be converted to int64 IntImm in relay and + cause problems in relay/TOPI. + """ + return [int(x) for x in np_array] diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 3572d35c6e3b..3f0140d19b1f 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -30,7 +30,7 @@ from .. import qnn as _qnn from ... 
import nd as _nd from .common import ExprTable -from .common import infer_shape as _infer_shape +from .common import infer_shape as _infer_shape, to_int_list from .tflite_flexbuffer import FlexBufferDecoder @@ -345,7 +345,7 @@ def get_tensor_value(self, tensor_wrapper): data = tensor_wrapper.buffer.DataAsNumpy() if tensor_wrapper.tensor.ShapeLength() != 0: - shape = tensor_wrapper.tensor.ShapeAsNumpy() + shape = to_int_list(tensor_wrapper.tensor.ShapeAsNumpy()) else: shape = [] @@ -503,7 +503,7 @@ def convert_reshape(self, op): op_options = op.BuiltinOptions() reshape_options = ReshapeOptions() reshape_options.Init(op_options.Bytes, op_options.Pos) - target_shape = tuple(reshape_options.NewShapeAsNumpy()) + target_shape = to_int_list(reshape_options.NewShapeAsNumpy()) in_expr = self.get_expr(input_tensor_idx) @@ -1387,7 +1387,7 @@ def convert_gather(self, op): axis = gather_options.Axis() # Check the indices are with in bounds. - data_shape = list(input_tensors[0].tensor.ShapeAsNumpy()) + data_shape = to_int_list(input_tensors[0].tensor.ShapeAsNumpy()) data_dim = len(data_shape) axis = data_dim + axis if axis < 0 else axis @@ -1505,7 +1505,7 @@ def convert_strided_slice(self, op): new_axis_mask = options.NewAxisMask() shrink_axis_mask = options.ShrinkAxisMask() - data_shape = list(input_tensors[0].tensor.ShapeAsNumpy()) + data_shape = to_int_list(input_tensors[0].tensor.ShapeAsNumpy()) data_dim = len(data_shape) stride_dim = len(stride) @@ -1757,7 +1757,7 @@ def convert_fully_connected(self, op): output_tensor_type = output_tensor.tensor.Type() output_tensor_type_str = self.get_tensor_type_str(output_tensor_type) - weight_tensor_shape = weight_tensor.tensor.ShapeAsNumpy() + weight_tensor_shape = to_int_list(weight_tensor.tensor.ShapeAsNumpy()) # Weight should have only 2 dimensions(TFLite convention) assert len(weight_tensor_shape) == 2, "Weight should be only 2-dim" @@ -1951,15 +1951,17 @@ def convert_conv(self, op, conv_type): padding = conv_options.Padding() fused_activation_fn = conv_options.FusedActivationFunction() - _, input_h, input_w, input_c = input_tensor.tensor.ShapeAsNumpy() + _, input_h, input_w, input_c = to_int_list(input_tensor.tensor.ShapeAsNumpy()) if is_depthwise_conv: # TFLite depthwise convolution kernel layout is: # 1 KH KW C(input_c * depth_multiplier) - _, kernel_h, kernel_w, in_channels = weight_tensor.tensor.ShapeAsNumpy() + _, kernel_h, kernel_w, in_channels = to_int_list(weight_tensor.tensor.ShapeAsNumpy()) assert in_channels == input_c * depth_multiplier else: - output_channels, kernel_h, kernel_w, _ = weight_tensor.tensor.ShapeAsNumpy() + output_channels, kernel_h, kernel_w, _ = to_int_list( + weight_tensor.tensor.ShapeAsNumpy() + ) dilated_kernel_h = dilation_h * (kernel_h - 1) + 1 dilated_kernel_w = dilation_w * (kernel_w - 1) + 1 @@ -2007,6 +2009,7 @@ def convert_conv(self, op, conv_type): pass elif padding == Padding.SAME: pad_top, pad_bottom = get_pad_value(input_h, dilated_kernel_h, stride_h) + pad_left, pad_right = get_pad_value(input_w, dilated_kernel_w, stride_w) do_pad = not (pad_top == 0 and pad_bottom == 0 and pad_left == 0 and pad_right == 0) if do_pad: @@ -2160,7 +2163,7 @@ def convert_slice(self, op): size = list(self.get_tensor_value(input_tensors[2])) # strided_slice(Relay) needs the slice's end indices, not the size end = size - input_tensor_shape = input_tensor.tensor.ShapeAsNumpy() + input_tensor_shape = to_int_list(input_tensor.tensor.ShapeAsNumpy()) input_tensor_rank = len(input_tensor_shape) for i in range(input_tensor_rank): if size[i] 
== -1: @@ -2322,7 +2325,7 @@ def convert_pool2d(self, op, pool_type): in_expr = self.get_expr(input_tensor_idx) - _, input_h, input_w, _ = input_tensor.tensor.ShapeAsNumpy() + _, input_h, input_w, _ = to_int_list(input_tensor.tensor.ShapeAsNumpy()) if padding == Padding.VALID: pass elif padding == Padding.SAME: @@ -2701,10 +2704,12 @@ def convert_transpose_conv(self, op): # Input (data) Tensor. NHWC layout input_tensor = input_tensors[2] - _, input_h, input_w, input_c = input_tensor.tensor.ShapeAsNumpy() + _, input_h, input_w, input_c = to_int_list(input_tensor.tensor.ShapeAsNumpy()) # Weights tensor. TFLite uses OHWI layout weights_tensor = input_tensors[1] - out_channels, kernel_h, kernel_w, in_channels = weights_tensor.tensor.ShapeAsNumpy() + out_channels, kernel_h, kernel_w, in_channels = to_int_list( + weights_tensor.tensor.ShapeAsNumpy() + ) assert ( input_c == in_channels ), "Input channel in the filter should match to channel in the input" @@ -3120,7 +3125,7 @@ def convert_matrix_diag(self, op): ), "TFLite MATRIX_DIAG requires diagonal and output tensors' \ scale and zero points to be equal" - shape = diagonal.tensor.ShapeAsNumpy() + shape = to_int_list(diagonal.tensor.ShapeAsNumpy()) shape = np.append(shape, shape[-1]) dtype = self.get_tensor_type_str(diagonal.tensor.Type()) From a8bad3035cc2b9e736534bcd13fad159507ec43b Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Fri, 4 Dec 2020 16:52:49 +0000 Subject: [PATCH 258/258] Update dmlc_tvm_commit_id.txt --- dmlc_tvm_commit_id.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc_tvm_commit_id.txt b/dmlc_tvm_commit_id.txt index e46bbee4e022..7b294e50bf70 100644 --- a/dmlc_tvm_commit_id.txt +++ b/dmlc_tvm_commit_id.txt @@ -1 +1 @@ -8daa97ec87118ecdf38453ca878655cb08fba329 \ No newline at end of file +9554e645922357af1d11679a102f3763b80b740f \ No newline at end of file
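
For reference, a quick usage sketch of the to_int_list helper introduced in
the tflite patch above (the example shape is illustrative):

    import numpy as np

    def to_int_list(np_array):
        # Convert np.int32 entries to plain Python ints so relay does not
        # upcast shapes / parameters to int64 IntImm.
        return [int(x) for x in np_array]

    shape = np.array([1, 224, 224, 3], dtype=np.int32)
    assert to_int_list(shape) == [1, 224, 224, 3]
    assert all(isinstance(d, int) for d in to_int_list(shape))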